python爬虫篇1——爬取中英文论文文献数据

程序运行截图:

mysql代码:

-- One row per crawled paper: bilingual title/abstract plus metadata,
-- populated by the Python crawler below.
CREATE TABLE `article` (
  `id` int(11) NOT NULL,                            -- random id generated by the crawler
  `article_time` varchar(50) DEFAULT NULL,          -- publication date text scraped from the issue page
  `article_volume` varchar(20) DEFAULT NULL,        -- journal volume number
  `article_author` varchar(2000) DEFAULT NULL,      -- author list (text after "-" in the page title)
  `article_name_english` varchar(2000) DEFAULT NULL,
  `article_name_chinese` varchar(2000) DEFAULT NULL,  -- machine translation of the English title
  `article_content_english` varchar(5000) DEFAULT NULL,  -- English abstract
  `article_content_chinese` varchar(2000) DEFAULT NULL,  -- machine translation of the abstract
  PRIMARY KEY (`id`)
-- utf8mb4 instead of utf8: MySQL's "utf8" is the 3-byte subset and
-- rejects 4-byte characters (emoji, rare CJK) that abstracts may contain.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4

python代码:

import random
import re
from urllib.parse import quote

import pymysql
import requests

# Open the database connection.
# NOTE(review): port 8080 is unusual for MySQL (default is 3306) — confirm
# this matches the local server configuration.
db = pymysql.connect(host='localhost',
                     port=8080,
                     user='root',
                     passwd='123',
                     db='students',
                     charset='utf8')
# Create a cursor object for executing SQL statements.
cursor = db.cursor()


# Purpose: for each year, fetch paper titles (English/Chinese), author names,
# abstracts (English/Chinese) and publication dates.

# Translation helper; `content` is English text.
def translator_chinese(content):
    """Translate an English string to Chinese via the iciba web API.

    Args:
        content: English text to translate.

    Returns:
        The Chinese translation, or "" when the request fails or the
        response cannot be parsed.
    """
    # Percent-encode the query: raw '&', '#', '+' in the text would
    # otherwise corrupt the URL's query string.
    url = ("http://fy.iciba.com/ajax.php?a=fy&f=en&t=zh-CHS&w="
           + quote('"' + content + '"'))
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Best-effort: a failed translation should not abort the crawl.
        return ""
    matches = re.findall(r'"out":"(.*?)","ci', response.text, re.S)
    if not matches:
        return ""
    # The API returns JSON-escaped text (\uXXXX). 'backslashreplace'
    # keeps any raw non-ASCII char as an escape instead of raising
    # UnicodeEncodeError (the original plain 'ascii' encode would crash),
    # then unicode_escape decodes everything back to text.
    result = matches[0].encode('ascii', 'backslashreplace').decode('unicode_escape')
    # Strip the Chinese quotation marks the API adds around the quoted input.
    return result.replace("“", "").replace("”", "")


# for test
# print(translator_chinese(" therefore, be treated as a unity of contradictions."))

# Fetch data for a given year.
def get_data(year):
    """Crawl one year's issues from SAGE journals and store every paper.

    For each issue of *year* this fetches every article's English/Chinese
    title, author list, English/Chinese abstract and publication date,
    prints them, and inserts a row into the `article` table.

    Args:
        year: four-digit year to crawl, e.g. 2016.
    """
    year_url = "https://journals.sagepub.com/loi/oss?year=%i" % year
    respose = requests.get(year_url)
    print("*" * 300)
    print("开始爬取%s年的文献数据!" % year)
    # Volume number: second data-attr-vol match on the year page.
    jz = re.findall(r'class="expander".*?data-attr-vol="(.*?)"', respose.text, re.S)[1]
    print("卷宗:" + jz)
    # URLs of the individual issues of this volume.
    article_ml = re.findall(r'class="row js_issue".*?href="(.*?)"', respose.text, re.S)
    print("文献目录地址:")
    for idx, issue_url in enumerate(article_ml, start=1):
        print(str(idx) + "." + issue_url)
    print("*" * 300)
    for issue_url in article_ml:
        data = requests.get(issue_url)
        article_time = re.findall(r'<div class="journalNavTitle">\n(.*?)\n</div>', data.text, re.S)
        # Publication date: the text after the first comma of the nav title.
        time = article_time[0][article_time[0].index(",") + 1:]
        print("文献时间:" + time)
        # Relative links to the articles of this issue.
        addr = re.findall(r'class="ref nowrap" href="(.*?)"', data.text, re.S)
        base_url = "https://journals.sagepub.com"
        print("文献列表地址:")
        for idx, link in enumerate(addr, start=1):
            print(str(idx) + "." + link)
        for ad in addr:
            print("*" * 300)
            article_data = requests.get(base_url + ad)
            article_c = re.findall(r'property="og:title" content="(.*?)"', article_data.text, re.S)
            if not article_c:
                # BUGFIX: the original `break` abandoned ALL remaining
                # articles of the issue; skip only this one.
                continue
            title = article_c[0]
            if "-" in title:
                # og:title has the form "<name> - <authors>".
                article_name_english, _, article_author = title.partition("-")
            else:
                article_name_english = title
                article_author = ""
            article_name_chinese = translator_chinese(article_name_english)
            print("文献英文名字:" + article_name_english)
            print("文献中文名字:" + article_name_chinese)
            print("做者名字:" + article_author)
            # Abstract (may be absent on some article pages).
            abstract_matches = re.findall(
                r'<div class="abstractSection abstractInFull"><p>(.*?)</p>',
                article_data.text, re.S)
            if abstract_matches:
                article_content_english = abstract_matches[0]
                article_content_chinese = translator_chinese(article_content_english)
            else:
                article_content_english = ""
                article_content_chinese = ""
            print("英文摘要:" + article_content_english)  # English abstract
            print("中文摘要:" + article_content_chinese)  # Chinese abstract

            # Insert via a parameterized query: the driver escapes values
            # itself. The original hand-built SQL with pymysql.escape_string
            # was injection-prone, and escape_string was removed from the
            # pymysql top-level namespace in PyMySQL 1.0.
            record_id = random.randint(0, 999999999)  # renamed: `id` shadowed the builtin
            sql = ("insert into article(id,article_time,article_volume,article_author,"
                   "article_name_english,article_name_chinese,"
                   "article_content_english,article_content_chinese) "
                   "values(%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (record_id, time, jz, article_author,
                                 article_name_english, article_name_chinese,
                                 article_content_english, article_content_chinese))
            print("id:%i数据爬取成功!" % record_id)
            # Commit each row so a later failure does not lose earlier rows.
            db.commit()


# 主函数
# Entry point: crawl 2015-2016, then always release the DB connection.
if __name__ == '__main__':
    try:
        for year in range(2015, 2017):
            get_data(year)
        print("数据爬取完成!")
    finally:
        # The original `for/else` was a misleading no-op (it always runs
        # when the loop has no `break`) and leaked the connection on error;
        # `finally` guarantees db.close() even if a year fails mid-crawl.
        db.close()

程序可能存在部分bug,欢迎交流指正。