程序运行截图:
MySQL代码:
-- Table that receives one row per article written by the crawler script.
-- Every column except the key is nullable text; the key value is supplied
-- by the inserting code (the column has no AUTO_INCREMENT).
CREATE TABLE `article` (
  -- Row key.
  `id` int(11) NOT NULL,
  -- Issue date text.
  `article_time` varchar(50) DEFAULT NULL,
  -- Journal volume identifier.
  `article_volume` varchar(20) DEFAULT NULL,
  -- Author name(s).
  `article_author` varchar(2000) DEFAULT NULL,
  -- Article title, English and Chinese.
  `article_name_english` varchar(2000) DEFAULT NULL,
  `article_name_chinese` varchar(2000) DEFAULT NULL,
  -- Article abstract, English and Chinese.
  `article_content_english` varchar(5000) DEFAULT NULL,
  `article_content_chinese` varchar(2000) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
Python代码:
import json
import random
import re
from urllib.parse import urljoin

import pymysql
import requests

# Purpose: for each year, collect every article's title (English and
# Chinese), author, abstract (English and Chinese) and issue date from the
# journal site and store them in the `article` table.

# Open the database connection.
# NOTE(review): MySQL normally listens on 3306; port 8080 is kept exactly as
# the original had it -- confirm against the local server configuration.
db = pymysql.connect(host='localhost', port=8080, user='root', passwd='123', db='students', charset='utf8')
# Cursor shared by every insert issued from get_data().
cursor = db.cursor()

BASE_URL = "https://journals.sagepub.com"
TRANSLATOR_URL = "http://fy.iciba.com/ajax.php"
# Seconds to wait on any single HTTP request before giving up; without a
# timeout one stalled connection would hang the whole crawl forever.
REQUEST_TIMEOUT = 30

# Placeholders are bound by the driver, which quotes and escapes every value.
INSERT_ARTICLE_SQL = (
    "insert into article(id,article_time,article_volume,article_author,"
    "article_name_english,article_name_chinese,"
    "article_content_english,article_content_chinese) "
    "values(%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _decode_json_string(raw):
    """Decode the backslash escapes of a string body captured from JSON text."""
    try:
        # The capture comes from inside a JSON string literal, so the JSON
        # decoder handles \uXXXX escapes (including surrogate pairs) and
        # tolerates characters that are already non-ASCII.
        return json.loads('"' + raw + '"')
    except ValueError:
        # The capture was not a clean JSON string body (e.g. the regex ran
        # across a field boundary). 'backslashreplace' keeps non-ASCII
        # characters from raising before the escapes are decoded.
        return raw.encode('ascii', 'backslashreplace').decode('unicode_escape')


def translator_chinese(content):
    """Translate English text into Chinese through the iciba web endpoint.

    Args:
        content: English text to translate.

    Returns:
        The translated text with curly double quotes removed, or "" when the
        response contains no translation field.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    # Passing the text through `params` lets requests URL-encode it, so
    # characters such as '&', '#' or '+' in a title no longer break the query.
    response = requests.get(
        TRANSLATOR_URL,
        params={"a": "fy", "f": "en", "t": "zh-CHS", "w": '"' + content + '"'},
        timeout=REQUEST_TIMEOUT,
    )
    matches = re.findall(r'"out":"(.*?)","ci', response.text, re.S)
    if not matches:
        return ""
    return _decode_json_string(matches[0]).replace("“", "").replace("”", "")


def get_data(year):
    """Crawl every article published in `year` and insert it into the database.

    For each issue listed on the year's index page, every article page is
    fetched, its title/author/abstract extracted and translated, and one row
    is inserted and committed through the module-level `cursor` / `db`.

    Args:
        year: Four-digit publication year (int).

    Raises:
        requests.RequestException: if an HTTP request fails or times out.
        pymysql.MySQLError: if an insert or commit fails.
    """
    listing_url = "https://journals.sagepub.com/loi/oss?year=%i" % year
    listing = requests.get(listing_url, timeout=REQUEST_TIMEOUT)
    print("*" * 300)
    print("开始爬取%s年的文献数据!" % year)

    # Volume identifier. The second match is used, as in the original
    # implementation; presumably the first one belongs to another element of
    # the page -- TODO confirm against the live markup.
    volumes = re.findall(r'class="expander".*?data-attr-vol="(.*?)"', listing.text, re.S)
    if len(volumes) < 2:
        # Previously an IndexError; skip the year instead of aborting the run.
        print("未找到%s年的卷宗,跳过!" % year)
        return
    volume = volumes[1]
    print("卷宗:" + volume)

    # URLs of the individual issues of that year.
    issue_links = re.findall(r'class="row js_issue".*?href="(.*?)"', listing.text, re.S)
    print("文献目录地址:")
    for position, link in enumerate(issue_links, start=1):
        print(str(position) + "." + link)
    print("*" * 300)

    for issue_link in issue_links:
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        issue_page = requests.get(urljoin(BASE_URL, issue_link), timeout=REQUEST_TIMEOUT)

        # Issue date: whatever follows the first comma of the navigation
        # title. Empty when the title or the comma is missing (the original
        # raised IndexError / ValueError in those cases).
        nav_titles = re.findall(r'<div class="journalNavTitle">\n(.*?)\n</div>', issue_page.text, re.S)
        article_time = nav_titles[0].partition(",")[2] if nav_titles else ""
        print("文献时间:" + article_time)

        # Links to the articles of this issue.
        article_links = re.findall(r'class="ref nowrap" href="(.*?)"', issue_page.text, re.S)
        print("文献列表地址:")
        for position, link in enumerate(article_links, start=1):
            print(str(position) + "." + link)

        for article_link in article_links:
            print("*" * 300)
            article_page = requests.get(urljoin(BASE_URL, article_link), timeout=REQUEST_TIMEOUT)

            og_titles = re.findall(r'property="og:title" content="(.*?)"', article_page.text, re.S)
            if not og_titles:
                # Skip only this article. The original used `break`, which
                # silently dropped every remaining article of the issue.
                continue

            # The og:title text is split at its first "-": title before it,
            # author after it. Without a "-" the author is left empty.
            article_name_english, _, article_author = og_titles[0].partition("-")
            article_name_chinese = translator_chinese(article_name_english)
            print("文献英文名字:" + article_name_english)
            print("文献中文名字:" + article_name_chinese)
            print("做者名字:" + article_author)

            # Abstract; both languages stay empty when the page has none.
            abstracts = re.findall(
                r'<div class="abstractSection abstractInFull"><p>(.*?)</p>',
                article_page.text,
                re.S,
            )
            article_content_english = abstracts[0] if abstracts else ""
            article_content_chinese = translator_chinese(article_content_english) if abstracts else ""
            print("英文摘要:" + article_content_english)
            print("中文摘要:" + article_content_chinese)

            # The table's id column has no AUTO_INCREMENT, so a random key is
            # generated here, as before.
            article_id = random.randint(0, 999999999)
            # Parameter binding replaces the hand-built SQL string: it escapes
            # every value (time and volume were previously unescaped) and
            # does not rely on pymysql.escape_string, which PyMySQL 1.0
            # removed from the top-level module.
            cursor.execute(
                INSERT_ARTICLE_SQL,
                (
                    article_id,
                    article_time,
                    volume,
                    article_author,
                    article_name_english,
                    article_name_chinese,
                    article_content_english,
                    article_content_chinese,
                ),
            )
            # Commit per article so a later failure keeps the rows stored so far.
            db.commit()
            print("id:%i数据爬取成功!" % article_id)


# Script entry point.
if __name__ == '__main__':
    try:
        for year in range(2015, 2017):
            get_data(year)
        print("数据爬取完成!")
    finally:
        # Release the connection even when a crawl step raises.
        cursor.close()
        db.close()
程序可能存在部分bug,欢迎交流指正。