#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: Trony
"""Scrape classical Chinese poems from gushiwen.org and save them to a Word file."""

import re  # regex module used to parse the fetched HTML

import requests  # used to fetch the web pages
from docx import Document  # used to write the scraped poems into a .docx document


def parse_page(url):
    """Fetch one gushiwen.org listing page and return the poems on it.

    Parameters
    ----------
    url : str
        URL of a listing page, e.g. https://www.gushiwen.org/default_1.aspx

    Returns
    -------
    list[dict]
        One dict per poem with keys "title", "author", "dynastie" and
        "content" (all unicode strings).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    }  # request headers — pretend to be a regular browser
    # BUG FIX: the original called requests.get(url, headers), which passes the
    # dict positionally as the *params* argument (query string) and never sends
    # the User-Agent header at all. It must be the keyword argument headers=.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail loudly instead of regex-parsing an error page
    text = response.text  # response body, decoded to unicode

    # re.DOTALL lets '.' also match newlines, so patterns can span tag boundaries.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text,
                        flags=re.DOTALL)  # poem titles
    # NOTE(review): the next two patterns capture the first and the second <a>
    # of the "source" line. Whether the first really holds the dynasty and the
    # second the author (or vice versa — the original comments disagree with the
    # variable names) depends on the live page markup; behavior is kept as-is.
    dynasties = re.findall(
        r'<p\sclass="source">.*?<a\shref=".*?"\starget="_blank">(.{2})</a>.*?</p>',
        text, flags=re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text,
                         flags=re.DOTALL)
    contents_tags = re.findall(r'<div\sclass="contson".*?>(.*?)</div>', text,
                               flags=re.DOTALL)  # raw poem bodies, may contain tags

    # Strip any inner HTML tags (e.g. <br/>) from the poem bodies.
    contents = [re.sub(r"<.*?>", "", tag).strip() for tag in contents_tags]

    # BUG FIX: the original looped over range(len(titles) - 1), silently
    # dropping the last poem of every page. zip() also stops at the shortest
    # list, so a partially-matched page can no longer raise IndexError.
    peoms = [
        {"title": title, "author": author, "dynastie": dynasty, "content": content}
        for title, author, dynasty, content in zip(titles, authors, dynasties, contents)
    ]
    return peoms


def main():
    """Scrape listing pages 1-100 and write every poem into demo.docx."""
    peoms = []
    for page in range(1, 101):
        url = 'https://www.gushiwen.org/default_%s.aspx' % page
        peoms += parse_page(url)

    document = Document()  # a new, empty Word document
    # BUG FIX: the original iterated range(len(peoms) - 1), dropping the last
    # poem from the output document.
    for peom in peoms:
        document.add_heading(u'%s' % peom["title"], 2)  # title as level-2 heading
        document.add_paragraph(u'%s %s' % (peom["author"], peom["dynastie"]))
        document.add_paragraph(u'%s\n' % peom["content"])
    document.save('demo.docx')  # write the file into the working directory


if __name__ == "__main__":
    main()