Beautiful Soup 是 Python 的 HTML/XML 解析器,可以很好地处理不规范标记并生成剖析树(parse tree)。
Beautiful Soup 提供简单实用的导航、搜索以及修改剖析树的操作,大大节省编程时间。
本文代码
pip install lxml beautifulsoup4
测试页面
<html>
  <head><title>Page title</title></head>
  <body>
    <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
    <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
  </body>
</html>
长这样
代码
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

print(soup.contents[0].name)              # name of the document's first node: 'html'
print(soup.contents[0].contents[0].name)  # its first child's name: 'head'

head = soup.contents[0].contents[0]
print(head.parent.name)       # parent node: 'html'
print(head.next)              # next element in parse order: <title>Page title</title>
print(head.nextSibling.name)  # next sibling's name: 'body'
print(head.nextSibling.contents[0])
# <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
print(head.nextSibling.contents[0].nextSibling)
# <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
soup.html.head.title:取节点
.string:取内容
soup('p'):查找标签
# Grabbing nodes, reading their text, and searching for tags.
import re

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

title_tag = soup.html.head.title  # walk a tag path to grab a node
print(title_tag)         # <title>Page title</title>
print(title_tag.string)  # its text content: 'Page title'

print(soup('p'))  # find every <p> tag
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll('p', align="center"))  # filter by attribute; same as soup('p', align="center")
# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]
print(soup('p', align="center"))       # calling the soup is shorthand for findAll
print(soup.find('p', align="center"))  # first match only
# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>

print(soup('p', align="center")[0]['id'])  # read the id attribute: 'firstpara'
print(soup.find('p', align=re.compile('^b.*'))['id'])  # align starting with 'b': 'secondpara'
print(soup.find('p').b.string)  # first <p> -> its <b> text: 'one'
print(soup('p')[1].b.string)    # all <p> -> second one -> its <b> text: 'two'
属性 | 含义 |
---|---|
parent | 父节点 |
contents | 子节点 |
string | 字符串内容 |
nextSibling | 下一个兄弟节点 |
previousSibling | 上一个兄弟节点 |
next | 下一层处理次序 |
previous | 上一层处理次序 |
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

print(soup.head.parent)  # parent node: the whole <html> element
print(soup.head.contents)  # children of <head>: [<title>Page title</title>]
print(soup.p.contents)     # children of the first <p>: ['This is paragraph ', <b>one</b>, '.']
print(soup.b.string)  # text content of the first <b>: 'one'

print(soup.head.nextSibling)      # next sibling: the <body> element
print(soup.body.previousSibling)  # previous sibling: the <head> element

print(soup.head.next)            # next in parse order: <title>Page title</title>
print(soup.head.next.next)       # then the title's text: 'Page title'
print(soup.head.next.next.next)  # then the <body> element
print(soup.head.previous)  # previous in parse order: the <html> element
方法 | 含义 |
---|---|
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) | 所有匹配元素 |
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) | 第一个匹配元素 |
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 后面所有匹配兄弟节点 |
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs) | 后面第一个匹配兄弟节点 |
def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 前面所有匹配兄弟节点 |
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs) | 前面第一个匹配兄弟节点 |
def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 下层所有匹配元素 |
def findNext(self, name=None, attrs={}, text=None, **kwargs) | 下层第一个匹配元素 |
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 上层所有匹配元素 |
def findPrevious(self, name=None, attrs={}, text=None, **kwargs) | 上层第一个匹配元素 |
def findParents(self, name=None, attrs={}, limit=None, **kwargs) | 所有匹配父节点 |
def findParent(self, name=None, attrs={}, **kwargs) | 第一个匹配父节点 |
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
示例:
import re

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

# Match by tag name: string, regex, list, or dict of names.
print(soup.findAll('b'))  # [<b>one</b>, <b>two</b>]
print(soup.findAll(re.compile('^b')))  # names starting with 'b': body, b, b
print(soup.findAll(['title', 'p']))  # every title and p tag
print(soup.findAll({'title': True, 'p': True}))  # same result, dict lookup is faster

# Match with a callable that returns a boolean.
print(soup.findAll(lambda tag: len(tag.attrs) == 2))  # tags carrying exactly two attributes
print(soup.findAll(lambda tag: len(tag.name) == 1 and not tag.attrs))  # one-letter name, no attributes

# Filter on attribute values: string, regex, list, or callable.
print(soup.findAll(align="center"))
print(soup.findAll(id=re.compile("para$")))
print(soup.findAll(align=["center", "blah"]))
print(soup.findAll(align=lambda value: value and len(value) < 5))

print(soup.findAll(align=True))  # tags that have an align attribute
print(soup.findAll(align=None))  # tags that lack an align attribute

# When a keyword clashes with a reserved word, pass a dict via attrs= instead.
print(soup.findAll(id=re.compile("para$")))
print(soup.findAll(attrs={'id': re.compile("para$")}))

# Match text nodes: string, list, regex, True, or callable.
print(soup.findAll(text="one"))           # ['one']
print(soup.findAll(text=["one", "two"]))  # ['one', 'two']
print(soup.findAll(text=re.compile("paragraph")))  # ['This is paragraph ', 'This is paragraph ']
print(soup.findAll(text=True))            # every text node in the document
print(soup.findAll(text=lambda x: len(x) < 12))  # ['Page title', 'one', '.', 'two', '.']

# recursive= controls whether the whole subtree is searched.
print([tag.name for tag in soup.html.findAll()])  # ['head', 'title', 'body', 'p', 'b', 'p', 'b']
print([tag.name for tag in soup.html.findAll(recursive=False)])  # direct children only: ['head', 'body']

# limit= caps how many matches are returned.
print(soup.findAll('p', limit=1))    # first <p> only
print(soup.findAll('p', limit=100))  # limit above match count returns everything
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
示例:
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

# find() returns only the first matching element.
print(soup.find('p'))
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>
所有兄弟节点
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
一个兄弟节点
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

# Siblings are searched relative to a starting node, here a text node.
print(soup.find(text='This is paragraph ').findNextSiblings('b'))  # all matching later siblings: [<b>one</b>]
print(soup.find(text='This is paragraph ').findNextSibling(text=lambda text: len(text) == 1))  # first later match: '.'
print(soup.find(text='.').findPreviousSiblings('b'))  # all matching earlier siblings: [<b>one</b>]
print(soup.find(text='.').findPreviousSibling(text=True))  # first earlier text node: 'This is paragraph '
下层所有匹配元素
def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)
下层第一个匹配元素
def findNext(self, name=None, attrs={}, text=None, **kwargs)
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

# Search forward in parse order from the first <p>.
print(soup.find('p').findAllNext(text=True))  # every later text node
# ['This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
print(soup.find('p').findNext('p'))  # the next <p>: the second paragraph
print(soup.find('p').findNext('b'))  # the next <b>: <b>one</b>

# Search backward in parse order from the last <p>.
print(soup('p')[-1].findAllPrevious(text=True))  # every earlier text node, nearest first
# ['.', 'one', 'This is paragraph ', 'Page title']
print(soup('p')[-1].findPrevious('p'))  # the previous <p>: the first paragraph
print(soup('p')[-1].findPrevious('b'))  # the previous <b>: <b>one</b>
所有匹配父节点
def findParents(self, name=None, attrs={}, limit=None, **kwargs)
第一个匹配父节点
def findParent(self, name=None, attrs={}, **kwargs)
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

print(soup.find('b').findParents())  # every enclosing node, innermost first: p, body, html, document
print(soup.find('b').findParent('body'))  # nearest matching ancestor: the <body> element
传参 class_
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p class="firstpara" align="center">This is paragraph <b>one</b>.</p><p class="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')  # parse with the lxml parser

# 'class' is a Python keyword, so Beautiful Soup accepts class_ instead.
print(soup(class_='firstpara'))
# [<p align="center" class="firstpara">This is paragraph <b>one</b>.</p>]
prettify()
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''

soup = BeautifulSoup(content, 'lxml')

# prettify() returns re-indented markup, one tag per line.
print(soup.prettify())
使用 PrettyTable
库
pip install prettytable
from itertools import zip_longest

from bs4 import BeautifulSoup
from prettytable import PrettyTable

html = '''<html><body> <table border="1"> <tr> <th>学号</th> <th>姓名</th> </tr> <tr> <td>1</td> <td>张三</td> </tr> <tr> <td>2</td> <td>李四</td> </tr> <tr> <td>3</td> <td>王五</td> </tr> </table> </body> </html> '''

soup = BeautifulSoup(html, 'lxml')

headers = [cell.string for cell in soup('th')]  # header row text
cells = [cell.string for cell in soup('td')]    # flat list of cell text
# Chunk the flat cell list into rows of len(headers) items each.
rows = list(zip_longest(*([iter(cells)] * len(headers))))
print(headers)  # ['学号', '姓名']
print(rows)     # [('1', '张三'), ('2', '李四'), ('3', '王五')]

table = PrettyTable()
table.field_names = headers  # header row
for row in rows:
    table.add_row(row)  # one data row at a time
print(table)
# +------+------+
# | 学号 | 姓名 |
# +------+------+
# |  1   | 张三 |
# |  2   | 李四 |
# |  3   | 王五 |
# +------+------+
推荐阅读:Python表格美化库PrettyTable中文文档
推荐阅读:修改剖析树
pip install wheel
pip install -U lxml