Beautiful Soup: a Python HTML Parsing Library

Introduction

Beautiful Soup is an HTML/XML parser for Python. It handles malformed markup gracefully and builds a parse tree.

It provides simple, practical operations for navigating, searching, and modifying the parse tree, which saves a great deal of programming time.





Installation

pip install lxml beautifulsoup4
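
A quick sanity check, assuming both packages installed correctly, is to import bs4 and build a tiny soup with the lxml tree builder (bs4.FeatureNotFound is raised if lxml is missing):

import bs4

print(bs4.__version__)  # installed Beautiful Soup version
bs4.BeautifulSoup('<p>ok</p>', 'lxml')  # raises bs4.FeatureNotFound if the lxml builder is unavailable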




Quick start

Test page:

<html>
<head><title>Page title</title></head>
<body>
<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
</body>
</html>

Code:

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.contents[0].name)  # name of the first child node
# 'html'
print(soup.contents[0].contents[0].name)  # name of the first child's first child
# 'head'

head = soup.contents[0].contents[0]
print(head.parent.name)  # parent node
# 'html'

print(head.next)  # next element in parse order
# <title>Page title</title>

print(head.nextSibling.name)  # name of the next sibling node
# 'body'

print(head.nextSibling.contents[0])
# <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>

print(head.nextSibling.contents[0].nextSibling)
# <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>




Searching tags and attributes

  1. Dotted access (e.g. soup.head.title): get a node by tag name
  2. .string: get the text content
  3. Calling the soup object, e.g. soup('p'): find tags (shorthand for findAll)
import re
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

titleTag = soup.html.head.title  # get a node via dotted access
print(titleTag)
# <title>Page title</title>

print(titleTag.string)  # get the text content
# 'Page title'

print(soup('p'))  # find tags; calling the soup object is shorthand for findAll
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll('p', align="center"))  # find all with the given attribute; equivalent to soup('p', align="center")
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
print(soup('p', align="center"))  # same as above

print(soup.find('p', align="center"))  # find only the first match
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>

print(soup('p', align="center")[0]['id'])  # get the id attribute
# 'firstpara'

print(soup.find('p', align=re.compile('^b.*'))['id'])  # find the p whose align attribute starts with 'b'
# 'secondpara'

print(soup.find('p').b.string)  # first p element → text of its b child
# 'one'

print(soup('p')[1].b.string)  # all p elements → the second one → text of its b child
# 'two'




Navigation

Attribute        Meaning
parent           parent node
contents         list of child nodes
string           string content
nextSibling      next sibling node
previousSibling  previous sibling node
next             next element in parse order
previous         previous element in parse order
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.head.parent)  # parent node
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>

print(soup.head.contents)  # child nodes
print(soup.p.contents)  # child nodes
# [<title>Page title</title>]
# ['This is paragraph ', <b>one</b>, '.']

print(soup.b.string)  # string content
# one

print(soup.head.nextSibling)  # next sibling node
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>

print(soup.body.previousSibling)  # previous sibling node
# <head><title>Page title</title></head>

print(soup.head.next)  # next element in parse order
print(soup.head.next.next)  # and the one after that
print(soup.head.next.next.next)  # and the one after that
# <title>Page title</title>
# Page title
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>

print(soup.head.previous)  # previous element in parse order
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>




Searching

Method                                                                                    Meaning
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)  all matching elements
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)  first matching element
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching sibling nodes after this one
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)  first matching sibling node after this one
def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching sibling nodes before this one
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs)  first matching sibling node before this one
def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching elements after this one (parse order)
def findNext(self, name=None, attrs={}, text=None, **kwargs)  first matching element after this one
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching elements before this one
def findPrevious(self, name=None, attrs={}, text=None, **kwargs)  first matching element before this one
def findParents(self, name=None, attrs={}, limit=None, **kwargs)  all matching parent nodes
def findParent(self, name=None, attrs={}, **kwargs)  first matching parent node

1. All matches

def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)

import re
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.findAll('b'))  # extract all matching elements
# [<b>one</b>, <b>two</b>]

print(soup.findAll(re.compile('^b')))  # tag names starting with 'b'
# [<body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]

print(soup.findAll(['title', 'p']))  # title and p tags
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll({'title': True, 'p': True}))  # same as above; a dict of tag names also works (legacy BS3 idiom)
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(lambda tag: len(tag.attrs) == 2))  # pass a callable that returns a boolean
print(soup.findAll(lambda tag: len(tag.name) == 1 and not tag.attrs))  # one-character tag name with no attributes
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<b>one</b>, <b>two</b>]

print(soup.findAll(align="center"))  # filter by attribute value
print(soup.findAll(id=re.compile("para$")))  # the value can be a string, regex, list, or callable
print(soup.findAll(align=["center", "blah"]))
print(soup.findAll(align=lambda value: value and len(value) < 5))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(align=True))  # match elements that have an align attribute
print(soup.findAll(align=None))  # match elements that have no align attribute
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <head><title>Page title</title></head>, <title>Page title</title>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]

print(soup.findAll(id=re.compile("para$")))  # keyword form; if the attribute name clashes with a Python reserved word, pass a dict via attrs instead
print(soup.findAll(attrs={'id': re.compile("para$")}))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(text="one"))  # match text content
print(soup.findAll(text=["one", "two"]))  # the value can be a string, regex, list, True, or callable
print(soup.findAll(text=re.compile("paragraph")))
print(soup.findAll(text=True))
print(soup.findAll(text=lambda x: len(x) < 12))
# ['one']
# ['one', 'two']
# ['This is paragraph ', 'This is paragraph ']
# ['Page title', 'This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# ['Page title', 'one', '.', 'two', '.']

print([tag.name for tag in soup.html.findAll()])  # recursive by default
print([tag.name for tag in soup.html.findAll(recursive=False)])  # direct children only, no recursion
# ['head', 'title', 'body', 'p', 'b', 'p', 'b']
# ['head', 'body']

print(soup.findAll('p', limit=1))  # maximum number of matches
print(soup.findAll('p', limit=100))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

2. First match

def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find('p'))  # extract the first matching element
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>

3. Sibling nodes

All matching sibling nodes:

def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)

First matching sibling node:

def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find(text='This is paragraph ').findNextSiblings('b'))  # all matching siblings after this node
print(soup.find(text='This is paragraph ').findNextSibling(text=lambda text: len(text) == 1))  # first matching sibling after this node
print(soup.find(text='.').findPreviousSiblings('b'))  # all matching siblings before this node
print(soup.find(text='.').findPreviousSibling(text=True))  # first matching sibling before this node
# [<b>one</b>]
# .
# [<b>one</b>]
# This is paragraph 

4. Elements before and after in parse order

All matching elements after this one:

def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)

First matching element after this one:

def findNext(self, name=None, attrs={}, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find('p').findAllNext(text=True))  # all text nodes after the first p tag, including its own children
print(soup.find('p').findNext('p'))  # the next p after the first p
print(soup.find('p').findNext('b'))  # the next b after the first p
# ['This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>
# <b>one</b>

print(soup('p')[-1].findAllPrevious(text=True))  # all text nodes before the last p, in reverse document order
print(soup('p')[-1].findPrevious('p'))
print(soup('p')[-1].findPrevious('b'))
# ['.', 'one', 'This is paragraph ', 'Page title']
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>
# <b>one</b>

5. Parent nodes

All matching parent nodes:

def findParents(self, name=None, attrs={}, limit=None, **kwargs)

First matching parent node:

def findParent(self, name=None, attrs={}, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find('b').findParents())  # all parent nodes; the last entry is the BeautifulSoup document itself
print(soup.find('b').findParent('body'))  # first matching parent node
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>]
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>




Searching by class

Because class is a reserved word in Python, pass the keyword argument class_:

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p class="firstpara" align="center">This is paragraph <b>one</b>.</p><p class="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser
print(soup(class_='firstpara'))
# [<p align="center" class="firstpara">This is paragraph <b>one</b>.</p>]
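
If the soupsieve package is available (recent bs4 releases pull it in by default), a CSS selector is an alternative way to match on class. A minimal sketch using the same soup object:

print(soup.select('p.firstpara'))  # CSS selector; assumes soupsieve is installed
# [<p align="center" class="firstpara">This is paragraph <b>one</b>.</p>]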




Pretty-printing

prettify()

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')
print(soup.prettify())
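
On this document the output looks roughly like the following: each tag and each string is placed on its own line, indented one space per nesting level (abridged here):

<html>
 <head>
  <title>
   Page title
  </title>
 </head>
 <body>
  <p align="center" id="firstpara">
   This is paragraph
   <b>
    one
   </b>
   .
  </p>
  ...
 </body>
</html>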




Parsing an HTML table

Using PrettyTable:

pip install prettytable

from bs4 import BeautifulSoup
from itertools import zip_longest
from prettytable import PrettyTable

html = '''<html><body> <table border="1"> <tr> <th>学号</th> <th>姓名</th> </tr> <tr> <td>1</td> <td>张三</td> </tr> <tr> <td>2</td> <td>李四</td> </tr> <tr> <td>3</td> <td>王五</td> </tr> </table> </body> </html> '''

soup = BeautifulSoup(html, 'lxml')
th = soup('th')  # header cells
th = [i.string for i in th]
td = soup('td')  # data cells
td = [i.string for i in td]
td = list(zip_longest(*([iter(td)] * len(th))))  # group the flat cell list into rows of len(th)
print(th)
print(td)

x = PrettyTable()
x.field_names = th  # header row
for i in td:
    x.add_row(i)  # add one row of data
print(x)
# ['学号', '姓名']
# [('1', '张三'), ('2', '李四'), ('3', '王五')]
# +------+------+
# | 学号 | 姓名 |
# +------+------+
# | 1 | 张三 |
# | 2 | 李四 |
# | 3 | 王五 |
# +------+------+
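
The zip_longest trick regroups a flat list of cells into rows. An arguably clearer alternative, sketched here with the same soup object, is to iterate over the <tr> rows directly and read the th/td cells of each row:

rows = []
for tr in soup('tr'):
    cells = [cell.string for cell in tr(['th', 'td'])]  # header or data cells of this row, in order
    rows.append(cells)
print(rows)
# [['学号', '姓名'], ['1', '张三'], ['2', '李四'], ['3', '王五']]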

Recommended reading: the Chinese documentation for PrettyTable, the Python table-formatting library.




Modifying the tree

Recommended reading: the official documentation on modifying the parse tree.
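
As a small taste of what that covers, here is a minimal sketch using standard bs4 calls (attribute assignment, string replacement, new_tag, append, decompose) on a stripped-down version of the test document:

from bs4 import BeautifulSoup

content = '<html><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p></body></html>'
soup = BeautifulSoup(content, 'lxml')

p = soup.p
p['align'] = 'left'        # change an attribute
p.b.string = 'ONE'         # replace the text inside the <b> tag
new_b = soup.new_tag('b')  # create a new tag
new_b.string = 'appended'
p.append(new_b)            # attach it to the end of the paragraph
p.b.decompose()            # remove the original <b> from the tree
print(soup.body)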




Error: bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested

pip install wheel
pip install -U lxml
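
If upgrading lxml does not help (or lxml cannot be installed at all), a fallback is to use Python's built-in parser, which needs no extra dependency:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>ok</p>', 'html.parser')  # built-in parser instead of 'lxml'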




References

  1. Beautiful Soup: We called him Tortoise because he taught us.
  2. Beautiful Soup official documentation
  3. Beautiful Soup Chinese documentation
  4. BeautifulSoup library error: bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested