Beautiful Soup: a Python HTML Parsing Library

Introduction

Beautiful Soup is an HTML/XML parser for Python. It handles malformed markup gracefully and builds a parse tree.

It provides simple, practical operations for navigating, searching, and modifying the parse tree, which saves a great deal of programming time.





Installation

pip install lxml beautifulsoup4
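
A quick sanity check, assuming both packages installed correctly, is to import bs4 and build a tiny soup with the lxml tree builder (bs4.FeatureNotFound is raised if lxml is missing):

import bs4

print(bs4.__version__)  # installed Beautiful Soup version
bs4.BeautifulSoup('<p>ok</p>', 'lxml')  # raises bs4.FeatureNotFound if the lxml builder is unavailable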




Quick start

Test page:

<html>
<head><title>Page title</title></head>
<body>
<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
</body>
</html>

Code:

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.contents[0].name)  # name of the first child node
# 'html'
print(soup.contents[0].contents[0].name)  # name of the first child's first child
# 'head'

head = soup.contents[0].contents[0]
print(head.parent.name)  # parent node
# 'html'

print(head.next)  # next element in parse order
# <title>Page title</title>

print(head.nextSibling.name)  # name of the next sibling node
# 'body'

print(head.nextSibling.contents[0])
# <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>

print(head.nextSibling.contents[0].nextSibling)
# <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>




Searching tags and attributes

  1. Dotted access (e.g. soup.head.title): get a node by tag name
  2. .string: get the text content
  3. Calling the soup object, e.g. soup('p'): find tags (shorthand for findAll)
import re
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

titleTag = soup.html.head.title  # get a node via dotted access
print(titleTag)
# <title>Page title</title>

print(titleTag.string)  # get the text content
# 'Page title'

print(soup('p'))  # find tags; calling the soup object is shorthand for findAll
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll('p', align="center"))  # find all with the given attribute; equivalent to soup('p', align="center")
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
print(soup('p', align="center"))  # same as above

print(soup.find('p', align="center"))  # find only the first match
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>

print(soup('p', align="center")[0]['id'])  # get the id attribute
# 'firstpara'

print(soup.find('p', align=re.compile('^b.*'))['id'])  # find the p whose align attribute starts with 'b'
# 'secondpara'

print(soup.find('p').b.string)  # first p element → text of its b child
# 'one'

print(soup('p')[1].b.string)  # all p elements → the second one → text of its b child
# 'two'




Navigation

Attribute        Meaning
parent           parent node
contents         list of child nodes
string           string content
nextSibling      next sibling node
previousSibling  previous sibling node
next             next element in parse order
previous         previous element in parse order
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.head.parent)  # parent node
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>

print(soup.head.contents)  # child nodes
print(soup.p.contents)  # child nodes
# [<title>Page title</title>]
# ['This is paragraph ', <b>one</b>, '.']

print(soup.b.string)  # string content
# one

print(soup.head.nextSibling)  # next sibling node
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>

print(soup.body.previousSibling)  # previous sibling node
# <head><title>Page title</title></head>

print(soup.head.next)  # next element in parse order
print(soup.head.next.next)  # and the one after that
print(soup.head.next.next.next)  # and the one after that
# <title>Page title</title>
# Page title
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>

print(soup.head.previous)  # previous element in parse order
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>




Searching

Method                                                                                    Meaning
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)  all matching elements
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)  first matching element
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching sibling nodes after this one
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)  first matching sibling node after this one
def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching sibling nodes before this one
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs)  first matching sibling node before this one
def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching elements after this one (parse order)
def findNext(self, name=None, attrs={}, text=None, **kwargs)  first matching element after this one
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs)  all matching elements before this one
def findPrevious(self, name=None, attrs={}, text=None, **kwargs)  first matching element before this one
def findParents(self, name=None, attrs={}, limit=None, **kwargs)  all matching parent nodes
def findParent(self, name=None, attrs={}, **kwargs)  first matching parent node

1. All matches

def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)

import re
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.findAll('b'))  # extract all matching elements
# [<b>one</b>, <b>two</b>]

print(soup.findAll(re.compile('^b')))  # tag names starting with 'b'
# [<body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]

print(soup.findAll(['title', 'p']))  # title and p tags
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll({'title': True, 'p': True}))  # same as above; a dict of tag names also works (legacy BS3 idiom)
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(lambda tag: len(tag.attrs) == 2))  # pass a callable that returns a boolean
print(soup.findAll(lambda tag: len(tag.name) == 1 and not tag.attrs))  # one-character tag name with no attributes
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<b>one</b>, <b>two</b>]

print(soup.findAll(align="center"))  # filter by attribute value
print(soup.findAll(id=re.compile("para$")))  # the value can be a string, regex, list, or callable
print(soup.findAll(align=["center", "blah"]))
print(soup.findAll(align=lambda value: value and len(value) < 5))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(align=True))  # match elements that have an align attribute
print(soup.findAll(align=None))  # match elements that have no align attribute
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <head><title>Page title</title></head>, <title>Page title</title>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]

print(soup.findAll(id=re.compile("para$")))  # keyword form; if the attribute name clashes with a Python reserved word, pass a dict via attrs instead
print(soup.findAll(attrs={'id': re.compile("para$")}))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(text="one"))  # match text content
print(soup.findAll(text=["one", "two"]))  # the value can be a string, regex, list, True, or callable
print(soup.findAll(text=re.compile("paragraph")))
print(soup.findAll(text=True))
print(soup.findAll(text=lambda x: len(x) < 12))
# ['one']
# ['one', 'two']
# ['This is paragraph ', 'This is paragraph ']
# ['Page title', 'This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# ['Page title', 'one', '.', 'two', '.']

print([tag.name for tag in soup.html.findAll()])  # recursive by default
print([tag.name for tag in soup.html.findAll(recursive=False)])  # direct children only, no recursion
# ['head', 'title', 'body', 'p', 'b', 'p', 'b']
# ['head', 'body']

print(soup.findAll('p', limit=1))  # maximum number of matches
print(soup.findAll('p', limit=100))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

2. First match

def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find('p'))  # extract the first matching element
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>

3. Sibling nodes

All matching sibling nodes:

def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)

First matching sibling node:

def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find(text='This is paragraph ').findNextSiblings('b'))  # all matching siblings after this node
print(soup.find(text='This is paragraph ').findNextSibling(text=lambda text: len(text) == 1))  # first matching sibling after this node
print(soup.find(text='.').findPreviousSiblings('b'))  # all matching siblings before this node
print(soup.find(text='.').findPreviousSibling(text=True))  # first matching sibling before this node
# [<b>one</b>]
# .
# [<b>one</b>]
# This is paragraph 

4. Elements before and after in parse order

All matching elements after this one:

def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)

First matching element after this one:

def findNext(self, name=None, attrs={}, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find('p').findAllNext(text=True))  # all text nodes after the first p tag, including its own children
print(soup.find('p').findNext('p'))  # the next p after the first p
print(soup.find('p').findNext('b'))  # the next b after the first p
# ['This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>
# <b>one</b>

print(soup('p')[-1].findAllPrevious(text=True))  # all text nodes before the last p, in reverse document order
print(soup('p')[-1].findPrevious('p'))
print(soup('p')[-1].findPrevious('b'))
# ['.', 'one', 'This is paragraph ', 'Page title']
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>
# <b>one</b>

5. Parent nodes

All matching parent nodes:

def findParents(self, name=None, attrs={}, limit=None, **kwargs)

First matching parent node:

def findParent(self, name=None, attrs={}, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser

print(soup.find('b').findParents())  # all parent nodes; the last entry is the BeautifulSoup document itself
print(soup.find('b').findParent('body'))  # first matching parent node
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>]
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>




Searching by class

Because class is a reserved word in Python, pass the keyword argument class_:

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p class="firstpara" align="center">This is paragraph <b>one</b>.</p><p class="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # use the lxml parser
print(soup(class_='firstpara'))
# [<p align="center" class="firstpara">This is paragraph <b>one</b>.</p>]
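
If the soupsieve package is available (recent bs4 releases pull it in by default), a CSS selector is an alternative way to match on class. A minimal sketch using the same soup object:

print(soup.select('p.firstpara'))  # CSS selector; assumes soupsieve is installed
# [<p align="center" class="firstpara">This is paragraph <b>one</b>.</p>]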




Pretty-printing

prettify()

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')
print(soup.prettify())
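
On this document the output looks roughly like the following: each tag and each string is placed on its own line, indented one space per nesting level (abridged here):

<html>
 <head>
  <title>
   Page title
  </title>
 </head>
 <body>
  <p align="center" id="firstpara">
   This is paragraph
   <b>
    one
   </b>
   .
  </p>
  ...
 </body>
</html>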




Parsing an HTML table

Using PrettyTable:

pip install prettytable

from bs4 import BeautifulSoup
from itertools import zip_longest
from prettytable import PrettyTable

html = '''<html><body> <table border="1"> <tr> <th>学号</th> <th>姓名</th> </tr> <tr> <td>1</td> <td>张三</td> </tr> <tr> <td>2</td> <td>李四</td> </tr> <tr> <td>3</td> <td>王五</td> </tr> </table> </body> </html> '''

soup = BeautifulSoup(html, 'lxml')
th = soup('th')  # header cells
th = [i.string for i in th]
td = soup('td')  # data cells
td = [i.string for i in td]
td = list(zip_longest(*([iter(td)] * len(th))))  # group the flat cell list into rows of len(th)
print(th)
print(td)

x = PrettyTable()
x.field_names = th  # header row
for i in td:
    x.add_row(i)  # add one row of data
print(x)
# ['学号', '姓名']
# [('1', '张三'), ('2', '李四'), ('3', '王五')]
# +------+------+
# | 学号 | 姓名 |
# +------+------+
# | 1 | 张三 |
# | 2 | 李四 |
# | 3 | 王五 |
# +------+------+
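
The zip_longest trick regroups a flat list of cells into rows. An arguably clearer alternative, sketched here with the same soup object, is to iterate over the <tr> rows directly and read the th/td cells of each row:

rows = []
for tr in soup('tr'):
    cells = [cell.string for cell in tr(['th', 'td'])]  # header or data cells of this row, in order
    rows.append(cells)
print(rows)
# [['学号', '姓名'], ['1', '张三'], ['2', '李四'], ['3', '王五']]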

Recommended reading: the Chinese documentation for PrettyTable, the Python table-formatting library.




Modifying the tree

Recommended reading: the official documentation on modifying the parse tree.
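
As a small taste of what that covers, here is a minimal sketch using standard bs4 calls (attribute assignment, string replacement, new_tag, append, decompose) on a stripped-down version of the test document:

from bs4 import BeautifulSoup

content = '<html><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p></body></html>'
soup = BeautifulSoup(content, 'lxml')

p = soup.p
p['align'] = 'left'        # change an attribute
p.b.string = 'ONE'         # replace the text inside the <b> tag
new_b = soup.new_tag('b')  # create a new tag
new_b.string = 'appended'
p.append(new_b)            # attach it to the end of the paragraph
p.b.decompose()            # remove the original <b> from the tree
print(soup.body)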




Error: bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested

pip install wheel
pip install -U lxml
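
If upgrading lxml does not help (or lxml cannot be installed at all), a fallback is to use Python's built-in parser, which needs no extra dependency:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>ok</p>', 'html.parser')  # built-in parser instead of 'lxml'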




References

  1. Beautiful Soup: We called him Tortoise because he taught us.
  2. Beautiful Soup official documentation
  3. Beautiful Soup Chinese documentation
  4. BeautifulSoup library error: bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested