如果已经安装了 Anaconda 集成环境,则不需要再次安装 BeautifulSoup。
如果没有,则使用 pip 命令安装:
pip install beautifulsoup4 -i https://pypi.tuna.tsinghua.edu.cn/simple/
html
在网页上单击右键,选择“查看网页源代码”。
使用 requests 库:
# Fetch a page with requests and print its HTML re-indented by BeautifulSoup.
import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent; some servers reject the default
# python-requests one.
kv = {'user-agent': 'Mozilla/5.0'}
url = "https://blog.csdn.net/qq_43321732"

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP errors are reported here; anything else is a real bug
    # and should surface normally.
    print("爬取失败")
else:
    # Parse only when the download succeeded, so `demo` is always defined here.
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    print(soup.prettify())
# Fetch a page and print its <title> tag.
import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent; some servers reject the default
# python-requests one.
kv = {'user-agent': 'Mozilla/5.0'}
url = "https://ac.nowcoder.com/acm/contest/5666"

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP errors are reported here; anything else is a real bug
    # and should surface normally.
    print("爬取失败")
else:
    # Use the encoding detected from the body so r.text decodes correctly.
    r.encoding = r.apparent_encoding
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    print(soup.title)
# Fetch a page and print the first <a> tag found in the document.
import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent; some servers reject the default
# python-requests one.
kv = {'user-agent': 'Mozilla/5.0'}
url = "https://ac.nowcoder.com/acm/contest/5666"

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP errors are reported here; anything else is a real bug
    # and should surface normally.
    print("爬取失败")
else:
    # Use the encoding detected from the body so r.text decodes correctly.
    r.encoding = r.apparent_encoding
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    # soup.a is the first <a> tag in the document (None if there is none).
    tag = soup.a
    print(tag)
# Fetch a page and inspect the attributes of its first <a> tag.
import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent; some servers reject the default
# python-requests one.
kv = {'user-agent': 'Mozilla/5.0'}
url = "https://ac.nowcoder.com/acm/contest/5666"

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP errors are reported here; anything else is a real bug
    # and should surface normally.
    print("爬取失败")
else:
    # Use the encoding detected from the body so r.text decodes correctly.
    r.encoding = r.apparent_encoding
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    # soup.a is the first <a> tag in the document.
    tag = soup.a
    # All attributes of the tag.
    print(tag.attrs)
    print(tag.attrs['class'])
    # The link (href) attribute.
    print(tag.attrs['href'])
    # Type of the attribute container (a dict).
    print(type(tag.attrs))
    # Type of the tag object itself.
    print(type(tag))
# Fetch a page and walk downward from <head>/<body> via the .contents lists.
import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent; some servers reject the default
# python-requests one.
kv = {'user-agent': 'Mozilla/5.0'}
url = "https://ac.nowcoder.com/acm/contest/5666"

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP errors are reported here; anything else is a real bug
    # and should surface normally.
    print("爬取失败")
else:
    # Use the encoding detected from the body so r.text decodes correctly.
    r.encoding = r.apparent_encoding
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    print(soup.head)
    # .contents is the list of a tag's direct children.
    print(soup.head.contents)
    print(soup.body.contents)
    print(len(soup.head.contents))
    # Second direct child of <head>.
    print(soup.head.contents[1])
平行遍历必须发生在同一个父节点下。
svg