Python语言学习:BeautifulSoup/bs/bs4 学习
采集中经常用到的,类似于nodejs里的 cheerio
安装 pip install beautifulsoup4
常用方法列表速查表 from bs4 import BeautifulSoup
功能 代码 取得页面title soup = BeautifulSoup(html_doc, 'html.parser')
# 取得 bs4.Element.Tag
print(soup.title)
# 取得 title 的字符串
print(soup.title.text)
取得html soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
select_one soup = BeautifulSoup(html_doc, 'html.parser')
# 类似于JS里的doc.querySelector
soup.select_one(".test_cls")
select soup = BeautifulSoup(html_doc, 'html.parser')
# 类似于JS里的doc.querySelectorAll
soup.select(".test_cls_list")
find_all # 找出 doc 中所有的 a 标签
for link in soup.find_all('a'):
print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
find_all 组合条件 soup.find_all("a", attrs={"class": "sister"})
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
含有字符串 soup.find_all(string=re.compile("Dormouse"))
# [u"The Dormouse's story", u"The Dormouse's story"]
limit,限制前2条 soup.find_all("a", limit=2)
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
tagName tag.name = "blockquote"
tag
# <blockquote class="boldest">Extremely bold</blockquote>
attrs/class等属性 # 取得属性
tag.attrs
# {u'class': u'boldest'}
# class,这种是特殊的
tag['class']
替换字符串 # tag中包含的字符串不能编辑,但是可以被替换成其它的字符串,用 replace_with() 方法:
tag.string.replace_with("No longer bold")
contents head_tag = soup.head
head_tag
# <head><title>The Dormouse's story</title></head>
head_tag.contents
# [<title>The Dormouse's story</title>]
title_tag = head_tag.contents[0]
title_tag
# <title>The Dormouse's story</title>
title_tag.contents
# [u"The Dormouse's story"]
.strings 和 stripped_strings # .strings 返回文档中的所有字符串(包括只含空白的);stripped_strings 会去掉每个字符串首尾的空白,并跳过只含空白的字符串
for string in soup.strings:
print(repr(string))
# u"The Dormouse's story"
# u'\n\n'
# u"The Dormouse's story"
# u'\n\n'
# u'Once upon a time there were three little sisters; and their names were\n'
# u'Elsie'
# u',\n'
# u'Lacie'
# u' and\n'
# u'Tillie'
# u';\nand they lived at the bottom of a well.'
# u'\n\n'
# u'...'
# u'\n'
用 BeautifulSoup
格式化 html
from bs4 import BeautifulSoup
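# 注意:BeautifulSoup('file.html') 会把字符串 "file.html" 本身当作 HTML 解析,而不是读取文件,所以要先打开文件
with open('file.html', encoding='utf8') as f:
    soup = BeautifulSoup(f, 'html.parser')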
prettified = soup.prettify(encoding="utf8")
用 htmlmin
压缩html
import htmlmin
with open('file.html', 'r') as f:
content = f.read()
minified = htmlmin.minify(content, remove_empty_space=True)
XML 的处理 pip install lxml
import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.uniprot.org/uniprot/A0A482D497.xml')
soup = BeautifulSoup(response.content, 'xml')
print(soup.prettify())
功能 代码 find
取得元素 import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.uniprot.org/uniprot/A0A482D497.xml')
soup = BeautifulSoup(response.content, 'xml')
accession = soup.find('accession')
print(accession)
# <accession>A0A482D497</accession>
select_one
取得单个元素 import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.uniprot.org/uniprot/A0A482D497.xml')
soup = BeautifulSoup(response.content, 'xml')
submittedName = soup.select_one('protein submittedName')
print(submittedName.text)
# Uncharacterized protein
参考