Python语言学习:BeautifulSoup/bs/bs4 学习

网页采集(爬虫)中经常用到的 HTML/XML 解析库,类似于 Node.js 里的 cheerio
更新于: 2022-04-24 01:41:17

安装

pip install beautifulsoup4

常用方法列表速查表

from bs4 import BeautifulSoup
功能代码
取得页面title
soup = BeautifulSoup(html_doc, 'html.parser')

# 取得 title 标签对象(类型为 bs4.element.Tag)
print(soup.title)
# 取得 title 的字符串
print(soup.title.text)
取得html
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
select_one
soup = BeautifulSoup(html_doc, 'html.parser')
# 类似于 JS 里的 document.querySelector:返回第一个匹配的元素,没有匹配时返回 None
soup.select_one(".test_cls")
select
soup = BeautifulSoup(html_doc, 'html.parser')
# 类似于 JS 里的 document.querySelectorAll:返回所有匹配元素组成的列表
soup.select(".test_cls_list")
find_all
# 找出 doc 中所有的 a 标签
for link in soup.find_all('a'):
    print(link.get('href'))
    # http://example.com/elsie
    # http://example.com/lacie
    # http://example.com/tillie
find_all 组合条件
soup.find_all("a", attrs={"class": "sister"})
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
含有字符串
soup.find_all(string=re.compile("Dormouse"))
# [u"The Dormouse's story", u"The Dormouse's story"]
limit,限制前2条
soup.find_all("a", limit=2)
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
tagName
tag.name = "blockquote"
tag
# <blockquote class="boldest">Extremely bold</blockquote>
attrs/class等属性
# 取得属性
tag.attrs
# {'class': ['boldest']}
# class 是多值属性(一个标签可以有多个 class),所以取到的值是列表,例如 ['boldest']
tag['class']
替换字符串
# tag中包含的字符串不能编辑,但是可以被替换成其它的字符串,用 replace_with() 方法:
tag.string.replace_with("No longer bold")
contents
head_tag = soup.head
head_tag
# <head><title>The Dormouse's story</title></head>

head_tag.contents
# [<title>The Dormouse's story</title>]

title_tag = head_tag.contents[0]
title_tag
# <title>The Dormouse's story</title>
title_tag.contents
# [u"The Dormouse's story"]
.strings 和 stripped_strings
# .strings 会遍历文档中的所有字符串(包括只含空白、换行的字符串);
# .stripped_strings 会跳过全是空白的字符串,并去掉每段字符串首尾的空白
for string in soup.strings:
    print(repr(string))
    # u"The Dormouse's story"
    # u'\n\n'
    # u"The Dormouse's story"
    # u'\n\n'
    # u'Once upon a time there were three little sisters; and their names were\n'
    # u'Elsie'
    # u',\n'
    # u'Lacie'
    # u' and\n'
    # u'Tillie'
    # u';\nand they lived at the bottom of a well.'
    # u'\n\n'
    # u'...'
    # u'\n'

BeautifulSoup 格式化 html

from bs4 import BeautifulSoup
soup = BeautifulSoup('file.html')
prettified = soup.prettify(encoding="utf8")

htmlmin 压缩html

import htmlmin

with open('file.html', 'r') as f:
    content = f.read()
    minified = htmlmin.minify(content, remove_empty_space=True)

XML 的处理

pip install lxml
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.uniprot.org/uniprot/A0A482D497.xml')
soup = BeautifulSoup(response.content, 'xml')
print(soup.prettify())
功能代码
find 取得元素
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.uniprot.org/uniprot/A0A482D497.xml')
soup = BeautifulSoup(response.content, 'xml')
accession  = soup.find('accession')

print(accession)
# <accession>A0A482D497</accession>
select_one 取得单个元素
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.uniprot.org/uniprot/A0A482D497.xml')
soup = BeautifulSoup(response.content, 'xml')

submittedName = soup.select_one('protein submittedName')
print(submittedName.text)
# Uncharacterized protein

参考