1. Using tag names and attributes
import requests
from bs4 import BeautifulSoup

url = "http://www.runoob.com/html/html-intro.html"
r = requests.get(url)
# requests sometimes mis-detects the page encoding, so re-encode with the
# detected charset and decode the bytes again as UTF-8
html = r.text.encode(r.encoding).decode()
soup = BeautifulSoup(html, "lxml")

# match any of several tag names in one call
soup.findAll(name={"h1", "h2", "h3"})

# recursive=False restricts the search to the direct children of <body>
len(soup.body.findAll("div", recursive=False))

# match <div> tags whose class is "article" or "container navigation"
divs = soup.findAll("div", attrs={"class": {"article", "container navigation"}})
divs[1].findAll("h2")
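findAll returns a list-like ResultSet of Tag objects. A minimal sketch of iterating over one of these results and printing the heading text, assuming the same soup object built above:

# iterate the matched headings and print each tag name with its stripped text
for heading in soup.findAll(name={"h1", "h2", "h3"}):
    print(heading.name, heading.get_text(strip=True))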
2. Using text and keyword arguments
import requests
from bs4 import BeautifulSoup
import re

url = "http://www.runoob.com/html/html-intro.html"
r = requests.get(url)
html = r.text.encode(r.encoding).decode()
soup = BeautifulSoup(html, "lxml")

# tags of any name whose text is exactly "HTML 标签" (a Chinese heading on this page)
soup.findAll(re.compile(""), text="HTML 标签")

# headings whose text starts with "HTML"
soup.findAll({"h1", "h2", "h3", "h4"}, text=re.compile("^HTML"))

# class is a Python keyword, so BeautifulSoup accepts class_ as the keyword argument
soup.findAll(class_={"article", "container navigation"})
soup.findAll("div", id={"footer"})
3. Using lambda expressions
import requests
from bs4 import BeautifulSoup

url = "http://www.runoob.com/html/html-intro.html"
r = requests.get(url)
html = r.text.encode(r.encoding).decode()
soup = BeautifulSoup(html, "lxml")

# a callable passed to findAll receives each tag and keeps it when it returns True:
# here, <h2> tags that carry no attributes at all
soup.findAll(lambda tag: tag.name == "h2" and len(tag.attrs) == 0)

# the same filter written as a list comprehension and with filter()
[x for x in soup.findAll("h2") if len(x.attrs) == 0]
list(filter(lambda tag: len(tag.attrs) == 0, soup.findAll("h2")))
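The callable can test anything about a tag, not only its attribute count; a minimal sketch that keeps links whose href starts with "http", assuming the same soup object as above:

# keep only <a> tags whose href attribute begins with "http"
external_links = soup.findAll(
    lambda tag: tag.name == "a" and tag.get("href", "").startswith("http")
)
len(external_links)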
4. Using regular expressions
import requests
from bs4 import BeautifulSoup
import re

url = "http://www.runoob.com/html/html-intro.html"
r = requests.get(url)
html = r.text.encode(r.encoding).decode()
soup = BeautifulSoup(html, "lxml")

# tag names matching a regular expression: h1, h2, ..., h9
soup.findAll(re.compile("h[1-9]"))

# headings whose text contains "HTML" or "html"
soup.findAll(re.compile("h[1-9]"), text=re.compile(".*(HTML|html).*"))

# links whose href starts with "//www" or "https://www"
soup.findAll("a", attrs={"href": re.compile("^(//www|https://www).*")})
5. Using the navigation tree
import requests
from bs4 import BeautifulSoup

url = "http://www.runoob.com/html/html-intro.html"
r = requests.get(url)
html = r.text.encode(r.encoding).decode()
soup = BeautifulSoup(html, "lxml")

# children yields only direct children; descendants walks the whole subtree
len(list(soup.body.children))
len(list(soup.body.descendants))

# siblings that follow the first <div> under <body>, and that <div>'s parent tag name
len(list(soup.body.find("div").next_siblings))
soup.body.find("div").parent.name
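The tree can also be walked upward, not just sideways and downward; a minimal sketch listing the names of all ancestors of that first <div>, assuming the same soup object as above:

# names of every ancestor of the first <div> under <body>,
# ending with "[document]" for the BeautifulSoup object itself
[p.name for p in soup.body.find("div").parents]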