1. 在指定网站爬取指定class的信息:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the sample page and parse it.
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
# Name the parser explicitly (as the later examples in these notes do) so the
# result does not depend on which parser libraries happen to be installed.
bsObj = BeautifulSoup(html, "html.parser")
# Collect every <span> whose class attribute is "green" and print its text.
nameList = bsObj.findAll("span", {"class":"green"})
for name in nameList:
    print(name.get_text())
2. find和findAll函数的情况
findAll(tag,attributes,recursive,text,limit,keywords)
find(tag,attributes,recursive,text,keywords)
tag 为标签名称
findAll({"h1","h2","h3"})
attributes 是对应的属性值
# Keep only <span> tags whose class attribute is "green".
nameList = bsObj.findAll("span", {"class":"green"})
recursive 是布尔值
True 会查找所有后代标签
False 就只查一级子标签
text是用标签的文本内容去匹配
比如:
# Match on text content instead of a tag name, then report how many hits there were.
nameList = bsObj.findAll(text="the prince")
print(len(nameList))
limit
find 等价于 findAll 的limit等于1的情形
keywords
选取那些具有指定属性的标签
# A keyword argument filters on an attribute: here, elements whose id is "text".
allText = bsObj.findAll(id="text")
# Print the text of the first match.
print(allText[0].get_text())
3. 处理子标签和后代标签
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")
# Look the table up once; both loops below walk the same element.
giftTable = bsObj.find("table", {"id": "giftList"})
# .children yields only the direct children of the <table> tag.
for child in giftTable.children:
    print(child)
# .descendants yields everything nested anywhere below the <table> tag.
for child in giftTable.descendants:
    print(child)
4. 处理兄弟标签
# .tr is the first <tr> inside the table; next_siblings yields only what comes
# after it. That row itself is not printed, because a tag is not its own
# sibling.
for child in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(child)
# Related attributes: previous_siblings, next_sibling and previous_sibling.
5. 父标签
# Locate the <img> by its src, step up to its parent tag, move to that
# parent's previous sibling, and print the sibling's text.
imgTag = bsObj.find("img", {"src": "../img/gifts/img1.jpg"})
print(imgTag.parent.previous_sibling.get_text())