"""Another serving of BeautifulSoup: tutorial notes and runnable examples.

Running this script downloads two demo pages from pythonscraping.com and
prints, in order:

1. how many text nodes are exactly 'the prince' in the War and Peace page,
2. the text of the element whose id is "text" on that page,
3. the children of the ``giftList`` table on the gift page,
4. the siblings following the table's first row,
5. the text of the cell preceding the cell that holds ``img1.jpg``,
6. the ``src`` of every product image matched by a regular expression.
"""

import re
from urllib.request import urlopen

from bs4 import BeautifulSoup


def _fetch_soup(url):
    """Download *url* and return it parsed with the built-in html.parser.

    The HTTP response is closed as soon as BeautifulSoup has consumed it,
    so no connection is left open for the rest of the script.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response, "html.parser")


# --- Getting character names from "War and Peace" --------------------------
bsObj = _fetch_soup("http://www.pythonscraping.com/pages/warandpeace.html")

# Alternative: print every name wrapped in <span class="green">:
# namelist = bsObj.find_all("span", {"class": "green"})
# for name in namelist:
#     print(name.get_text())

# `string=` is the current spelling of the older `text=` keyword; it matches
# text nodes whose content is exactly 'the prince'.
name_number = bsObj.find_all(string='the prince')
print(len(name_number))

# bsObj.find_all(id="text") is equivalent to bsObj.find_all("", {"id": "text"}).
allText = bsObj.find_all(id="text")
print(allText[0].get_text())
# get_text() strips every tag from the HTML being processed and returns a
# string containing only the text. As a rule, call get_text() last, right
# before printing, storing or manipulating the final data.

# --- find() and find_all() -------------------------------------------------
# Their definitions (as given in these notes) are:
#     find_all(tag, attributes, recursive, text, limit, keywords)
#     find(tag, attributes, recursive, text, keywords)
# find() is equivalent to find_all() with limit=1.
# Set `limit` when only the first X results on the page are of interest, but
# note that those results come back in page order, which is not necessarily
# the set of items you actually want.

# --- Other BeautifulSoup objects -------------------------------------------
# 1. NavigableString: represents the text inside a tag.
# 2. Comment: used to find HTML comment tags in the document.

# The four examples below all read the same page, so it is downloaded and
# parsed once and the (unmodified) tree is reused.
bsObj = _fetch_soup("http://www.pythonscraping.com/pages/page3.html")

# --- Children and descendants ----------------------------------------------
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)

# --- Siblings --------------------------------------------------------------
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)

# --- Parents ---------------------------------------------------------------
# From the image, go up to its parent, step to the sibling just before that
# parent, and print that sibling's text.
print(
    bsObj.find("img", {"src": "../img/gifts/img1.jpg"})
    .parent.previous_sibling.get_text()
)

# --- Regular expressions ---------------------------------------------------
# Find product images by the file path in their src attribute. The pattern
# is a raw string so the backslashes reach the regex engine untouched.
images = bsObj.find_all(
    "img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")}
)
for image in images:
    print(image["src"])