第二章 复杂HTML解析

1、获取指定标签内容

from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the page and print the text of every <span class="green"> element.
# The response is opened in a `with` block so the HTTP connection is closed
# once the document has been handed to the parser.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# Collect every matching tag on the page.
namelist = bsObj.find_all('span', {'class': 'green'})
for name in namelist:
    # get_text() returns the tag's text content without the markup.
    print(name.get_text())

2、处理子标签

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Print every direct child of the table whose id is "giftList".
# The response is opened in a `with` block so the HTTP connection is closed
# once the document has been handed to the parser.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# .children iterates over direct children only, not deeper descendants.
for child in bsObj.find('table', {'id': 'giftList'}).children:
    print(child)

3、处理兄弟标签

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Print every node that follows the first row of the "giftList" table.
# The response is opened in a `with` block so the HTTP connection is closed
# once the document has been handed to the parser.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# .tr selects the first <tr> inside the table; next_siblings yields only the
# nodes after it, so that first row itself is not printed.
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)

# Sibling navigation attributes:
#   previous_siblings - all siblings before the tag
#   next_siblings     - all siblings after the tag
#   previous_sibling  - the single sibling immediately before the tag
#   next_sibling      - the single sibling immediately after the tag

4、父标签处理

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Print the text of the node that precedes the parent of a specific image.
# The response is opened in a `with` block so the HTTP connection is closed
# once the document has been handed to the parser.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# Locate the image by its src attribute, step up to its parent tag, move to
# the parent's previous sibling, and print that sibling's text.
image = bsObj.find("img", {"src": "../img/gifts/img1.jpg"})
print(image.parent.previous_sibling.get_text())
---------------------------------------------------------------------------
#打印输出
$15.00
---------------------------------------------------------------------------
(1) 选择图片标签src="../img/gifts/img1.jpg";
(2) 选择图片标签的父标签(在示例中是 <td> 标签);
(3) 选择 <td> 标签的前一个兄弟标签 previous_sibling(在示例中是包含美元价格的 <td> 标签);
(4) 选择标签中的文字,“$15.00”。

5、正则表达式

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Print the src of every <img> whose path matches the gift-image pattern.
# The response is opened in a `with` block so the HTTP connection is closed
# once the document has been handed to the parser.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# The dots are escaped so they match literal "." characters; an unescaped dot
# matches any character and would accept paths that merely resemble the
# intended "../img/gifts/img<anything>.jpg" form.
gift_image = re.compile(r"\.\./img/gifts/img.*\.jpg")
images = bsObj.find_all("img", {"src": gift_image})
for image in images:
    print(image["src"])

你可能感兴趣的:(第二章 复杂HTML解析)