Environment
PyCharm, Python 3.7
Chapter 1 Fetching a page title
Simple version
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bs = BeautifulSoup(html.read(), features="html.parser")
# Three equivalent ways to reach the first <h1> on the page
print(bs.html.body.h1)
print(bs.body.h1)
print(bs.h1)
console:
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
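Note that printing a Tag object prints the whole element, markup included. If only the inner text is wanted, BeautifulSoup provides get_text(); a small sketch (not part of the original notes):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bs = BeautifulSoup(html.read(), features="html.parser")
# get_text() strips the tags and returns only the text content
print(bs.h1.get_text())  # An Interesting Title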
Complete version
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

def getTitle(url):
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(html.read(), features="html.parser")
        title = bsObj.body.h1
    except AttributeError as e:
        print(e)
        return None
    return title
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
print(title)
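Because getTitle returns None on any failure, the caller can branch on the result; a minimal usage sketch, assuming the getTitle defined above:

title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    # Either the request or the tag lookup failed
    print("Title could not be found")
else:
    print(title)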
Chapter 2
2.1 findAll(): get all content matching a given tag
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html.read(), features="html.parser")
nameList = bsObj.findAll("span", {"class": "green"})
for name in nameList:
    print(name.get_text())

# Match spans whose class is either "green" or "red"
bsObj.findAll("span", {"class": {"green", "red"}})
# Match any of several tag names at once
bsObj.findAll({"h1", "h2", "h3"})
# The following two lines are equivalent
bsObj.findAll(id="text")
bsObj.findAll("", {"id": "text"})
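findAll also accepts attribute filters as keyword arguments. Since class is a reserved word in Python, BeautifulSoup spells the keyword form class_; a sketch of the equivalent call (not part of the original notes):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html.read(), features="html.parser")
# Equivalent to bsObj.findAll("span", {"class": "green"})
for name in bsObj.findAll("span", class_="green"):
    print(name.get_text())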
2.2 Regular expressions
Fetch images whose paths follow a specific pattern
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html.read(), features="html.parser")
# Match images whose src starts with ../img/gifts/img and ends with .jpg
images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
console:
../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg
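Attribute filters are not limited to compiled regular expressions: findAll also accepts a plain function that receives the attribute value and returns True for a match. The same image filter written with a predicate, as an alternative sketch (not from the original notes):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html.read(), features="html.parser")
# The lambda is called with each tag's src value (or None if absent)
images = bsObj.findAll("img", {"src": lambda s: s is not None
                               and s.startswith("../img/gifts/img")
                               and s.endswith(".jpg")})
for image in images:
    print(image["src"])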
Chapter 3 Bulk scraping
3.1 Traversing a single domain
Get all links on a page
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html.read(), features="html.parser")
links = bsObj.findAll("a")
for link in links:
    if "href" in link.attrs:
        print(link.attrs['href'])

# Tighter filter: keep only article links inside the body content.
# The pattern matches hrefs that start with /wiki/ and contain no colon,
# which excludes special pages such as File: and Talk:.
links = bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
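To see what the colon rule actually filters out, the pattern can be checked on its own; an illustrative test (not part of the original notes):

import re

pattern = re.compile("^(/wiki/)((?!:).)*$")
print(bool(pattern.match("/wiki/Kevin_Bacon")))   # True:  regular article link
print(bool(pattern.match("/wiki/File:Foo.jpg")))  # False: colon marks a special page
print(bool(pattern.match("/w/index.php")))        # False: not under /wiki/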
Iteratively follow links from page to page
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html.read(), features="html.parser")
    return bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

random.seed(datetime.datetime.now())
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    # Follow a randomly chosen article link from the current page
    newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
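One caveat: the loop above only terminates if it lands on a page with no qualifying links, so the random walk can run indefinitely. A minimal variation that caps the number of hops (the cap is an assumption for illustration, reusing the getLinks defined above):

MAX_HOPS = 10  # hypothetical limit, not from the original notes
links = getLinks("/wiki/Kevin_Bacon")
for _ in range(MAX_HOPS):
    if len(links) == 0:
        break
    newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)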