Python Web Scraping

Environment

PyCharm, Python 3.7

Chapter 1 Getting the Title

Simple version

from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bs = BeautifulSoup(html.read(), features="html.parser")
# The following three calls are equivalent: BeautifulSoup lets intermediate
# tags be omitted when the path to the target is unambiguous
print(bs.html.body.h1)
print(bs.body.h1)
print(bs.h1)

console:

<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>


Complete version

from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup


def getTitle(url):
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(html.read(), features="html.parser")
        # bsObj.body is None on a badly formed page, so .h1 raises AttributeError
        title = bsObj.body.h1
    except AttributeError as e:
        print(e)
        return None
    return title


title = getTitle("http://www.pythonscraping.com/pages/page1.html")
print(title)
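getTitle() returns None on any failure, so the caller can guard before printing (a small addition to the snippet above):

title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)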

Chapter 2

2.1 findAll(): get all elements matching a tag

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html.read(), features="html.parser")
nameList = bsObj.findAll("span", {"class": "green"})
for name in nameList:
    print(name.get_text())
    

bsObj.findAll("span", {"class":{"green","red"}})
bsObj.findAll({"h1","h2","h3"})

# 下面两句等价
bsObj.findAll(id="text")
bsObj.findAll("",{"id":"text"})
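findAll() returns a list of every match; when only the first is needed, find() is equivalent to findAll() with limit=1 but returns the element itself (or None):

bsObj.find("span", {"class": "green"})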

2.2 Regular expressions

Get images whose src follows a specific path pattern

import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html.read(), features="html.parser")
# Match src values that start with ../img/gifts/img and end with .jpg
images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])

console:

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg
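Besides strings and regular expressions, findAll() also accepts a function as a filter; for example, to select every tag that carries exactly two attributes:

bsObj.findAll(lambda tag: len(tag.attrs) == 2)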

Chapter 3 Crawling

3.1 Traversing a single domain

Get all the links on a page

import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html.read(), features="html.parser")
links = bsObj.findAll("a")
for link in links:
    if "href" in link.attrs:
        print(link.attrs['href'])

# Refined filter: keep only internal article links (see the note below)
links = bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

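The pattern ^(/wiki/)((?!:).)*$ matches hrefs that start with /wiki/ and contain no colon, which filters out namespace pages such as /wiki/Talk:Kevin_Bacon; restricting the search to the bodyContent div also drops sidebar and footer links.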

Iteratively follow links from page to page

from urllib.request import urlopen

from bs4 import BeautifulSoup
import re
import datetime
import random


def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html.read(), features="html.parser")
    return bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))


# Seed with the current time so each run takes a different random walk
random.seed(datetime.datetime.now())
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    # Follow a randomly chosen article link from the current page
    newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)

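The random walk above can revisit pages and only stops if it hits a page with no article links. A minimal sketch (my addition, reusing getLinks() and the imports from the block above) that fetches each article at most once by remembering hrefs in a set:

visited = {"/wiki/Kevin_Bacon"}
queue = ["/wiki/Kevin_Bacon"]
while queue:
    pageUrl = queue.pop()
    for link in getLinks(pageUrl):
        href = link.attrs["href"]
        if href not in visited:
            visited.add(href)  # record before enqueueing so each page is fetched once
            print(href)
            queue.append(href)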
