Web Scraping: Assorted Templates

Running BeautifulSoup


import urllib.request
import urllib.error
from bs4 import BeautifulSoup

def get_title(url):
    try:
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        # The page does not exist on the server, or the server itself is unreachable
        print(e)
        return None
    try:
        html = response.read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        title = soup.body.h1
    except AttributeError:
        # The tag does not exist
        return None
    return title

if __name__ == "__main__":
    url = "http://www.pythonscraping.com/exercises/exercise1.html"
    title = get_title(url)
    if title is None:
        print("Title could not be found")
    else:
        print(title)

We define a get_title function that returns the page's title, or a None object if anything goes wrong while fetching the page. Inside get_title we check for HTTPError as well as URLError (raised, for example, when the URL is malformed or the server is unreachable), and we wrap the BeautifulSoup code in its own try statement. If either of those two lines fails, an AttributeError may be raised (if the server did not exist, html would be a None object and html.read() would raise an AttributeError). In fact, we could put any number of lines inside the try statement, or call a function that might raise an AttributeError at any point.
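A minimal sketch of that last point, assuming a hypothetical helper named extract_title that is not part of the example above:

def extract_title(soup):
    # Raises AttributeError if body or h1 is missing
    return soup.body.h1

def get_title_via_helper(soup):
    try:
        # The try block catches the AttributeError no matter how deep it is raised
        return extract_title(soup)
    except AttributeError:
        return None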

Using BeautifulSoup's find() and findAll()

from urllib.request import urlopen
from bs4 import BeautifulSoup

bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/warandpeace.html"), "html.parser")

# nameList = bsObj.findAll("span", {"class":"green"})
nameList = bsObj.findAll("span", class_="green")
for name in nameList:
    print(name.get_text())

For more details, see: http://blog.csdn.net/chaowanghn/article/details/54646683
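A quick sketch of how find() and findAll() differ (the variable names are illustrative; the tag-list and limit arguments are standard BeautifulSoup parameters):

from urllib.request import urlopen
from bs4 import BeautifulSoup

bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/warandpeace.html"), "html.parser")

# find() returns the first match or None; findAll() returns a list of matches
first_green = bsObj.find("span", {"class": "green"})
all_greens = bsObj.findAll("span", {"class": "green"})
print(first_green.get_text(), len(all_greens))

# findAll() also accepts a list of tag names and a result limit
print(bsObj.findAll(["h1", "h2"], limit=2))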

Handling child, sibling, and parent tags

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")

# Direct children of the gift table
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")

# Rows after the first table row (the first row itself is excluded)
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")

# The price cell immediately before the cell containing the image
print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
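A couple of related navigation attributes, sketched against the same page (these are standard BeautifulSoup attributes):

from urllib.request import urlopen
from bs4 import BeautifulSoup

bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/page3.html"), "html.parser")
table = bsObj.find("table", {"id": "giftList"})

# .children yields only direct children; .descendants walks the whole subtree
print(len(list(table.children)), len(list(table.descendants)))

# .parent climbs one level up the tree
print(table.parent.name)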

Finding images

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")
# Use a raw string for the regex to avoid invalid-escape warnings
images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
    # print(image.attrs["src"])
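findAll() also accepts a plain function as a filter; a sketch of a similar search without a regular expression (note this matches every .jpg image, slightly broader than the path-anchored regex above):

from urllib.request import urlopen
from bs4 import BeautifulSoup

bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/page3.html"), "html.parser")

# The function is called once per tag; tags for which it returns True are kept
images = bsObj.findAll(lambda tag: tag.name == "img"
                       and tag.get("src", "").endswith(".jpg"))
for image in images:
    print(image["src"])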

Finding links

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def getLinks(articleUrl):
    html = urlopen(articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    # findAll("a") returns every link; find("a") would return only the first one
    links = bsObj.findAll("a")
    # links = bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
    for link in links:
        if "href" in link.attrs:
            print(link.attrs["href"])

getLinks("http://en.wikipedia.org/wiki/Kevin_Bacon")
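The commented-out line restricts the search to internal article links (paths starting with /wiki/ and containing no colon). As a hedged sketch, the same filter can drive a simple random walk between articles; the three-hop limit here is an added assumption, not part of the original:

import random
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getWikiLinks(articlePath):
    html = urlopen("http://en.wikipedia.org" + articlePath)
    bsObj = BeautifulSoup(html, "html.parser")
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = getWikiLinks("/wiki/Kevin_Bacon")
for _ in range(3):  # follow three random internal links
    newPath = random.choice(links).attrs["href"]
    print(newPath)
    links = getWikiLinks(newPath)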

Changing request headers

# Add headers so the server identifies the request as a browser visit rather than a bot.
# When browsing a site with a mobile device, you usually see a simplified version of
# the site without ads, Flash, and other distractions.
import urllib.request

url = "http://baidu.com"
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53"

req = urllib.request.Request(url, headers=headers)
# req = urllib.request.Request(url, data, headers)
# or use Request.add_header(key, value):
# req = urllib.request.Request(url, data)
# req.add_header("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53")

response = urllib.request.urlopen(req)
html = response.read().decode("utf-8")

print(html)
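To confirm the header was actually sent, a quick sketch against httpbin.org, a public request-echo service (any echo endpoint would do):

import urllib.request

req = urllib.request.Request("http://httpbin.org/headers")
req.add_header("User-Agent", "Mozilla/5.0")
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))  # the response echoes back our headers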

URL manager: downloading pages with urllib.request, adding handlers for special scenarios

urllib.request.HTTPCookieProcessor: log in with cookies
urllib.request.ProxyHandler: go through a proxy (see the sketch after this list)
urllib.request.HTTPSHandler: access HTTPS-encrypted pages
urllib.request.HTTPRedirectHandler: handle automatic URL redirects
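A minimal sketch of ProxyHandler combined with HTTPSHandler; the proxy address below is a placeholder assumption, not a working proxy:

import urllib.request

# Route http traffic through a local proxy (placeholder address)
proxy = urllib.request.ProxyHandler({"http": "http://127.0.0.1:8080"})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPSHandler())
urllib.request.install_opener(opener)
# Subsequent urlopen calls now go through the proxy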

# For example: accessing a page with cookies
import urllib.request, http.cookiejar
from bs4 import BeautifulSoup

# Create a cookie container
cj = http.cookiejar.CookieJar()

# Create an opener
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

# Install the opener on urllib
urllib.request.install_opener(opener)

# Access the page with the cookie-aware urllib
# response = urllib.request.urlopen("http://www.baidu.com")

url = "http://www.baidu.com"
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())

The three approaches side by side: a plain urlopen, a Request with a custom header, and a cookie-aware opener.

# _*_ coding: utf-8 _*_

import urllib.request
import http.cookiejar

url="http://www.baidu.com"

print ("测试第一种方法")
response1=urllib.request.urlopen(url)
print (response1.getcode())
print (len(response1.read()))

print ("测试第二种方法")
req2=urllib.request.Request(url)
req2.add_header("User-Agent","Mozilla/5.0")
response2=urllib.request.urlopen(req2)
print (response2.getcode())
print (len(response2.read()))

print ("测试第三种方法")
#创建cookie容器
cj=http.cookiejar.CookieJar()
#创建1个opener
opener=urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
#给urllib安装opener
urllib.request.install_opener(opener)
#使用带有cookie的urllib访问网页
response3=urllib.request.urlopen(url)
print (response3.getcode())
print (len(response3.read()))
print (cj) #打印cookie的内容
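If the cookies should persist between runs, http.cookiejar also provides a file-backed jar; a minimal sketch (the filename cookies.txt is an assumption):

import urllib.request
import http.cookiejar

# MozillaCookieJar can save cookies to a Netscape-format file and reload them later
cj = http.cookiejar.MozillaCookieJar("cookies.txt")  # arbitrary filename
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)

urllib.request.urlopen("http://www.baidu.com")
cj.save(ignore_discard=True, ignore_expires=True)  # write the cookies to disk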

HTML parsers

An HTML parser is a tool for extracting valuable data from web pages.
Parsing approaches: regular expressions, html.parser, BeautifulSoup, lxml (a parser-choice sketch follows).
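The parser is chosen in the BeautifulSoup constructor; a quick sketch comparing the built-in parser with lxml (lxml must be installed separately, e.g. pip install lxml):

from bs4 import BeautifulSoup

html = "<html><body><h1>Hi</h1></body></html>"

# html.parser ships with the standard library; lxml is a faster third-party parser
soup_builtin = BeautifulSoup(html, "html.parser")
soup_lxml = BeautifulSoup(html, "lxml")
print(soup_builtin.h1.get_text(), soup_lxml.h1.get_text())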
