Python Web Scraping in Practice

       Web scraping is one of the most important application areas of Python: with a crawler, users can collect data from websites according to their own needs.

       By the content they fetch, crawlers can be divided into two categories: general-purpose crawlers and focused crawlers. A general-purpose crawler only needs to obtain the page itself; the most typical application is Baidu's cached snapshots (百度快照). A focused crawler extracts specific page elements, parses them, and stores the results for later analysis.

       A crawler mainly has to solve the following problems:

       1. How to fetch pages?

       Crawling relies on three basic building blocks: a uniform URL, HTML, and HTTP or HTTPS requests. The crawling process is therefore: take a URL, send a GET or POST request over HTTP/HTTPS, and receive the page back as HTML, a uniform data representation.

import random
import urllib2
import urllib

# Simulate different browsers by filling the User-Agent request header
ua_list=[
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
    "Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11",
    "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)"
]

# Pick a user agent at random
user_agent=random.choice(ua_list)

# Alternative: a fixed header dict that can be passed directly to Request (see the commented-out line below)
ua_headers={
    "User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
}
url="http://www.baidu.com"
#request=urllib2.Request("http://www.baidu.com",headers=ua_headers)

# Build the request
request=urllib2.Request(url)
request.add_header("User-Agent",user_agent)

# urllib2 capitalizes header keys, so the key stored by add_header() is "User-agent"
print request.get_header("User-agent")

response=urllib2.urlopen(request)
html=response.read()

print response.getcode()   # HTTP status code
print response.geturl()    # the URL that was actually fetched (after any redirect)
print response.info()      # response headers

     2. How to handle cookies for sites that require login?

     The simplest approach is to copy the cookie from the logged-in browser page into the request headers. The drawback is that every crawl requires logging in through the browser first to generate the cookie.

# -*- coding:utf-8 -*-

import urllib2

url = "http://www.renren.com/410043129/profile"

headers = {
    "Host" : "www.renren.com",
    "Connection" : "keep-alive",
    #"Upgrade-Insecure-Requests" : "1",
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer" : "http://www.renren.com/SysHome.do",
    #"Accept-Encoding" : "gzip, deflate, sdch",
    # Cookie copied from the logged-in browser session
    "Cookie" : "anonymid=ixrna3fysufnwv; _r01_=1; depovince=GW; jebe_key=f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1484400895379; jebe_key=f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1484400890914; JSESSIONID=abcX8s_OqSGsYeRg5vHMv; jebecookies=0c5f9b0d-03d8-4e6a-b7a9-3845d04a9870|||||; ick_login=8a429d6c-78b4-4e79-8fd5-33323cd9e2bc; _de=BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5; p=0cedb18d0982741d12ffc9a0d93670e09; ap=327550029; first_login_flag=1; [email protected]; ln_hurl=http://hdn.xnimg.cn/photos/hdn521/20140529/1055/h_main_9A3Z_e0c300019f6a195a.jpg; t=56c0c522b5b068fdee708aeb1056ee819; societyguester=56c0c522b5b068fdee708aeb1056ee819; id=327550029; xnsid=5ea75bd6; loginfrom=syshome",
    "Accept-Language" : "zh-CN,zh;q=0.8,en;q=0.6",
}

request = urllib2.Request(url, headers = headers)
response = urllib2.urlopen(request)

print response.read()

       Another approach is to use the cookielib module:

import urllib
import urllib2
import cookielib

# Build a CookieJar object to store cookie values
cookie = cookielib.CookieJar()

# Build an HTTPCookieProcessor handler to manage cookies;
# the argument is the CookieJar object built above
cookie_handler = urllib2.HTTPCookieProcessor(cookie)

# Build a custom opener
opener = urllib2.build_opener(cookie_handler)

# HTTP headers can be added through the opener's addheaders attribute
opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")]

# Login endpoint of renren.com
url = "http://www.renren.com/PLogin.do"

# Account and password used to log in
data = {"email":"[email protected]", "password":"alarmchime"}

# Encode the form data with urlencode()
data = urllib.urlencode(data)

# The first request is a POST that sends the login parameters and obtains the cookie
request = urllib2.Request(url, data = data)

# Send the first POST request; if the login succeeds, the logged-in cookie is generated
response = opener.open(request)

#print response.read()

# The second request can be a GET; the saved cookie is sent along with it and the server validates it
response_deng = opener.open("http://www.renren.com/410043129/profile")

# Fetch a page that is only accessible after login
print response_deng.read()

    3. How to parse the crawled data?

     Python provides a rich set of libraries for parsing: re, xpath, bs4 (BeautifulSoup4), and pyquery.

# re: match the div elements whose class is "f18 mb20" in the html string
pattern = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)
content_list = pattern.findall(html)

# xpath: match the root-node list of all teachers
# (response is assumed to be an object with an xpath() method, e.g. a Scrapy response or an lxml element)
teacher_list = response.xpath('//div[@class="li_txt"]')
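      The snippets above only sketch re and xpath. Below is a minimal, illustrative sketch of bs4 and pyquery on a made-up HTML fragment; the class name li_txt and the text content are placeholders, not part of the original example.

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Illustrative HTML fragment; in a real crawler this would be the html string
# returned by response.read()
html_doc = '<div class="li_txt"><h3>Teacher A</h3><p>Lecturer</p></div>'

# bs4: parse the document and select nodes by tag name and class
soup = BeautifulSoup(html_doc, "html.parser")
for node in soup.find_all("div", class_="li_txt"):
    print node.get_text()

# pyquery: jQuery-style CSS selectors
doc = pq(html_doc)
print doc("div.li_txt").text()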

      4. How to crawl HTTPS sites?

      Certificate verification can be skipped via the ssl module:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib2
import ssl

# Ignore SSL certificate verification
context = ssl._create_unverified_context()

url = "https://www.12306.cn/mormhweb/"
#url = "https://www.baidu.com/"

headers = {
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
    }
request = urllib2.Request(url, headers = headers)

# Pass the unverified context via the context parameter
response = urllib2.urlopen(request, context = context)

print response.read()

       5. How to get data from an AJAX POST request?

import urllib
import urllib2

url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action"

headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# Store the request parameters in a dict
formdata = {
        "start":"0",
        "limit":"20"
    }
# URL-encode the form data (this is encoding, not encryption)
data = urllib.urlencode(formdata)

request = urllib2.Request(url, data = data, headers = headers)

print urllib2.urlopen(request).read()
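
       The douban endpoint above returns JSON rather than HTML. Below is a minimal sketch of decoding it with the standard json module; the "title" and "score" field names are assumptions about the response format, not something guaranteed by the original code.

import json

# Re-send the request and decode the body, assumed to be a JSON array of movie dicts
movies = json.loads(urllib2.urlopen(request).read())

for movie in movies:
    # "title" and "score" are assumed field names in the douban response
    print movie.get("title"), movie.get("score")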

  [Summary]

     The above covers the basic techniques of Python web scraping. The Python version used is 2.7. The full code can be found on GitHub: https://github.com/whpHarper/pychon.git

 
