1. Requests

Reference: http://www.python-requests.org/en/master/user/quickstart/#make-a-request

Requests is a very practical Python HTTP client library, used constantly when writing crawlers and testing server responses. Requests fully meets the needs of today's web.

It is normally installed with pip install requests.

In [1]: import requests
In [2]: response=requests.get('https://api.github.com/events')
In [3]: print(response)
<Response [200]>

In [4]: response=requests.post('http://httpbin.org/post',data={'key1':'values1'})         # used when submitting form data
In [5]: print(response)
<Response [200]>

In [7]: response=requests.put('http://httpbin.org/put',data={'key1':'values1'})
In [8]: print(response)
<Response [200]>

In [10]: response=requests.delete('http://httpbin.org/delete')
In [11]: print(response)
<Response [200]>

In [13]: response=requests.head('http://httpbin.org/get')
In [14]: print(response)
<Response [200]>

In [15]: response=requests.options('http://httpbin.org/get')
In [16]: print(response)
<Response [200]>

In [17]: payload={'key1':'value1','key2':'value2'}
In [18]: response=requests.get('http://httpbin.org/get',params=payload)   # send a GET request carrying query parameters
In [19]: print(response)
<Response [200]>

In [20]: print(response.text)
{
  "args": {
    "key1": "value1",
    "key2": "value2"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "103.215.2.233",
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}
In [22]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2
In [23]: payload={'key1':'value1','key2':['value2','value3']}
In [24]: response=requests.get('http://httpbin.org/get',params=payload)
In [25]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3
In [27]: response=requests.get('http://api.github.com/events')
In [28]: response.encoding              # character-set encoding of the response
Out[28]: 'utf-8'
In [29]: print(response.text)  # response body decoded as text
[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":......
In [30]: print(response.content)        # response body as raw bytes
b'[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":".....
In [34]: response.json()
In [36]: response.status_code           # HTTP status code of the response
Out[36]: 200
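
A related convenience, in case only failures matter: Response.raise_for_status() raises requests.exceptions.HTTPError for 4xx/5xx responses, so there is no need to compare status_code by hand. A minimal sketch:

import requests

response = requests.get('https://api.github.com/events')
response.raise_for_status()   # raises requests.exceptions.HTTPError on 4xx/5xx
print(response.status_code)   # only reached on success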
In [38]: headers={ 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.
    ...: 0.3202.75 Safari/537.36','Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    ...: ,'Accept-Encoding':'gzip, deflate, br','Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8','Connection':'keep-alive'}
In [39]: response=requests.get('https://api.github.com/events',headers=headers)
In [40]: print(response.headers)
{'Server': 'GitHub.com', 'Date': 'Tue, 14 Nov 2017 06:10:31 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Status': '200 OK', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '58', 'X-RateLimit-Reset': '1510642339', 'Cache-Control': 'public, max-age=60, s-maxage=60', 'Vary': 'Accept', 'ETag': 'W/"34b51a08c5a8f4fa2400dd5c0d89221b"', 'Last-Modified': 'Tue, 14 Nov 2017 06:10:31 GMT', 'X-Poll-Interval': '60', 'X-GitHub-Media-Type': 'unknown, github.v3', 'Link': '; rel="next", ; rel="last"', 'Access-Control-Expose-Headers': 'ETag, Link, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval', 'Access-Control-Allow-Origin': '*', 'Content-Security-Policy': "default-src 'none'", 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'X-Runtime-rack': '0.104190', 'Content-Encoding': 'gzip', 'X-GitHub-Request-Id': 'D528:C0F5:6BAAA:E4CB6:5A0A88D6'}
In [43]: print(response.headers['Content-Type'])
application/json; charset=utf-8
In [44]: print(response.headers.get('Content-Type'))
application/json; charset=utf-8
In [45]: url='http://www.baidu.com'
In [46]: response=requests.get(url,headers=headers)           # baidu sets cookies on the response; some sites set none
In [47]: print(response.cookies)                              # print the whole cookie jar
<RequestsCookieJar[<Cookie H_PS_PSSID=1425_21088_24880 for .baidu.com/>, <Cookie BDSVRTM=0 for www.baidu.com/>, <Cookie BD_HOME=0 for www.baidu.com/>]>
In [48]: for k,v in response.cookies.get_dict().items():      # iterate over the cookies
    ...:     print(k,v)
    ...:
H_PS_PSSID 1425_21088_24880
BDSVRTM 0
BD_HOME 0
In [49]: cookies={'c1':'v1','c2':'v2'}
In [50]: response=requests.get('http://httpbin.org/cookies',cookies=cookies)  # send a request carrying cookies
In [52]: print(response.text)
{
  "cookies": {
    "c1": "v1",
    "c2": "v2"
  }
}
In [53]: jar = requests.cookies.RequestsCookieJar()
In [54]: jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
Out[54]: Cookie(version=0, name='tasty_cookie', value='yum', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/cookies', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
In [55]: jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
Out[55]: Cookie(version=0, name='gross_cookie', value='blech', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/elsewhere', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
In [56]: url = 'http://httpbin.org/cookies'
In [57]: response = requests.get(url, cookies=jar)
In [58]: print(response.text)
{
  "cookies": {
    "tasty_cookie": "yum"
  }
}

Cookies are returned in a RequestsCookieJar, which acts like a dict but also offers a more complete interface, suitable for use over multiple domains or paths. Cookie jars can also be passed in to Requests calls, as the example above shows.
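
Cookie jars pair naturally with Session objects, which persist cookies across requests automatically. A minimal sketch against httpbin (its /cookies/set endpoint stores a cookie and redirects to /cookies):

import requests

sess = requests.Session()
sess.get('http://httpbin.org/cookies/set?session_cookie=abc')   # the server sets a cookie
response = sess.get('http://httpbin.org/cookies')               # the session sends it back automatically
print(response.text)   # {"cookies": {"session_cookie": "abc"}}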

In [62]: url='http://github.com'
In [64]: response=requests.get(url,allow_redirects=True)
In [65]: print(response.url)
https://github.com/
In [66]: response.history               # the redirect chain that led here
Out[66]: [<Response [301]>]
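
The flip side is allow_redirects=False, which returns the redirect response itself instead of following it. A quick sketch (assuming GitHub still answers plain http:// with a 301):

import requests

response = requests.get('http://github.com', allow_redirects=False)
print(response.status_code)   # 301: the redirect itself
print(response.history)       # []  nothing was followed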
In [69]: url = 'http://httpbin.org/post'
In [70]: files = {'file': open('test.txt', 'rb')}
In [71]: response=requests.post(url,files=files)                 # upload a file with the POST
In [72]: response.text
Out[72]: '...the contents of the file...'
In [73]: response=requests.get('https://github.com', timeout=5)   # give up if no response arrives within 5 seconds
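
When the server fails to answer within the timeout, Requests raises an exception instead of blocking forever; catching it looks like this (the absurdly short timeout is only there to force the error):

import requests

try:
    response = requests.get('https://github.com', timeout=0.001)   # deliberately too short
except requests.exceptions.Timeout:
    print('request timed out')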


import json
import requests
from io import BytesIO
from PIL import Image

#1 Images

r=requests.get('http://img.jrjimg.cn/2013/11/20131105065502114.jpg')
image=Image.open(BytesIO(r.content))  # build an Image object from the binary content
image.save('mm.jpg')

#2 JSON

r=requests.get('https://github.com/timeline.json')
print(type(r.json()))   # json() (note the call parentheses) parses the body into Python objects
print(r.json())
print(r.text)

#3 Raw data

r=requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1508166336374&di=ef1073a52a7582f29ffa27c47e95e74e&imgtype=0&src=http%3A%2F%2Fp3.gexing.com%2FG1%2FM00%2F3F%2FDD%2FrBACE1MaezngiEoIAADSr3bccSw151.jpg')
with open('mm2.jpg','wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)
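
Note that without stream=True the whole body is downloaded up front and iter_content merely re-chunks it from memory; for genuinely large files, streaming avoids that. A sketch with a hypothetical URL:

r=requests.get('http://example.com/big.bin',stream=True)   # hypothetical large file; body is fetched lazily
with open('big.bin','wb') as f:
    for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)
r.close()   # release the connection once done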

#4 Forms

form={'username':'user','password':'pwd'}
r=requests.post('http://httpbin.org/post',data=form)              # form-encoded: httpbin echoes it under "form"
print(r.text)
r=requests.post('http://httpbin.org/post',data=json.dumps(form))  # raw string body: echoed under "data"
print(r.text)
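
Requests (since 2.4.2) also accepts a json= keyword that serializes the dict and sets the Content-Type header in one step:

r=requests.post('http://httpbin.org/post',json=form)   # body sent as application/json automatically
print(r.json()['json'])   # httpbin echoes the decoded JSON body back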

2. Scraping the Douban movie Top 250 list and ratings with Requests


The scraping code is as follows:

import requests
from lxml import etree
sess = requests.Session()
headers={ 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36','Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8','Accept-Encoding':'gzip, deflate, br','Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8','Connection':'keep-alive'}
for id in range(0, 250, 25):      # the list is paginated 25 titles per page, selected by the start parameter
    url = 'https://movie.douban.com/top250/?start=' + str(id)
    r = sess.get(url,headers=headers)
    r.encoding = 'utf-8'
    #fname="movie"+str(id)+".txt"
    #with open(fname,"wb+") as f:
    #    f.write(r.content)
    root = etree.HTML(r.content)  # parse the HTML document with the lxml parser
    items = root.xpath('//ol/li/div[@class="item"]')
    for item in items:
        title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        name = title[0].encode('gb2312', 'ignore').decode('gb2312')   # drop characters a gb2312 console cannot display
        rating = item.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
        print(name, rating)
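
The same extraction can also be written with BeautifulSoup, introduced in the next section; a sketch assuming the page structure matches the XPath expressions above:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}   # Douban rejects requests without a browser-like User-Agent
r = requests.get('https://movie.douban.com/top250/?start=0', headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
for item in soup.select('ol li div.item'):
    name = item.select_one('div.info a span.title').get_text()
    rating = item.select_one('div.bd div.star span.rating_num').get_text()
    print(name, rating)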

3. BeautifulSoup

The BeautifulSoup module takes an HTML/XML string and parses it into a tree, after which its methods make locating specific elements in the document straightforward. Beautiful Soup supports the HTML parser in the Python standard library as well as several third-party parsers; if no third-party parser is installed, the default one is used. Common parsers are lxml, html5lib and html.parser; lxml is the most capable and the fastest, so installing it is recommended.
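
The parser is chosen with the second argument to the BeautifulSoup constructor; a quick sketch of the two most common choices:

from bs4 import BeautifulSoup

markup = '<html><body><p>hello</p></body></html>'
soup_lxml = BeautifulSoup(markup, 'lxml')          # fastest and most lenient; needs pip install lxml
soup_std = BeautifulSoup(markup, 'html.parser')    # standard library, no extra install
print(soup_lxml.p.string, soup_std.p.string)       # hello hello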

from bs4 import BeautifulSoup
soup=BeautifulSoup(open('test.html'),'lxml')  # parse a local file; naming the parser avoids the "no parser specified" warning
print(soup.prettify())  # pretty-print the parse tree

#1 Tags

print(type(soup.title))
print(soup.title)
print(soup.title.name)

#2 String

print(type(soup.title.string))
print(soup.title.string)

#3 Comment

print(type(soup.a.string))
print(soup.a.string)
for item in soup.body.contents:
    print(item.name)

#4 CSS query

print(soup.select('.sister'))
print(soup.select('#link1'))
print(soup.select('head > title'))
a_s=soup.select('a')
for a in a_s:
    print(a)
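
Besides select(), the other everyday lookup API is find()/find_all(), which filters by tag name and attributes; a short sketch reusing the soup object from above (assuming test.html contains the sister links the selectors imply):

for a in soup.find_all('a', class_='sister'):   # every <a class="sister">
    print(a.get('href'), a.get_text())
print(soup.find('a', id='link1'))               # first match only, or None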

Example:

from bs4 import BeautifulSoup
html_doc = """
The Dormouse's story

asdf
    
        The Dormouse's story总共
        

f

    
Once upon a time there were three little sisters; and their names were     Elsfie,     Lacie and     Tillie; and they lived at the bottom of a well.