Python Web Scraping - 3. Third-Party Libraries - 1. Getting Started with requests

1. Basic requests
     The requests module provides a function for each of the common HTTP request methods:
>>> import requests
>>> r = requests.get("http://httpbin.org/get")
>>> r = requests.post("http://httpbin.org/post", data={"key":"value"})
>>> r = requests.put("http://httpbin.org/put", data={"key":"value"})
>>> r = requests.delete("http://httpbin.org/delete")
>>> r = requests.head("http://httpbin.org/get")
>>> r = requests.options("http://httpbin.org/get")

2. GET requests
      (1) Adding query-string parameters with params
>>> payload = {"key1":"value1", "key2":"value2"}
>>> r = requests.get("http://httpbin.org/get", params = payload)
>>> r.url
'http://httpbin.org/get?key2=value2&key1=value1'
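Two details of params worth knowing (documented requests behavior): a key whose value is None is omitted from the query string, and a list value produces a repeated parameter:
>>> payload = {"key1": "value1", "key2": ["a", "b"], "key3": None}
>>> r = requests.get("http://httpbin.org/get", params=payload)
>>> r.url
'http://httpbin.org/get?key1=value1&key2=a&key2=b'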
      (2) Getting the page content as text
>>> r = requests.get("https://api.github.com/events")
>>> r.text
'[{"repository":{"open_issues":0,"url":"https://github.com/...
     (3) Checking the response encoding
>>> r.encoding
'utf-8'
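r.encoding is writable: if requests guesses the charset wrong (for text responses without an explicit charset it falls back to ISO-8859-1), you can override it before reading r.text. A minimal sketch:
>>> r.encoding = "gbk"                  # force a specific charset for r.text
>>> r.encoding = r.apparent_encoding    # or use requests' guess based on the body content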
     (4) Getting the page content as bytes
>>> r = requests.get("https://api.github.com/events")
>>> r.content
b'[{"repository":{"open_issues":0,"url":"https://github.com/...
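r.content is what you want when saving a binary resource to disk; a minimal sketch using httpbin's sample image endpoint:
>>> r = requests.get("http://httpbin.org/image/png")
>>> with open("image.png", "wb") as f:
...     f.write(r.content)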
     (5) Decoding JSON response data
>>> r = requests.get("https://api.github.com/events")
>>> r.json()
[{u"repository":{u"open_issues":0,u"url":"https://github.com/...
     (6) Getting the raw socket response (requires passing stream=True)
>>> r = requests.get("https://github.com/timeline.json", stream=True)
>>> r.raw
<urllib3.response.HTTPResponse object at 0x...>
>>> r.raw.read(10)
b'{"message"'

3. POST requests
      (1) Sending form data with the data parameter
>>> payload = {"key1":"value1", "key2":"value2"}
>>> r = requests.post("http://httpbin.org/post", data = payload)
>>> print(r.text)
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "key1": "value1",
    "key2": "value2"
  },
  ...
  "url": "http://httpbin.org/post"
}
      (2) Sending JSON data in a POST request
>>> import json
>>> import requests
>>> url = "http://httpbin.org/post"
>>> payload = {"key1": "value1", "key2": "value2"}
>>> r = requests.post(url, data=json.dumps(payload))
>>> r = requests.post(url, json=payload)     # equivalent to the line above; requests serializes payload itself and also sets the Content-Type: application/json header
>>> print(r.text)
{
  "args": {},
  "data": "{\"key2\": \"value2\", \"key1\": \"value1\"}",
  "files": {},
  "form": {},
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Content-Length": "36",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.10.0"
  },
  "json": {
    "key1": "value1",
    "key2": "value2"
  },
  "origin": "***.***.***.***",
  "url": "http://httpbin.org/post"
}
     (3) Uploading a file with the files parameter
>>> url = "http://httpbin.org/post"                 
>>> files = {"file": open("test.txt", "rb")}
>>> r = requests.post(url, files=files)
>>> print(r.text)
{
  "args": {},
  "data": "",
  "files": {
    "file": "hello world!\n"
  },
  "form": {},
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Content-Length": "157",
    "Content-Type": "multipart/form-data; boundary=5a8b6be7cb4f4849ba3e2ab895613061",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.10.0"
  },
  "json": null,
  "origin": "124.205.21.146",
  "url": "http://httpbin.org/post"
}
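The value in the files dict can also be a tuple, letting you set the uploaded filename and content type explicitly (a sketch, reusing test.txt from above):
>>> files = {"file": ("test.txt", open("test.txt", "rb"), "text/plain")}
>>> r = requests.post(url, files=files)
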
4. Response status codes and timeouts
      (1) Checking the response status code
>>> r = requests.get("http://httpbin.org/get")
>>> r.status_code
200
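If you just want to fail on error responses, Response.raise_for_status() raises requests.exceptions.HTTPError for any 4xx/5xx status and does nothing on success:
>>> r = requests.get("http://httpbin.org/status/404")
>>> r.raise_for_status()
Traceback (most recent call last):
  ...
requests.exceptions.HTTPError: 404 Client Error: ...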
     (2) Setting a connection timeout
>>> r = requests.get("http://www.baidu.com", timeout=0.001)
5. Headers
      (1) Customizing the request headers
>>> url = "http://httpbin.org/headers"
>>> headers = {"user-agent": "my-app/0.0.1", "Referer": url}   
>>> r = requests.get(url, headers = headers)
>>> print(r.text)
{
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Host": "httpbin.org",
    "Referer": "http://httpbin.org/headers",
    "User-Agent": "my-app/0.0.1"
  }
}
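If many requests share the same headers, a requests.Session lets you set them once and reuse them (a sketch):
>>> s = requests.Session()
>>> s.headers.update({"User-Agent": "my-app/0.0.1"})
>>> r = s.get("http://httpbin.org/headers")    # sent with the session's User-Agent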
     (2) Getting the response headers
>>> r.headers
{'Content-Length': '193', 'Server': 'nginx', 'Connection': 'keep-alive', 'Access-Control-Allow-Credentials': 'true', 'Date': 'Thu, 12 May 2016 02:31:50 GMT', 'Access-Control-Allow-Origin': '*', 'Content-Type': 'application/json'}
6. Cookies
     (1) Setting cookies in the request headers
>>> headers = {"User-Agent": "my-app/0.0.1", "Cookie": "LYB2cSess=6b41c7cc8ccf46c; domain-filter-bypass=lol"}
>>> url = "http://httpbin.org/headers"
>>> r = requests.get(url, headers=headers)
The value of the Cookie header takes the form "key1=value1; key2=value2; key3=value3", as shown below.
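If your cookies start out as a dict, that header value is a one-liner to build:
>>> cookies = {"key1": "value1", "key2": "value2"}
>>> "; ".join("%s=%s" % (k, v) for k, v in cookies.items())
'key1=value1; key2=value2'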
     (2) Passing a dict to the cookies parameter
>>> url = "http://httpbin.org/cookies"
>>> cookies = dict(cookies_are="working")
>>> r = requests.get(url, cookies=cookies)
>>> print(r.text)
{
  "cookies": {
    "cookies_are": "working"
  }
}
     (3) Getting the cookies returned by the server
>>> url = "http://www.baidu.com"
>>> r = requests.get(url)
>>> r.cookies
<[Cookie(version=0, name='BAIDUID', value='4FB56FB9A82B5A28CB0EE6034285F7E4:FG=1', port=None, port_specified=False, domain='.baidu.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=3610505635, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), ......]>
>>> r.cookies["BAIDUID"]
'4FB56FB9A82B5A28CB0EE6034285F7E4:FG=1'
r.cookies is a requests.cookies.RequestsCookieJar object; it behaves much like a dict, and you can pass it to dict() to inspect its contents, as in the sketch below.
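>>> dict(r.cookies)["BAIDUID"]
'4FB56FB9A82B5A28CB0EE6034285F7E4:FG=1'
A requests.Session also keeps server-set cookies and sends them back on later requests automatically (a sketch using httpbin's cookie-setting endpoint):
>>> s = requests.Session()
>>> r = s.get("http://httpbin.org/cookies/set/sessioncookie/123456")
>>> print(s.get("http://httpbin.org/cookies").text)
{
  "cookies": {
    "sessioncookie": "123456"
  }
}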
