urllib is a Python standard library for handling network requests; it contains four modules (request, error, parse, robotparser).
The request module is mainly responsible for constructing and sending network requests, and for adding things such as Headers and Proxy settings.
The request.urlopen method
returns a response object.
urllib.request.urlopen(
    url,
    data=None,
    [timeout, ]*,
    cafile=None,
    capath=None,
    cadefault=False,
    context=None
)
from urllib import request
from urllib import error

# 1. A request that exceeds the timeout raises an error (URLError: timed out)
try:
    request.urlopen('http://httpbin.org/get', timeout=0.1)
except error.URLError as e:
    print(e.reason)
# 2. Send a GET request
response = request.urlopen(url='http://httpbin.org/get')
# 3. Send a POST request (passing data turns the request into a POST)
response2 = request.urlopen(
    url='http://httpbin.org/post',
    data=b'username=Xyb&password=123456'
)
The Response object
returned by the urlopen call above.
# 1. getcode() returns the HTTP status code
print(response.getcode())
--> 200
# 2. info() returns the response headers
print(response.info())
--> Access-Control-Allow-Credentials: true
Access-Control-Allow-Origin: *
Content-Type: application/json
Date: Mon, 03 Jun 2019 05:13:13 GMT
Referrer-Policy: no-referrer-when-downgrade
Server: nginx
X-Content-Type-Options: nosniff
X-Frame-Options: DENY
X-XSS-Protection: 1; mode=block
Content-Length: 222
Connection: Close
# 3. read() returns the response body as bytes; it can only be read once — the data is consumed as it is read, so a second read() returns b''
print(response.read())
--> b'{\n "args": {}, \n "headers": {\n "Accept-Encoding": "identity", \n "Host": "httpbin.org", \n
"User-Agent": "Python-urllib/3.6"\n }, \n "origin": "192.246.58.12, 192.246.58.12", \n "url":
"https://httpbin.org/get"\n}\n'
# 4. geturl() returns the URL that was actually fetched
print(response.geturl())
--> http://httpbin.org/get
# 5. readline() reads a single line of the body
print(response.readline())
--> b'{\n'
The Request object
Used to build a custom request; the first method above (calling urlopen with just a URL) cannot do this.
Created via request.Request.
Adding request headers
Requests sent through urllib carry a default header 'User-Agent': 'Python-urllib/3.6', which announces that the request was sent by urllib. Sites that validate 'User-Agent' will block such requests, so we need to define our own Headers to disguise the client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E5%9B%BE%E7%89%87'
}  # Build the request headers (User-Agent identifies the client; Referer is the referring page, which defeats image hotlink protection and its 403 errors)
req = request.Request(
    url='http://img4.imgtn.bdimg.com/it/u=2153937626,1074119156&fm=26&gp=0.jpg',
    headers=headers
)  # Build the Request object
response = request.urlopen(req)
with open(r'text.jpg', 'wb') as f:
    f.write(response.read())
Setting a proxy
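The notes stop at the heading here, so below is a minimal sketch using urllib's own ProxyHandler with build_opener (the proxy address is the placeholder reused from the urllib3 example later on, and is likely dead):
from urllib import request

# Route HTTP traffic through a proxy (placeholder address; substitute a live proxy)
proxy_handler = request.ProxyHandler({'http': 'http://180.76.111.69:3128'})
opener = request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/ip')
print(response.read().decode('utf-8'))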
from urllib import parse
parse.quote()
A method that percent-encodes a string, solving the problem of passing Chinese characters as URL parameters.
url = 'http://httpbin.org/get?username={}'.format(parse.quote('邢益斌'))
print(url)  # http://httpbin.org/get?username=%E9%82%A2%E7%9B%8A%E6%96%8C
request.urlopen(url)
parse.unquote()
The inverse method: decodes a percent-encoded URL back into readable text.
print(parse.unquote('http://httpbin.org/get?username=%E9%82%A2%E7%9B%8A%E6%96%8C'))
--> http://httpbin.org/get?username=邢益斌
parse.urlencode()
Encodes a whole dict into a query string in one call.
args = {
    'username': '邢邢邢',
    'password': '123456'
}
url_args = parse.urlencode(args)
url = 'http://httpbin.org/get?{}'.format(url_args)
print(url)
--> http://httpbin.org/get?username=%E9%82%A2%E9%82%A2%E9%82%A2&password=123456
print(url_args)
--> username=%E9%82%A2%E9%82%A2%E9%82%A2&password=123456
parse.parse_qs()
The inverse of urlencode: parses a query string back into a dict (each value comes back as a list).
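A quick sketch using the query string produced above:
from urllib import parse

qs = 'username=%E9%82%A2%E9%82%A2%E9%82%A2&password=123456'
print(parse.parse_qs(qs))
--> {'username': ['邢邢邢'], 'password': ['123456']}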
from urllib import error
try:
    request.urlopen('https://www.jianshu.com')  # jianshu rejects the default urllib User-Agent, so this raises HTTPError
except error.HTTPError as e:
    print(e.code)     # error status code
    print(e.reason)   # reason for the error
    print(e.headers)  # headers of the error response
Example: https://www.taobao.com/robots.txt
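The URL above points at a robots.txt file; urllib's robotparser module can check such rules. A minimal sketch (the user agent and path are illustrative):
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://www.taobao.com/robots.txt')
rp.read()
# Check whether a given user agent is allowed to fetch a given path
print(rp.can_fetch('Baiduspider', 'https://www.taobao.com/article'))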
Install: pip install urllib3
urllib3 is a powerful, user-friendly HTTP client for Python 3.
More and more Python applications are adopting urllib3; it provides many important features that are missing from the standard library.
Advantages: thread safety, connection pooling (sockets are kept alive and reused), 100% test coverage, and more.
Import the urllib3 library
import urllib3
Instantiate a PoolManager object to construct requests; this object handles all the details of connection pooling and thread safety.
http = urllib3.PoolManager()
Send a request with the request() method
req = http.request('GET', 'http://httpbin.org/robots.txt')
The request() method can send any type of request, e.g. a POST request:
req = http.request(
    'POST',
    'http://httpbin.org/post',
    fields={'hello': 'world'}  # POST parameters are passed through the fields argument
)
The response content is the same as with urllib.
import urllib3
import json
# Instantiate a connection pool
http = urllib3.PoolManager()
response = http.request('GET', 'http://httpbin.org/get')
print(response.status)   # print the status code
print(response.headers)  # print the response headers
print(json.loads(response.data.decode('utf-8')))
# First decode the bytes into a string, then parse the JSON into a Python dict
-->
200
HTTPHeaderDict({'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': '*', 'Content-Type': 'application/json', 'Date': 'Mon, 03 Jun 2019 08:50:23 GMT', 'Referrer-Policy': 'no-referrer-when-downgrade', 'Server': 'nginx', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'DENY', 'X-XSS-Protection': '1; mode=block', 'Content-Length': '182', 'Connection': 'keep-alive'})
{'args': {}, 'headers': {'Accept-Encoding': 'identity', 'Host': 'httpbin.org'}, 'origin': '183.246.28.65, 183.246.28.65', 'url':
'https://httpbin.org/get'}
The data can be converted into a Python dict via the json module.
import urllib3
import json
http = urllib3.PoolManager()
r = http.request('GET', 'http://httpbin.org/get')
print(json.loads(r.data.decode('utf-8')))
--> {'args': {}, 'headers': {'Accept-Encoding': 'identity', 'Host':
'httpbin.org'}, 'origin': '183.246.28.65, 183.246.28.65', 'url':
'https://httpbin.org/get'}
Processing large responses with a for loop
The response body is returned as bytes; use stream() to process it in chunks.
import urllib3
http = urllib3.PoolManager()
r = http.request('GET', 'http://httpbin.org/bytes/1024', preload_content=False)
for chunk in r.stream(32):
    print(chunk)
Or treat the response as a file object:
import urllib3
http = urllib3.PoolManager()
r = http.request('GET', 'http://httpbin.org/bytes/1024', preload_content=False)
for line in r:
    print(line)
IP proxy: the request is sent to the proxy's IP, and the proxy forwards it on your behalf, so the target server only sees the proxy's address instead of yours.
import urllib3
proxy = urllib3.ProxyManager('http://180.76.111.69:3128')
res = proxy.request('GET', 'http://httpbin.org/ip')
print(res.data)
The headers parameter of the request() method takes a dict.
import urllib3
http = urllib3.PoolManager()
r = http.request('GET', 'http://httpbin.org/headers', headers={'key': 'value'})
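To confirm the header actually went out, httpbin echoes the received headers back; a quick check (json is assumed to be imported, as in the earlier snippets):
print(json.loads(r.data.decode('utf-8'))['headers'])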
For GET requests, pass a dict through the fields parameter of request(); Chinese characters are URL-encoded automatically.
http = urllib3.PoolManager()
response = http.request(
    'GET',
    'http://httpbin.org/get',
    fields={'name': '邢邢邢', 'password': '密码密码'}
)
print(json.loads(response.data.decode('utf-8'))['args'])
--> {'name': '邢邢邢', 'password': '密码密码'}
POST and PUT requests cannot use the method above (for them, fields is sent as form data), so query-string arguments have to be encoded into the URL manually with urllib.parse's
parse.urlencode()
from urllib.parse import urlencode
http = urllib3.PoolManager()
encode_args = urlencode({'username': '满怀心'})
url = 'http://httpbin.org/post?' + encode_args
response = http.request('POST', url)
print(json.loads(response.data.decode('utf-8'))['args'])  # the query-string parameters show up under 'args'
Sending form data
http = urllib3.PoolManager()
response = http.request(
    'POST',
    'http://httpbin.org/post',
    fields={'username': 'xyb', 'password': '123456'}
)
print(json.loads(response.data.decode('utf-8'))['form'])
--> {'password': '123456', 'username': 'xyb'}
Sending JSON data
import urllib3
import json
http = urllib3.PoolManager()
data = {'username': 'xyb', 'password': '123456'}
encoded_data = json.dumps(data).encode('utf-8')
response = http.request(
    'POST',
    'http://httpbin.org/post',
    body=encoded_data,
    headers={'Content-Type': 'application/json'}
)
print(json.loads(response.data.decode('utf-8'))['json'])
--> {'password': '123456', 'username': 'xyb'}
Download all the images on the Baidu Images front page
import urllib3
import re
import json
page_url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E5%9B%BE%E7%89%87'
# Pick any image on the page and copy its URL
# The images are downloaded by the browser
# The image URLs are resources too, and they arrive earlier than the images themselves
# So we download the whole HTML, extract the image URLs, and download the images one by one
# Download the HTML
http = urllib3.PoolManager()
res = http.request('GET', page_url)
html = res.data.decode('utf-8')  # Ctrl+F the page source for 'charset' to confirm the encoding
# Extract the image URLs
img_urls = re.findall(r'"thumbURL":"(.*?)"', html)
# Build request headers so that the image requests are not rejected with a 403
headers = {
    'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E5%9B%BE%E7%89%87'
}
# Download each image
for index, img_url in enumerate(img_urls):
    img_res = http.request('GET', img_url, headers=headers)
    img_name = '{}.{}'.format(index, img_url.split('.')[-1])
    with open(r'img/{}'.format(img_name), 'wb') as f:
        f.write(img_res.data)
Baidu Images paginates further results through an AJAX endpoint that returns 30 results per request as JSON (the pn parameter below is filled in with the offset):
ajax_url = 'http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&word=%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&fr=&expermode=&force=&pn={}&rn=30&gsm=3c&1559557834013='
for i in range(1, 4):
    url = ajax_url.format(i * 30)
    print(url)
    res = http.request('GET', url)
    # Turn the returned JSON string into a dict
    data = json.loads(res.data.decode('utf-8'))
    img_info_list = data['data']
    for img in img_info_list:
        print(img['thumbURL'])
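The loop above only prints the thumbnail URLs. To actually save them, the print could be replaced with a download; a sketch (it reuses the anti-403 Referer headers from earlier, the file naming is arbitrary, and records without a thumbURL are skipped defensively):
for page in range(1, 4):
    res = http.request('GET', ajax_url.format(page * 30))
    data = json.loads(res.data.decode('utf-8'))
    for index, img in enumerate(data['data']):
        if not img.get('thumbURL'):  # skip records without a thumbnail URL
            continue
        img_res = http.request('GET', img['thumbURL'], headers=headers)
        with open(r'img/ajax_{}_{}.jpg'.format(page, index), 'wb') as f:
            f.write(img_res.data)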