import urllib.request
# Send a request
response = urllib.request.urlopen(url='http://python.org')
print(type(response))
# print(response.read().decode('utf-8'))
print(response.status)
print(response.getheaders())
<class 'http.client.HTTPResponse'>
200
[('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur'), ('Via', '1.1 varnish'), ('Content-Length', '48820'), ('Accept-Ranges', 'bytes'), ('Date', 'Sat, 23 Mar 2019 09:26:41 GMT'), ('Via', '1.1 varnish'), ('Age', '970'), ('Connection', 'close'), ('X-Served-By', 'cache-iad2140-IAD, cache-hnd18744-HND'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '2, 184'), ('X-Timer', 'S1553333202.865127,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
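A single response header can also be read by name with getheader(); a minimal sketch (the 'Server' header is just an example):
response = urllib.request.urlopen(url='http://python.org')
# Read one header by name; returns None if the header is absent.
print(response.getheader('Server'))   # e.g. nginx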
1.1 data
# Adding the data parameter turns the request into a POST request.
# data must first be converted to a bytes object.
data = bytes(urllib.parse.urlencode({'name':'tom','age':17}),encoding='utf-8')
response = urllib.request.urlopen(url='http://httpbin.org/post',data=data)
print(response.read().decode('utf-8'))
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "age": "17",
    "name": "tom"
  },
  "headers": {
    "Accept-Encoding": "identity",
    "Content-Length": "15",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "Python-urllib/3.7"
  },
  "json": null,
  "origin": "124.234.225.3, 124.234.225.3",
  "url": "https://httpbin.org/post"
}
1.2 timeout: sets a timeout for the request; if the time limit is exceeded, urllib.error.URLError is raised
#timeout
response = urllib.request.urlopen(url='http://httpbin.org/get',timeout=0.1)
print(response.read().decode('utf-8'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/404NoFound/Desktop/workhose/test/3_23urllib/urllib_t.py", line 19, in
response = urllib.request.urlopen(url='http://httpbin.org/get',timeout=0.1)
File "C:\Users\404NoFound\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
\request.py", line 1345, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\404NoFound\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1319, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error timed out>
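To handle the timeout instead of letting the traceback propagate, the URLError can be caught and its reason checked against socket.timeout; a minimal sketch:
import socket
import urllib.error
try:
    response = urllib.request.urlopen(url='http://httpbin.org/get',timeout=0.1)
except urllib.error.URLError as e:
    # When the socket times out, e.reason is a socket.timeout instance.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')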
# Request parameters
# Request lets you construct the request headers yourself.
# Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
data = bytes(urllib.parse.urlencode({'name':'tom','age':17}),encoding='utf-8')
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'
}
req = urllib.request.Request(url='http://httpbin.org/post',data=data,headers=headers,method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "age": "17",
    "name": "tom"
  },
  "headers": {
    "Accept-Encoding": "identity",
    "Content-Length": "15",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
  },
  "json": null,
  "origin": "124.234.225.3, 124.234.225.3",
  "url": "https://httpbin.org/post"
}
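Headers can also be attached to an existing Request with add_header() instead of passing a headers dict; a minimal sketch reusing the data and User-Agent from above:
req = urllib.request.Request(url='http://httpbin.org/post',data=data,method='POST')
# Add a single header to the already constructed request.
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36')
response = urllib.request.urlopen(req)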
1. Adding a proxy
# Advanced usage: adding a proxy, handling cookies
from urllib.request import ProxyHandler,build_opener
from urllib.error import URLError

proxy_handle = ProxyHandler({
    'http':'http://110.52.235.163:9999',
    'https':'https://110.52.235.163:9999',
})
opener = build_opener(proxy_handle)
try:
    response = opener.open('http://httpbin.org/get')
    print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)
{
  "args": {},
  "headers": {
    "Accept-Encoding": "identity",
    "Cache-Control": "max-age=259200",
    "Host": "httpbin.org",
    "User-Agent": "Python-urllib/3.7"
  },
  "origin": "110.52.235.163, 110.52.235.163",
  "url": "https://httpbin.org/get"
}
2. Cookies
# Saving cookies
import http.cookiejar,urllib.request
cookie = http.cookiejar.CookieJar()
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
for item in cookie:
    print(item.name+'='+item.value)
BAIDUID=78E646426124E1F7E5A8F76BC7DA9EFE:FG=1
BIDUPSID=78E646426124E1F7E5A8F76BC7DA9EFE
H_PS_PSSID=1469_21119_28721_28558_28697_28584_28604_20719
PSTM=1553336743
delPer=0
BDSVRTM=0
BD_HOME=0
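# Save cookies to a file in the Mozilla/Netscape cookies.txt format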
cookie = http.cookiejar.MozillaCookieJar('cookies.txt')
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
print(response.status)
cookie.save(ignore_discard=True,ignore_expires=True)
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.baidu.com TRUE / FALSE 3700820687 BAIDUID 1B02BEC8A89901648C1A184BBF5A9BB7:FG=1
.baidu.com TRUE / FALSE 3700820687 BIDUPSID 1B02BEC8A89901648C1A184BBF5A9BB7
.baidu.com TRUE / FALSE H_PS_PSSID 1425_21127_28722_28558_28697_28585_28641_26350_28603_28625_28606
.baidu.com TRUE / FALSE 3700820687 PSTM 1553337036
.baidu.com TRUE / FALSE delPer 0
www.baidu.com FALSE / FALSE BDSVRTM 0
www.baidu.com FALSE / FALSE BD_HOME 0
As we can see, ignore_discard means cookies are saved even if they are marked to be discarded, and ignore_expires means cookies are saved even if they have already expired; here both are set to True. After running, the cookies are saved to cookies.txt, whose contents are shown above.
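A file saved this way can be read back with MozillaCookieJar.load(); a minimal sketch, assuming cookies.txt was written as above:
import http.cookiejar,urllib.request
cookie = http.cookiejar.MozillaCookieJar()
# Load the saved cookies, keeping discarded and expired entries as well.
cookie.load('cookies.txt',ignore_discard=True,ignore_expires=True)
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
print(response.status)
Cookies can also be saved in the libwww-perl (LWP) format by swapping in LWPCookieJar: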
cookie = http.cookiejar.LWPCookieJar('cookies2.txt')
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
print(response.status)
cookie.save(ignore_discard=True,ignore_expires=True)
#LWP-Cookies-2.0
Set-Cookie3: BAIDUID="5BCF4AA8ED0DC6EF39DE705D5FBF8CFC:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-10 13:54:10Z"; version=0
Set-Cookie3: BIDUPSID=5BCF4AA8ED0DC6EF39DE705D5FBF8CFC; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-10 13:54:10Z"; version=0
Set-Cookie3: H_PS_PSSID=1465_21125_28721_28558_28697_28585_28518_28625_28606; path="/"; domain=".baidu.com"; path_spec; domain_dot; discard; version=0
Set-Cookie3: PSTM=1553337599; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-10 13:54:10Z"; version=0
Set-Cookie3: delPer=0; path="/"; domain=".baidu.com"; path_spec; domain_dot; discard; version=0
Set-Cookie3: BDSVRTM=0; path="/"; domain="www.baidu.com"; path_spec; discard; version=0
Set-Cookie3: BD_HOME=0; path="/"; domain="www.baidu.com"; path_spec; discard; version=0
# Load cookies saved in the LWP format
import http.cookiejar,urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookies2.txt',ignore_discard=True,ignore_expires=True)
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
print(response.read().decode('utf-8'))
# Handling exceptions: URLError
import urllib.error,urllib.request
try:
    response = urllib.request.urlopen('https://cuiqingcai/index.htm')
except urllib.error.URLError as e:
    print(e.reason)
# Handling exceptions: HTTPError
from urllib import error,request
try:
    response = request.urlopen('https://www.baidu.com/a.htm')
except error.HTTPError as e:
    print(e.reason, e.code)
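In practice the two are often combined: catch the more specific HTTPError first (it is a subclass of URLError), fall back to URLError, and otherwise treat the request as successful. A minimal sketch:
from urllib import error,request
try:
    response = request.urlopen('https://www.baidu.com/a.htm')
except error.HTTPError as e:
    # HTTPError also carries the status code and the response headers.
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')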
1.urlparse()
from urllib.parse import urlparse
url = 'https://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(url)
print(type(result))
print(result)
<class 'urllib.parse.ParseResult'>
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
The parsed URL is split into six components, so a URL roughly follows the format scheme://netloc/path;params?query#fragment
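ParseResult is a named tuple, so the components can be read either by attribute or by index; a minimal sketch:
from urllib.parse import urlparse
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])   # https https
print(result.netloc, result[1])   # www.baidu.com www.baidu.com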
2.urlunparse()
from urllib.parse import urlunparse
data = ['https', 'www.baidu.com', '/index.html','user', 'id=5', 'comment']
print(urlunparse(data))
https://www.baidu.com/index.html;user?id=5#comment
3.urlsplit()
Compared with urlparse(), the result has no separate params component; the parameters stay in the path.
from urllib.parse import urlsplit
url = 'https://www.baidu.com/index.html;user?id=5#comment'
print(urlsplit(url))
SplitResult(scheme='https', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')
4.urlunsplit()
from urllib.parse import urlunsplit
data = ['https','www.baidu.com', '/index.html;user', 'id=5', 'comment']
print(urlunsplit(data))
https://www.baidu.com/index.html;user?id=5#comment
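Because urlsplit() returns a 5-tuple, its result can be passed straight back into urlunsplit() to round-trip the URL; a minimal sketch:
from urllib.parse import urlsplit,urlunsplit
parts = urlsplit('https://www.baidu.com/index.html;user?id=5#comment')
print(urlunsplit(parts))   # https://www.baidu.com/index.html;user?id=5#comment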
5.urljoin(base, url, allow_fragments=True)
Joins a base URL with a (possibly relative) url: roughly speaking, base supplies the scheme, netloc, and path when the second argument lacks them; otherwise the second argument wins.
from urllib.parse import urljoin
base_url='http://www.baidu.com'
url='/index.html;a?id=5'
new_url=urljoin(base=base_url,url=url)
print(new_url)
new_url=urljoin(base='http://www.baidu.com',url='http://www.caixukun.com/index.html')
print(new_url)
new_url=urljoin(base='http://www.baidu.com',url='https://www.caixukun.com/index.html')
print(new_url)
new_url=urljoin(base='http://www.baidu.com/index.html',url='http://www.caixukun.com/')
print(new_url)
new_url=urljoin(base='www.baidu.com',url='?id=5')
print(new_url)
http://www.baidu.com/index.html;a?id=5
http://www.caixukun.com/index.html
https://www.caixukun.com/index.html
http://www.caixukun.com/
www.baidu.com?id=5
6.urlencode()
Serializes a dict of GET request parameters into a query string.
from urllib.parse import urlencode
data={
    'name':'tom',
    'age':20,
}
base_url='https://www.baidu.com?'
new_url=base_url+urlencode(data)
print(new_url)
https://www.baidu.com?name=tom&age=20
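The serialized query string can be appended to a base URL and requested directly; a sketch using the httpbin.org/get endpoint from earlier:
from urllib.parse import urlencode
import urllib.request
params = urlencode({'name':'tom','age':20})
url = 'http://httpbin.org/get?' + params
response = urllib.request.urlopen(url)
print(response.read().decode('utf-8'))   # the "args" field echoes name and age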
7.parse_qs(),parse_qsl()
Deserializes a query string back into a dict or a list of tuples.
from urllib.parse import parse_qs,parse_qsl
query = 'name=tom&age=20'
print(parse_qs(query))
print(parse_qsl(query))
{'name': ['tom'], 'age': ['20']}
[('name', 'tom'), ('age', '20')]
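If a plain dict with one value per key is enough, the list returned by parse_qsl() can simply be passed to dict(); a minimal sketch:
print(dict(parse_qsl(query)))   # {'name': 'tom', 'age': '20'}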
8.quote()
Percent-encodes Chinese (or other non-ASCII) text for use in a URL.
from urllib.parse import quote
k='练习'
print(quote(k))
%E7%BB%83%E4%B9%A0
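quote() is typically used to embed such keywords inside a URL; a sketch assuming a Baidu search URL:
from urllib.parse import quote
keyword='练习'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)   # https://www.baidu.com/s?wd=%E7%BB%83%E4%B9%A0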
9.unquote()
Decodes percent-encoded text in a URL back into the original characters.
from urllib.parse import unquote
k='%E7%BB%83%E4%B9%A0'
print(unquote(k))
练习