Using the Urllib Library for Python Web Scraping:
# A minimal GET request: urlopen fetches the page and read() returns the raw bytes.
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))
# A POST request: passing a data argument (url-encoded bytes) switches urlopen to POST.
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())
b'{\n "args": {}, \n "data": "", \n "files": {}, \n "form": {\n "word": "hello"\n }, \n "headers": {\n "Accept-Encoding": "identity", \n "Content-Length": "10", \n "Content-Type": "application/x-www-form-urlencoded", \n "Host": "httpbin.org", \n "User-Agent": "Python-urllib/3.6"\n }, \n "json": null, \n "origin": "114.249.115.80, 114.249.115.80", \n "url": "https://httpbin.org/post"\n}\n'
# The timeout argument (in seconds) aborts the request if the server is too slow.
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())
b'{\n "args": {}, \n "headers": {\n "Accept-Encoding": "identity", \n "Host": "httpbin.org", \n "User-Agent": "Python-urllib/3.6"\n }, \n "origin": "114.249.112.95, 114.249.112.95", \n "url": "https://httpbin.org/get"\n}\n'
# Catching a timeout: urlopen wraps the socket.timeout in a URLError whose
# reason attribute can be inspected with isinstance.
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
TIME OUT
# urlopen returns an http.client.HTTPResponse object.
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response))  # <class 'http.client.HTTPResponse'>
# The response exposes the status code and the response headers.
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
200
[('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur'), ('Via', '1.1 varnish'), ('Content-Length', '49658'), ('Accept-Ranges', 'bytes'), ('Date', 'Wed, 06 Nov 2019 05:41:24 GMT'), ('Via', '1.1 varnish'), ('Age', '628'), ('Connection', 'close'), ('X-Served-By', 'cache-iad2126-IAD, cache-tyo19927-TYO'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '3, 1267'), ('X-Timer', 'S1573018885.616361,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
nginx
# read() returns the body as bytes; decode it to get the page source as text.
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.read().decode('utf-8'))
# A Request object bundles the URL, form data, headers, and method in one place.
from urllib import request, parse

url = 'http://httpbin.org/post'
params = {'name': 'xiaoming'}  # renamed from "dict" to avoid shadowing the built-in
data = bytes(parse.urlencode(params), encoding='utf8')
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "name": "xiaoming"
  },
  "headers": {
    "Accept-Encoding": "identity",
    "Content-Length": "13",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
  },
  "json": null,
  "origin": "114.249.115.80, 114.249.115.80",
  "url": "https://httpbin.org/post"
}
# Headers can also be added after construction with add_header().
from urllib import request, parse

url = 'http://httpbin.org/post'
params = {'name': 'xiaoming'}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "name": "xiaoming"
  },
  "headers": {
    "Accept-Encoding": "identity",
    "Content-Length": "13",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
  },
  "json": null,
  "origin": "114.249.112.95, 114.249.112.95",
  "url": "https://httpbin.org/post"
}
# ProxyHandler routes requests through a proxy; build_opener chains handlers
# into an opener. The address below is a local placeholder.
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:5044',
    'https': 'https://127.0.0.1:5044'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())
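If a site (or the proxy itself) demands HTTP Basic authentication, an auth handler can be chained into an opener the same way. A minimal sketch, assuming a hypothetical protected endpoint and placeholder credentials:

# HTTP Basic auth via a handler; URL, username, and password are placeholders.
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

url = 'http://localhost:5000/'  # hypothetical protected endpoint
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, 'username', 'password')  # realm=None matches any realm
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)
try:
    result = opener.open(url)
    print(result.read().decode('utf-8'))
except URLError as e:
    print(e.reason)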
# HTTPCookieProcessor captures the cookies a server sets into a CookieJar.
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)
BAIDUID=5ACAF365A160B0E2B66C2D8C2466498A:FG=1
BIDUPSID=5ACAF365A160B0E255BD0F80004FC588
H_PS_PSSID=1432_21099_29073_29568_29699_29220_26350_29998
PSTM=1573025864
delPer=0
BDSVRTM=0
BD_HOME=0
# MozillaCookieJar can persist cookies to disk in the Netscape/Mozilla format.
import http.cookiejar
import urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
# LWPCookieJar does the same in the libwww-perl format.
import http.cookiejar
import urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
# Saved cookies can be loaded back and sent with subsequent requests.
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
# URLError is the base exception for urllib failures; reason describes the cause.
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)
# HTTPError is a subclass of URLError, so catch it first to get the status
# code and response headers, then fall back to URLError for other failures.
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
[Errno 11001] getaddrinfo failed
# e.reason is not always a string; for a timeout it is a socket.timeout instance.
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))  # <class 'socket.timeout'>
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
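Pulling the error-handling patterns above together, here is a small sketch of a reusable fetch helper (the function name and User-Agent value are just illustrative choices):

import socket
import urllib.request
import urllib.error

def fetch(url, timeout=5):
    # Hypothetical helper: build the request with a browser-like User-Agent,
    # then handle HTTP errors, timeouts, and other network failures separately.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        print('HTTP error:', e.code, e.reason)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')
        else:
            print('URL error:', e.reason)

print(fetch('http://httpbin.org/get'))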
# urlparse splits a URL into six components; the scheme argument is only a
# default, applied when the URL itself carries no scheme.
from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.htm;user?id=5#comment', scheme='https')
print(result)
ParseResult(scheme='https', netloc='', path='www.baidu.com/index.htm', params='user', query='id=5', fragment='comment')
# When the URL already has a scheme, the scheme argument is ignored.
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
# allow_fragments=False folds the fragment into the query string.
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?=5#comment', allow_fragments=False)
print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='=5#comment', fragment='')
# With no query present, the fragment is folded into the path instead.
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
# urlunparse reassembles a URL from the six components, in order:
# scheme, netloc, path, params, query, fragment.
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
http://www.baidu.com/index.html;user?a=6#comment
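urllib.parse also provides urlsplit and urlunsplit, which behave like urlparse and urlunparse except that params is not split out as a separate component; a quick sketch:

from urllib.parse import urlsplit, urlunsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)
# SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')
print(urlunsplit(['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']))
# http://www.baidu.com/index.html?a=6#comment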
# urljoin resolves a (possibly relative) link against a base URL; when the
# second argument is already absolute, it wins.
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://xiaoming.com/FAQ.html'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://xiaoming.com/index.php'))
http://www.baidu.com/FAQ.html
https://xiaoming.com/FAQ.html
https://xiaoming.com/index.php
# urlencode serializes a dict into a query string, handy for building GET URLs.
from urllib.parse import urlencode

params = {
    'name': 'xiaoming',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
http://www.baidu.com?name=xiaoming&age=22
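urlencode covers key-value pairs; for percent-encoding a single component (for example, Chinese text in a query) urllib.parse also has quote and unquote, and parse_qs reverses urlencode. A short sketch:

from urllib.parse import quote, unquote, parse_qs

url = 'https://www.baidu.com/s?wd=' + quote('你好')
print(url)           # https://www.baidu.com/s?wd=%E4%BD%A0%E5%A5%BD
print(unquote(url))  # https://www.baidu.com/s?wd=你好
print(parse_qs('name=xiaoming&age=22'))  # {'name': ['xiaoming'], 'age': ['22']}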