The urllib Library in Detail

urllib is Python's built-in HTTP request library. It provides a range of functions for working with URLs and is divided into four modules:

  • urllib.request: the request module
  • urllib.error: the exception handling module
  • urllib.parse: the URL parsing module
  • urllib.robotparser: the robots.txt parsing module
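
As a quick, minimal orientation sketch that touches each of the four submodules once (the URLs are only examples; everything here is covered in detail below):

from urllib import request, error, parse, robotparser

print(parse.urlparse('https://www.baidu.com/index.html')) # urllib.parse: split a URL into components
try:
    response = request.urlopen('https://www.baidu.com', timeout = 5) # urllib.request: send the request
    print(response.status)
except error.URLError as e: # urllib.error: exceptions raised by urllib.request
    print(e.reason)
rp = robotparser.RobotFileParser('https://www.baidu.com/robots.txt') # urllib.robotparser: parse robots.txt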

urlopen(url, data=None, [timeout,]*, cafile=None, capath=None, cadefault=False, context=None)

The first parameter, url, is required. The second parameter, data, is the payload for a POST request, and the third, timeout, sets the request timeout. The remaining parameters can be passed as needed.

from urllib import request
response = request.urlopen('https://www.baidu.com') # returns a response object; call read() to get the content
print(response.read().decode('utf-8')) # read() returns bytes, so decode it into a str

Just two lines of code are enough to fetch the source of the Baidu homepage, which is very convenient.

Using the data parameter:
from urllib import request, parse

data = bytes(parse.urlencode({'file': 'text'}), encoding = 'utf-8')
response = request.urlopen('http://httpbin.org/post', data = data)
print(response.read().decode('utf-8'))
Using the timeout parameter:
from urllib import request, error
import socket

try:
	response = request.urlopen('http://httpbin.org/get', timeout = 0.1)
except error.URLError as e:
	if isinstance(e.reason, socket.timeout):
		print('Time Out!') 

Responses

Response type
from urllib import request

response = request.urlopen('https://www.baidu.com')
print(type(response)) # <class 'http.client.HTTPResponse'>
Status code and response headers
from urllib import request

response = request.urlopen('https://www.baidu.com')
print(response.status) # 200 means the request succeeded
print(response.getheaders()) # all response headers
print(response.getheader('Server')) # the value of a single response header, e.g. Server

Constructing a Request object (for sending more complex requests)

from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Host': 'httpbin.org'
}
form_data = {
	'name': 'albert'
}
data = bytes(parse.urlencode(form_data), encoding = 'utf-8')
req = request.Request(url = url, data = data, headers = headers, method = 'POST')
'''
You can also use add_header() to add request headers:
req = request.Request(url = url, data = data, method = 'POST')
req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36')
'''
response = request.urlopen(req)
print(response.read().decode('utf-8'))

Handler

Proxies (to keep your IP from being restricted)
from urllib import request

proxy_handler = request.ProxyHandler({
    'http': 'http://127.0.0.1:1087', # replace with your own proxy address and port
    'https': 'https://127.0.0.1:1087'
})
opener = request.build_opener(proxy_handler)
response = opener.open('https://www.baidu.com')
print(response.read().decode('utf-8'))
Cookies (used in crawlers to keep a session logged in)
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)
Saving cookies
import http.cookiejar, urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename) # one of two supported save formats
#cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie) 
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard = True, ignore_expires = True) # ignore_discard: also save cookies marked to be discarded; ignore_expires: also save cookies that have already expired. The file is overwritten if it exists.
Loading a cookie file
import http.cookiejar, urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar() # load with the same class that was used to save
#cookie = http.cookiejar.LWPCookieJar()
cookie.load(filename, ignore_discard = True, ignore_expires = True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

Exception handling

urllib.error defines three kinds of errors:
  1. URLError: has only a reason attribute
  2. HTTPError: has three attributes: code, reason, and headers. It is a subclass of URLError, so HTTPError is usually caught first and URLError after it
  3. ContentTooShortError: raised when a file download is incomplete (see the sketch after this list)
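As a minimal sketch of when ContentTooShortError occurs (the download URL here is only a placeholder), urlretrieve raises it when fewer bytes arrive than the server's Content-Length header promised:

from urllib import request, error

try:
    # urlretrieve raises ContentTooShortError when the amount of data received
    # is less than the Content-Length reported by the server
    request.urlretrieve('http://httpbin.org/image/png', 'image.png')
except error.ContentTooShortError as e:
    print('Download incomplete:', e)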
Getting the error reason
from urllib import request, error

try:
    response = request.urlopen('http://www.bai.com.cn') # a URL that does not exist (this request returns 404)
except error.HTTPError as e:
    print(e.reason) #Not Found
Order of exception handling
from urllib import request, error

try:
    response = request.urlopen('http://www.bai.com.cn') # a URL that does not exist
except error.HTTPError as e:
    print(e.code, e.reason, e.headers, sep = '\n')
except error.URLError as e:
    print(e.reason)
else:
    print('successful!')

URL parsing

urlparse (splits a URL into its components)
urllib.parse.urlparse(urlstring, scheme = '', allow_fragments = True)
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result)   #ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')

result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme = 'https')   
print(result)#ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme = 'https')
print(result)   #ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments = False)
print(result)   #ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments = False)
print(result)   #ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
urlunparse (assembles a URL from its components)
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=5', 'comment']
print(urlunparse(data)) #http://www.baidu.com/index.html;user?id=5#comment
urljoin (resolves the second URL against the first, base URL; an absolute second URL overrides the base)
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'join.html')) #http://www.baidu.com/join.html
print(urljoin('http://www.baidu.com', 'https://youku.com/haokan.html')) #https://youku.com/haokan.html
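To make the relative-resolution behaviour concrete, here are two more illustrative calls (the paths are made up):

print(urljoin('http://www.baidu.com/docs/index.html', 'join.html')) #http://www.baidu.com/docs/join.html
print(urljoin('http://www.baidu.com/docs/index.html', '/join.html')) #http://www.baidu.com/join.html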
urlencode (serializes a dict of parameters into a URL-encoded query string)
from urllib.parse import urlencode

params = {
    'name': 'albert',
    'gender': '男'
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url) #http://www.baidu.com?name=albert&gender=%E7%94%B7

robotparser

This module parses robots.txt files. It is not used very often; see the official urllib documentation if you need the details.
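As a minimal sketch of the typical workflow (the site and the path being checked are only examples):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt') # point the parser at the site's robots.txt
rp.read()                                      # download and parse the file
print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=python')) # may user-agent '*' crawl this URL?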
