1. The urllib library
(1) Requests and responses
(2) Proxy settings
(3) Cookies
(4) Exception handling
(5) URL parsing
2. The requests library
(1) Requests and responses
(2) Advanced operations
The urllib package consists of four modules:
| Request module | urllib.request |
| Exception handling module | urllib.error |
| URL parsing module | urllib.parse |
| robots.txt module | urllib.robotparser |
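urllib.robotparser is listed in the table but not used again below, so here is a minimal sketch of it (the robots.txt URL is only an illustrative choice):
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')  # illustrative URL; any site's robots.txt works
rp.read()  # download and parse the robots.txt file
# can_fetch(useragent, url) reports whether that user agent is allowed to crawl the URL
print(rp.can_fetch('*', 'https://www.baidu.com/index.html'))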
Function signature:
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))  # read and decode the response body
import urllib.parse
import urllib.request
data = bytes(urllib.parse.urlencode({'word':'hello'}), encoding = 'utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data = data)
print(response.read())
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(type(response))                 # type of the response object
print(response.status)                # status code
print(response.getheaders())          # all response headers
print(response.getheader('Server'))   # one specific response header
Constructing a Request object:
import urllib.request
request = urllib.request.Request('https://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
Setting the request method, request headers and form data:
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
form = {'name': 'Germey'}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('https://www.baidu.com')
print(response.read())
(3) Cookies (a cookie is a text file kept on the client side that records the user's identity)
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)
import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
# cookie = http.cookiejar.LWPCookieJar(filename)  # an alternative on-disk cookie format
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard = True, ignore_expires = True)
import http.cookiejar, urllib.request
# this assumes cookie.txt was saved in the LWP format (the commented-out LWPCookieJar line above)
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.html')
except error.URLError as e:
    print(e.reason)
from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.html')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')  # handle the HTTPError subclass first
except error.URLError as e:
    print(e.reason)  # fall back to the parent class URLError
else:
    print('Request Successful')
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
The scheme parameter is the protocol type: if the URL string itself has no scheme, the value passed in is used as the default; if the URL already has one, the argument has no effect.
allow_fragments controls whether the fragment (anchor) is parsed separately: if False, the fragment is not split off and stays attached to the preceding component (see the sketch after the example below).
from urllib.parse import urlparse
# urlparse splits a URL into six components: scheme, netloc, path, params, query and fragment
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
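A minimal sketch of the two parameters described above; the commented lines show the expected output:
from urllib.parse import urlparse
# the URL below carries no scheme, so the scheme argument is filled in as the default
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
# expected: ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')
# with allow_fragments=False the fragment is not split off; it stays attached to the query
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
# expected: the query becomes 'id=5#comment' and fragment is empty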
from urllib.parse import urlunparse
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']  # the six components given as a list
print(urlunparse(data))
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com', 'FAQ.html'))
# result: http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com', 'http://cuiqingcai.com/FAQ.html'))
# result: http://cuiqingcai.com/FAQ.html
urljoin treats the second URL as the primary one; only the components missing from the second URL are taken from the first, as the extra calls below illustrate.
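Two more calls sketching that fill-in rule; the commented results are the expected outputs:
from urllib.parse import urljoin
# the second URL has no scheme or host, so both are taken from the first URL
print(urljoin('http://www.baidu.com/about.html', '?category=2#comment'))
# expected: http://www.baidu.com/about.html?category=2#comment
# the second URL is complete, so the first URL contributes nothing
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
# expected: https://cuiqingcai.com/FAQ.html?question=2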
from urllib.parse import urlencode
params = {
'name':'germey',
'age': 22
}
base_url = 'http://www.baidu.com?'  # note the trailing '?', so the encoded params form the query string
url = base_url + urlencode(params)
print(url)
# result: http://www.baidu.com?name=germey&age=22
requests is built on top of urllib; it is more convenient to use and saves a great deal of boilerplate work.
Function signature: res = requests.get(url, params=None, **kwargs)
import requests
response = requests.get('http://httpbin.org/get')
print(response.text)
Note: requests URL-encodes the parameters passed in via params, but it does not re-encode the URL string itself.
import requests
data = {
'name':'germey',
'age':22
}
response = requests.get('http://httpbin.org/get', params=data)
# response = requests.get('http://httpbin.org/get?name=germey&age=22')
print(response.text)
import requests
response = requests.get('http://httpbin.org/get')
print(type(response.text))
print(response.json())
print(type(response.json()))
import requests
response = requests.get('https://github.com/favicon.ico')
print(type(response.text), type(response.content))
print(response.text)     # text: the body decoded to a string
print(response.content)  # content: the raw bytes
with open('favicon.ico', 'wb') as f:
    f.write(response.content)
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
response = requests.get('https://www.zhihu.com/explore', headers = headers)
print(response.text)
import requests
data = {'name':'germey', 'age':'22'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
response = requests.post('http://httpbin.org/post', data=data, headers=headers)
print(response.json())
import requests
response = requests.get('http://www.jianshu.com')
print(type(response.status_code), response.status_code)  # status code
print(type(response.headers), response.headers)          # response headers
print(type(response.cookies), response.cookies)          # cookies
print(type(response.url), response.url)                  # final URL
print(type(response.history), response.history)          # redirect history
import requests
response = requests.get('https://www.jianshu.com')
exit() if not response.status_code == requests.codes.ok else print('Request Successfully')
# or, equivalently:
exit() if not response.status_code == 200 else print('Request Successfully')
import requests
files = {'file': open('favicon.ico', 'rb')}
response = requests.post('http://httpbin.org/post', files = files)
print(response.text)
import requests
response = requests.get('https://www.baidu.com')
print(response.cookies)
for key, value in response.cookies.items():
    print(key + '=' + value)
import requests
# use a Session so that the two requests share the same cookies
sess = requests.Session()  # simulate one browsing session
sess.get('http://httpbin.org/cookies/set/number/123456789')  # set a cookie
response = sess.get('http://httpbin.org/cookies')  # read the cookies back
print(response.text)
If a site's SSL certificate is invalid, the browser blocks the page, and a crawler cannot fetch its data either. Passing a single parameter to requests skips certificate verification.
import requests
from requests.packages import urllib3
urllib3.disable_warnings()  # suppress the insecure-request warning
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)
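Rather than disabling verification entirely, requests also lets you point verify at a CA bundle, or send a client certificate via cert; a minimal sketch, where the certificate paths are placeholders:
import requests
# verify can be a path to a trusted CA bundle instead of False ('/path/to/ca_bundle.crt' is a placeholder)
response = requests.get('https://www.12306.cn', verify='/path/to/ca_bundle.crt')
print(response.status_code)
# cert sends a client certificate as a (certificate file, private key file) tuple, both placeholders here
# response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))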
import requests
proxies = {
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
}
# if the proxy server requires a username and password:
# proxies = {
#     'http': 'http://user:[email protected]:9743',
#     'https': 'https://user:[email protected]:9743'
# }
response = requests.get('https://www.taobao.com', proxies=proxies)
print(response.status_code)
import requests
response = requests.get('https://www.taobao.com', timeout=1)  # raise an exception if there is no response within the time limit
print(response.status_code)
import requests
from requests.auth import HTTPBasicAuth
response = requests.get('http://120.27.34.24:9001', auth = HTTPBasicAuth('user', '123'))
# or, equivalently:
# response = requests.get('http://120.27.34.24:9001', auth = ('user', '123'))
print(response.status_code)
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    response = requests.get('http://httpbin.org/get', timeout=0.5)
    print(response.status_code)
except ReadTimeout:  # catch the subclass exception
    print('Timeout')
except ConnectionError:  # catch the subclass exception
    print('Http error')
except RequestException:  # catch the parent class exception
    print('Error')