Python 3 Web Scraping: Using the Urllib Library

Using the urllib library for web scraping in Python:

# The urlopen method in urllib.request
# Argument 1: the target URL; argument 2: extra data, e.g. the body of a POST request;
# argument 3: timeout, the request timeout; the remaining arguments are rarely needed here
# urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))

import urllib.parse
import urllib.request

# Build the second argument and send the request with POST; the data argument must be bytes
# POST the data word=hello, encoded as UTF-8; http://httpbin.org is a handy
# request-testing site, and the response comes back as a JSON string
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post',data=data)
print(response.read())


b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "json": null, \n  "origin": "114.249.115.80, 114.249.115.80", \n  "url": "https://httpbin.org/post"\n}\n'

import urllib.request
# Send a GET request with a response timeout; if the timeout is exceeded an exception is raised
# The test site http://httpbin.org echoes back the original request parameters
response = urllib.request.urlopen('http://httpbin.org/get',timeout=1)
print(response.read())
b'{\n  "args": {}, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "origin": "114.249.112.95, 114.249.112.95", \n  "url": "https://httpbin.org/get"\n}\n'

import socket
import urllib.request
import urllib.error
# Set a very short timeout so that the request raises a timeout exception
try:
    response = urllib.request.urlopen('http://httpbin.org/get',timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
    
TIME OUT

# Responses
# Response type: inspect the type of the response object
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(type(response))
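
# The printed type is <class 'http.client.HTTPResponse'>, which is why the
# object offers read(), status, getheaders() and getheader() as used below.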


# Get the status code and response headers from the response
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
# Response status code
print(response.status)
# All response headers, returned as a list of (name, value) tuples
print(response.getheaders())
# A single header from the list of response headers
print(response.getheader('Server'))
200
[('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur'), ('Via', '1.1 varnish'), ('Content-Length', '49658'), ('Accept-Ranges', 'bytes'), ('Date', 'Wed, 06 Nov 2019 05:41:24 GMT'), ('Via', '1.1 varnish'), ('Age', '628'), ('Connection', 'close'), ('X-Served-By', 'cache-iad2126-IAD, cache-tyo19927-TYO'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '3, 1267'), ('X-Timer', 'S1573018885.616361,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
nginx

# Read the body of the response object
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
# response.read() returns raw bytes, so they must be decoded; here UTF-8 is used
print(response.read().decode('utf-8'))

# Use a Request object to attach richer parameters (data, headers, method) and send the request via POST
from urllib import request,parse

# Build the URL
url = 'http://httpbin.org/post'

# Build the data
dict = {'name': 'xiaoming'}
# URL-encode the data and convert it to bytes
data = bytes(parse.urlencode(dict), encoding='utf8')

# Build the request headers
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org'
}
# Build the Request object
req = request.Request(url=url, data=data, headers=headers, method='POST')
# Send the request
response = request.urlopen(req)
# Print the response body
print(response.read().decode('utf-8'))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "xiaoming"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "13", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0(compatible;MSIE 5.5;Windows NT)"
  }, 
  "json": null, 
  "origin": "114.249.115.80, 114.249.115.80", 
  "url": "https://httpbin.org/post"
}

# Use the Request object's add_header() method to set request headers;
# when there are several header key-value pairs, add_header() can be
# called in a for loop (see the sketch after this example)
from urllib import request,parse

# Build the URL
url = 'http://httpbin.org/post'
# Build the data
dict = {'name': 'xioaming'}
# URL-encode the data and convert it to bytes
data = bytes(parse.urlencode(dict), encoding='utf8')
# Build the Request object
req = request.Request(url=url, data=data,method='POST')
# Add a request header
req.add_header('User-Agent','Mozilla/4.0(comatibles;MSIE 5.5;Windows NT)')
# Send the request
response = request.urlopen(req)
# Print the response body
print(response.read().decode('utf-8'))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "xioaming"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "13", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0(comatibles;MSIE 5.5;Windows NT)"
  }, 
  "json": null, 
  "origin": "114.249.112.95, 114.249.112.95", 
  "url": "https://httpbin.org/post"
}
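
# As mentioned above, when there are several headers to set, add_header() can
# simply be called in a loop; a minimal sketch (the header values are illustrative):
from urllib import request, parse

url = 'http://httpbin.org/post'
data = bytes(parse.urlencode({'name': 'xioaming'}), encoding='utf8')
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org'
}
req = request.Request(url=url, data=data, method='POST')
# Add every header key-value pair in turn
for key, value in headers.items():
    req.add_header(key, value)
response = request.urlopen(req)
print(response.read().decode('utf-8'))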

# Handlers
# Setting a proxy
# Using a proxy requires proxy software to actually be running, otherwise an exception
# is raised; routing requests through a proxy helps keep the scraper's IP from being banned
import urllib.request
# Build the proxy handler
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:5044',    # port of your own proxy software
    'https': 'https://127.0.0.1:5044'
})
# Build the opener
opener = urllib.request.build_opener(proxy_handler)

# Send the request
response = opener.open('http://www.baidu.com')
# Print the response body
print(response.read())
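
# A hedged variant of the snippet above: if no proxy is actually listening on
# 127.0.0.1:5044 (the port is only an example), opener.open() raises
# urllib.error.URLError, which can be caught like this:
import urllib.error
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:5044',
    'https': 'https://127.0.0.1:5044'
})
opener = urllib.request.build_opener(proxy_handler)
try:
    response = opener.open('http://www.baidu.com', timeout=5)
    print(response.read())
except urllib.error.URLError as e:
    print('Proxy request failed:', e.reason)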


# Cookies
# Cookies are stored on the client side: small text records that identify the user and keep a login session alive
# With the right cookies set, a scraper can fetch pages that require being logged in
import http.cookiejar,urllib.request
# Build the cookie jar
cookie = http.cookiejar.CookieJar()
# Build the handler
handler = urllib.request.HTTPCookieProcessor(cookie)
# Build the opener
opener = urllib.request.build_opener(handler)
# Send the request
response = opener.open('http://www.baidu.com')
# Print the cookies that were set
for item in cookie:
    print(item.name + "=" + item.value)

BAIDUID=5ACAF365A160B0E2B66C2D8C2466498A:FG=1
BIDUPSID=5ACAF365A160B0E255BD0F80004FC588
H_PS_PSSID=1432_21099_29073_29568_29699_29220_26350_29998
PSTM=1573025864
delPer=0
BDSVRTM=0
BD_HOME=0
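
# Optionally (a minimal sketch), the cookie-aware opener can be installed
# globally so that plain urllib.request.urlopen() reuses the same cookie jar
# for every later request:
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
urllib.request.install_opener(opener)
response = urllib.request.urlopen('http://www.baidu.com')
print(len(cookie))   # number of cookies collected by the request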

# Save the cookies from the visited site to a text file so that, as long as
# they have not expired, they can be loaded and reused later
import http.cookiejar,urllib.request
# Build the cookie jar
filename = "cookie.txt"
# Save in MozillaCookieJar format (the Netscape/Mozilla cookies.txt format)
cookie = http.cookiejar.MozillaCookieJar(filename)
# Build the handler
handler = urllib.request.HTTPCookieProcessor(cookie)
# Build the opener
opener = urllib.request.build_opener(handler)
# Send the request
response = opener.open('http://www.baidu.com')
# Save the cookies to the file
cookie.save(ignore_discard=True,ignore_expires=True)


# Another format for saving cookies: the LWP format
import http.cookiejar, urllib.request
# Build the cookie jar
filename = "cookie.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
# Build the handler
handle = urllib.request.HTTPCookieProcessor(cookie)
# Build the opener
opener = urllib.request.build_opener(handle)
# Send the request
response = opener.open('http://www.baidu.com')
# Save the cookies to the file
cookie.save(ignore_discard=True,ignore_expires=True)

# Load a previously saved cookie file and use those cookies for the request
# Note: load the file with the same CookieJar class that was used to save it
import urllib.request, http.cookiejar
# Build the cookie jar, specifying the LWP format
cookie = http.cookiejar.LWPCookieJar()
# Load the cookie file
cookie.load('cookie.txt',ignore_discard=True,ignore_expires=True)
# Build the handler
handler = urllib.request.HTTPCookieProcessor(cookie)
# Build the opener
opener = urllib.request.build_opener(handler)
# Send the request
response = opener.open('http://www.baidu.com')
# Print the response body
print(response.read().decode('utf-8'))
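
# Correspondingly, a file that was saved with MozillaCookieJar has to be
# loaded with MozillaCookieJar; a minimal sketch:
import http.cookiejar, urllib.request

cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))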


# Exception handling
# urllib's error module
# It provides two main exception classes, URLError and HTTPError; HTTPError is
# a subclass of URLError, and catching these two is usually enough
from urllib import request,error
# Without catching the exception, the program would terminate here
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)  # Print the reason for the exception

from urllib import request,error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
# Catch the subclass first and print its details
except error.HTTPError as e:
    # An HTTPError carries three pieces of information: reason, code and headers
    print(e.reason,e.code, e.headers,sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
[Errno 11001] getaddrinfo failed

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.baidu.com',timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason,socket.timeout):
        print('TIME OUT')
        
    



# URL parsing
# The urlparse function
# Splits the given URL into its six standard components
# urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

from urllib.parse import urlparse

# Parse a URL
# Argument 2, scheme: the protocol to assume when the URL itself does not specify one
result = urlparse('www.baidu.com/index.htm;user?id=5#comment',scheme='https')
print(result)

  
ParseResult(scheme='https', netloc='', path='www.baidu.com/index.htm', params='user', query='id=5', fragment='comment')

from urllib.parse import urlparse
# When the URL itself specifies a protocol, the scheme argument is ignored
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment',scheme='https')
print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
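
# ParseResult is a named tuple, so the six parts can also be read by attribute
# or by index; a small sketch:
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])   # http http
print(result.netloc, result[1])   # www.baidu.com www.baidu.com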

# Argument 3, allow_fragments, controls handling of the '#' anchor part of the URL
# When allow_fragments=False, the fragment is not split off: its text is merged
# into the preceding component, and the fragment field in the result is empty
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?=5#comment',allow_fragments=False)
print(result)

ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='=5#comment', fragment='')

# If the query and params parts are empty, the fragment is merged into the path instead
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html#comment',allow_fragments=False)
print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')

# urlunparse
# The inverse of urlparse: joins the six components back into a complete URL
from urllib.parse import urlunparse
data = ['http', 'www.baidu.com','index.html','user', 'a=6','comment']
print(urlunparse(data))

http://www.baidu.com/index.html;user?a=6#comment
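
# Since urlunparse is the inverse of urlparse, round-tripping a URL through
# both reproduces the original string; a quick sketch:
from urllib.parse import urlparse, urlunparse

url = 'http://www.baidu.com/index.html;user?a=6#comment'
print(urlunparse(urlparse(url)) == url)   # True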

# The urljoin method
from urllib.parse import urljoin
# urljoin splits both arguments into the six urlparse components and resolves
# them against each other: components present in the second URL take precedence,
# and missing ones are filled in from the first (base) URL
print(urljoin('http://www.baidu.com','FAQ.html'))
print(urljoin('http://www.baidu.com','https://xioaming.com/FAQ.html'))
print(urljoin('http://www.baidu.com?wd=abc','https://xioaming.com/index.php'))

http://www.baidu.com/FAQ.html
https://xioaming.com/FAQ.html
https://xioaming.com/index.php
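
# A few more cases (the URLs are only illustrative): components missing from
# the second argument are taken from the base URL.
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com/about.html', 'FAQ.html'))
# http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com/about.html', '?category=2'))
# http://www.baidu.com/about.html?category=2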

# urlencode
# Converts a dict of parameters into a GET query string for a URL
from urllib.parse import urlencode

params = {
    'name': 'xiaoming',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
http://www.baidu.com?name=xiaoming&age=22
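
# The encoded URL can be passed straight to urlopen for a GET request; a
# minimal sketch against the httpbin test site used earlier:
from urllib.parse import urlencode
import urllib.request

params = {'name': 'xiaoming', 'age': 22}
url = 'http://httpbin.org/get?' + urlencode(params)
response = urllib.request.urlopen(url)
print(response.read().decode('utf-8'))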
