Request(url, data): wraps a URL and request data; the returned Request object can be passed directly to urlopen
urlretrieve(url, path): downloads the page at url straight to a local file
urlcleanup(): clears the cache left behind by urlretrieve
urlopen(url[, timeout]): opens url; if timeout is set and exceeded, an exception is raised. Returns a Response object, used as follows:
response.getcode(): status code of the response
response.geturl(): the URL actually accessed
response.info(): header information of the page
response.read().decode('utf-8'): the page source
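A minimal sketch tying the calls above together (httpbin.org and the local file name are placeholders):

from urllib import request

response = request.urlopen('https://httpbin.org/get', timeout=5)
print(response.getcode())               # e.g. 200
print(response.geturl())                # final URL after any redirects
print(response.info())                  # response headers
print(response.read().decode('utf-8'))  # body as text

request.urlretrieve('https://httpbin.org/get', 'page.json')  # save straight to disk
request.urlcleanup()                                         # clear urlretrieve's cache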
build_opener([proxy, request.HTTPHandler]): builds an opener object, used as follows:
opener.addheaders = [(), (), ()...]: a list of request-header tuples, e.g. User-Agent and Cookie entries
opener.open(url): opens url carrying those headers, returns a Response object

from urllib import request

url = 'https://www.baidu.com'
# each header must be a (name, value) tuple
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) ...')
cookies = ('Cookie', 'B2352_3239RTM=207 ...')
opener = request.build_opener()
opener.addheaders = [headers, cookies]
response = opener.open(url)
install_opener(opener): installs the opener globally; once it is global, opener.open(url) can be replaced with request.urlopen(url)
from urllib import request

url = 'https://www.baidu.com'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) ...')
cookies = ('Cookie', 'B2352_3239RTM=207 ...')
opener = request.build_opener()
opener.addheaders = [headers, cookies]
# install as the global opener
request.install_opener(opener)
# opener.open -> request.urlopen
response = request.urlopen(url)
ProxyHandler({"scheme":"ip:port”})
: 与代理ip相关。返回proxy
对象可传入build_opener()
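A minimal sketch of wiring the two together (the address below is a placeholder, not a live proxy):

from urllib import request

proxy = request.ProxyHandler({'http': '127.0.0.1:8080'})  # placeholder ip:port
opener = request.build_opener(proxy)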
quote(text): percent-encodes a string for use in a URL; converts Chinese characters into URL-safe text
from urllib.parse import quote

params = '?name=玛尔扎哈&age=20&sex=girl'
print(quote(params))  # note: quote also encodes ?, = and & (only / is safe by default)
params = '沙扬娜拉'
print(quote(params))  # %E6%B2%99%E6%89%AC%E5%A8%9C%E6%8B%89
urlencode(form_data): turns a dict form into a query string, percent-encoding Chinese and special characters along the way
from urllib.parse import urlencode

formdata = {
    'name': '阿兹尔',  # Chinese is quote-encoded automatically
    'username': 'demanwei',
    'password': '123abc',
    'extraA': '',
    'extraB': None,  # None is stringified to 'None'
}
print(urlencode(formdata))
# name=%E9%98%BF%E5%85%B9%E5%B0%94&username=demanwei&password=123abc&extraA=&extraB=None
parse_qs(urlencoded): the inverse of urlencode; decodes a query string into a dict, automatically unquoting encoded Chinese
from urllib.parse import parse_qs

urlencoded = 'name=%E9%98%BF%E5%85%B9%E5%B0%94&username=demanwei&password=123abc&extraA=&extraB=None'
# each value in the dict is a list; blank values like extraA are dropped unless keep_blank_values=True
print(parse_qs(urlencoded))
# usually each key has exactly one value, so a plain dict can be recovered like this:
print({k: v[0] for k, v in parse_qs(urlencoded).items()})
urlparse(url): splits a URL into its six components (scheme, netloc, path, params, query, fragment)
from urllib.parse import urlparse

url = 'http://localhost:8080/user/get?id=12'
print(urlparse(url))
# ParseResult(scheme='http', netloc='localhost:8080', path='/user/get', params='', query='id=12', fragment='')
urlsplit(url): almost the same as urlparse, but without params, i.e. (scheme, netloc, path, query, fragment)
from urllib.parse import urlsplit

url = 'http://localhost:8080/user/get?id=12'
print(urlsplit(url))
# SplitResult(scheme='http', netloc='localhost:8080', path='/user/get', query='id=12', fragment='')
Prefer urlencode over quote!

from urllib.parse import quote, urlencode

him = '玛尔扎哈'
her = '沙扬娜拉'
raw_url = 'http://localhost:8080?him={}&her={}'.format(him, her)
print(raw_url)
# quote each value separately; never run quote over the whole URL!!
new_url = 'http://localhost:8080?him={}&her={}'.format(quote(him), quote(her))
print(new_url)
# urlencode returns the whole query string directly; use this from now on
formdata = {'him': him, 'her': her}
new_url = 'http://localhost:8080?{}'.format(urlencode(formdata))
print(new_url)
from urllib import request

kw = request.quote('马云')  # quote is also reachable via urllib.request
url = 'https://www.baidu.com/s?wd={}'.format(kw)
response = request.urlopen(url)
from urllib import request, parse

url = 'https://www.iqianyue.com/mypost'
# POST form parameters
body = {'name': '林在超', 'pass': '111111'}
data = parse.urlencode(body).encode('utf-8')  # POST data must be bytes
req = request.Request(url, data=data)  # Request object
response = request.urlopen(req)
text = response.read().decode('utf-8')
print(text)
from urllib import request

url = 'https://www.baidu.com'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64)....')
cookies = ('Cookie', '........')
opener = request.build_opener()
opener.addheaders = [headers, cookies]  # each entry must be a tuple
# install the opener globally
request.install_opener(opener)
response = opener.open(url)
print(response.getcode())
from http.cookiejar import MozillaCookieJar
from urllib import request

# load cookies saved in Mozilla format (the file must already exist)
cookiejar = MozillaCookieJar('resource/cookie.txt')
cookiejar.load(ignore_discard=True)  # keep session cookies too
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)
opener.open('https://httpbin.org/cookies/set?course=abc')
cookiejar.save(ignore_discard=True)  # write cookies (including new ones) back to the file
from urllib import request, error

try:
    # note the single slash: the URL has no host, so URLError is raised
    request.urlopen('http:/www.ncut.edu.com')
except error.URLError as e:
    print(e)         # <urlopen error no host given>
    print(e.reason)  # no host given
    print(repr(e))   # URLError('no host given')
except Exception as e:
    print(repr(e))
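For completeness, HTTP 4xx/5xx responses raise urllib.error.HTTPError (a subclass of URLError), which can be caught the same way; a minimal sketch, using httpbin.org as a placeholder endpoint:

from urllib import request, error

try:
    request.urlopen('https://httpbin.org/status/404')
except error.HTTPError as e:
    print(e.code)    # 404
    print(e.reason)  # reason phrase sent by the server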
How a proxy works: before requesting the target site, we first contact the proxy server and let it request the target site on our behalf; once the proxy server has the target site's data, it forwards the data back to our code.
Steps:
1. request.ProxyHandler({"scheme": "ip:port"}): build a handler that carries the proxy
2. build an opener from the handler
3. send the request through the opener
Method 1: suited to a stable proxy IP; built the same way as the global user-agent opener above
from urllib import request

ip = '175.6.6.101:5000'
proxy = request.ProxyHandler({'http': ip})
opener = request.build_opener(proxy, request.HTTPHandler)
request.install_opener(opener)
response = request.urlopen('http://www.win4000.com/meitu.html')
Method 2: fetch the proxy per request, suited to unstable IPs
handler = request.ProxyHandler({"http":"233.241.78.43:8010"})
opener = request.build_opener(handler)
response = opener.open('http://www.win4000.com/meitu.html')
import random
from urllib import request

def spider_ip():
    """Scrape a proxy-IP site (e.g. kuaidaili ...) and return ip_pool."""
    ip_pool = []
    pass  # scraping omitted
    # [{'protocol': 'ip:port'}, {'protocol': 'ip:port'}, {'protocol': 'ip:port'}...]
    return ip_pool

ip_pool = spider_ip()
# each pool entry is already a {'protocol': 'ip:port'} dict, so hand it to ProxyHandler directly
proxy = request.ProxyHandler(random.choice(ip_pool))
opener = request.build_opener(proxy, request.HTTPHandler)
request.install_opener(opener)
response = request.urlopen('https://www.baidu.com')
import random
from urllib import request

# a pool of user-agents
UA_pool = [
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
]
headers = ('User-Agent', random.choice(UA_pool))
opener = request.build_opener()
opener.addheaders = [headers]
request.install_opener(opener)
response = request.urlopen('https://www.baidu.com')