Contents
1. Basic usage of the urllib library
2. One type and six methods
3. Downloading with urllib
1) Downloading a web page
2) Downloading an image
3) Downloading a video
4. Customizing the urllib request object
1) Components of a URL: https://www.baidu.com/s?wd=<keyword>
2) User-Agent anti-scraping
5. Encoding and decoding
1) GET requests with urllib.parse.quote()
2) GET requests with urllib.parse.urlencode()
3) POST requests
6. AJAX GET requests
7. Scraping multiple pages via AJAX
8. AJAX POST requests
9. urllib exceptions (urllib.error)
10. Logging in: adding Cookie and Referer headers
11. Handler
12. urllib proxies
# Use the urllib library to fetch the source of the Baidu home page
import urllib.request

# (1) Define a URL
url = "http://www.baidu.com"
# (2) Simulate a browser sending a request to the server; the server returns a response
response = urllib.request.urlopen(url)
# (3) Get the page source from the response ("content" here means the page content)
# The read() method returns the body as bytes (binary data)
# Use decode('<encoding>') to decode the bytes into a string
content = response.read().decode('utf-8')
# (4) Print the data
print(content)
1. One type
# response is of type http.client.HTTPResponse
response = urllib.request.urlopen(url)
print(type(response))
2. Six methods
# read() reads the whole response body as bytes; read(n) reads at most n bytes
content = response.read()
print(content)
# readline() reads a single line
content = response.readline()
print(content)
# readlines() reads the body line by line into a list
content = response.readlines()
print(content)
# getcode() returns the status code; 200 means success
print(response.getcode())
# geturl() returns the requested URL
print(response.geturl())
# getheaders() returns the response headers as a list of (name, value) pairs
print(response.getheaders())
e.g.
url_web = 'http://www.baidu.com'
urllib.request.urlretrieve(url_web, 'baidu.html')
e.g.
url_img = 'https://t7.baidu.com/it/u=1285847167,3193778276&fm=193&f=GIF'
urllib.request.urlretrieve(url = url_img,filename='shanghai.png')
e.g.
# Note: urlretrieve saves whatever the URL returns; a page URL like the one below saves the
# page's HTML, so a direct video-file URL is needed to obtain a playable file.
url_video = "https://www.bilibili.com/video/BV1d54y1g7db?t=12.6&p=6"
urllib.request.urlretrieve(url=url_video, filename='pyspider.mp4')
http/https (protocol)   Note: common default ports — http 80 / https 443 / MySQL 3306 / Oracle 1521 / Redis 6379 / MongoDB 27017
www.baidu.com (host)
80/443 (port)
s (path)
wd=XX (query parameters)
# (anchor/fragment)
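For reference, urllib.parse.urlparse splits a URL into exactly these pieces; a minimal sketch (the example URL is illustrative):
from urllib.parse import urlparse

parts = urlparse('https://www.baidu.com:443/s?wd=python#top')
print(parts.scheme)    # https            (protocol)
print(parts.hostname)  # www.baidu.com    (host)
print(parts.port)      # 443              (port)
print(parts.path)      # /s               (path)
print(parts.query)     # wd=python        (query parameters)
print(parts.fragment)  # top              (anchor/fragment)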
import urllib.request
url='https://www.baidu.com'
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# urlopen() cannot take a dict of headers, so the headers cannot be passed to it directly
# Build a customized request object instead
# Because of the parameter order (data sits between url and headers), pass url and headers as keyword arguments
request = urllib.request.Request(url=url,headers=headers)
response=urllib.request.urlopen(request)
content=response.read().decode('utf-8')
print(content)
GET request with urllib.parse.quote(): a single Chinese keyword such as 计算机 must be percent-encoded before it is appended to the URL.
import urllib.request,urllib.parse
url = 'https://www.baidu.com/s?wd='
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# urllib.parse.quote('计算机') percent-encodes the keyword 计算机
name = urllib.parse.quote('计算机')
url = url + name
# Build the request object
request = urllib.request.Request(url=url,headers=headers)
# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)
# Get the response content
content = response.read().decode('utf-8')
# Print the data
print(content)
# For multiple parameters, use urlencode
import urllib.parse
import urllib.request
base_url = 'https://www.baidu.com/s?'
data = {
'wd':'计算机',
'function':'许多'
}
# urlencode percent-encodes the whole dict (including Chinese values) into a query string
new_data = urllib.parse.urlencode(data)
# Full request URL
url = base_url + new_data
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# Build the request object
request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
Note: two points to remember for POST requests
# The POST parameters must be encoded: data = urllib.parse.urlencode(data).encode('utf-8')
# The POST parameters go into the data argument of the Request object: request = urllib.request.Request(url=url, data=data, headers=headers)
import urllib.request
import urllib.parse
url = 'https://fanyi.baidu.com/sug'
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
data = {
'kw':'spider'
}
# POST parameters must be urlencoded and then encoded to bytes
data = urllib.parse.urlencode(data).encode('utf-8')
# POST parameters go into the data argument of the Request object
request = urllib.request.Request(url=url,data=data,headers=headers)
# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)
# Get the response data
content = response.read().decode('utf-8')
# Convert the JSON string into a Python object
import json
obj = json.loads(content)
print(obj)
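The sug endpoint returns JSON; the shape assumed below is what it returned at the time of writing and may change, so treat it as an assumption and guard for it:
# Assumed response shape: {"errno": 0, "data": [{"k": ..., "v": ...}, ...]}
for item in obj.get('data', []):
    print(item.get('k'), '->', item.get('v'))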
import urllib.request
# The rank data on this page is itself loaded via AJAX; the JSON endpoint can be found in the browser's Network tab
url = 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action='
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# (1) Build the request object
request = urllib.request.Request(url=url,headers=headers)
# (2) Get the response data
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# (3) Save the data locally
with open('douban.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
#https://movie.douban.com/top250?start=0
#https://movie.douban.com/top250?start=25&filter=
#https://movie.douban.com/top250?start=50&filter=
import urllib.request
import urllib.parse
def create_request(page):
    base_url = 'https://movie.douban.com/top250?'
    data = {
        'start': (page - 1) * 25,
        'filter': ''
    }
    data = urllib.parse.urlencode(data)
    url = base_url + data
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

# Program entry point
if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # Build the request for this page
        request = create_request(page)
        # Get the response data
        content = get_content(request)
        # Save the data for this page
        down_load(page, content)
#1
#https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
#post
# cname: 福州
# pid:
# pageIndex: 1
# pageSize: 10
#2
#https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
#post
# cname: 福州
# pid:
# pageIndex: 2
# pageSize: 10
import urllib.request
import urllib.parse
def create_request(page):
    base_url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    data = {
        'cname': '福州',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10'
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=base_url, data=data, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # Build the request object
        request = create_request(page)
        # Get the response data
        content = get_content(request)
        # Save the data for this page
        down_load(page, content)
1. urllib.error.HTTPError
2. urllib.error.URLError
import urllib.request
import urllib.error
url = 'https://blog.csdn.net/m0_73898323/article/details/134678266'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    print('404')
except urllib.error.URLError:
    print('The system is being upgraded...')
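To see what actually went wrong instead of printing a fixed message, note that HTTPError carries the server's status code and reason, and URLError carries a reason; a minimal sketch (httpbin.org is used here only as a convenient test endpoint):
import urllib.request
import urllib.error

try:
    urllib.request.urlopen('https://httpbin.org/status/404')
except urllib.error.HTTPError as e:
    # Raised for HTTP error status codes; exposes the server's code and reason
    print(e.code, e.reason)
except urllib.error.URLError as e:
    # Raised for lower-level failures (DNS errors, refused connections, ...)
    print(e.reason)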
For pages that require a login, we need to add a Cookie or Referer to the headers (see the sketch below).
Note: the Cookie carries our login state, and the Referer is checked by some servers for image hotlink protection.
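A minimal sketch of what that looks like; the URL, Cookie, and Referer values below are hypothetical placeholders and would in practice be copied from the browser's developer tools on the logged-in page:
import urllib.request

url = 'https://example.com/profile'  # hypothetical page that requires a login
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # Paste the full cookie string copied from the browser after logging in (placeholder value)
    'Cookie': 'sessionid=xxxxxxxx',
    # Some servers check the Referer, e.g. for image hotlink protection
    'Referer': 'https://example.com/'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))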
Purpose of Handler: build more advanced requests; things such as dynamic cookies and proxies cannot be expressed through the basic Request customization alone.
import urllib.request
url = 'http://www.baidu.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url,headers=headers)
# Create an HTTPHandler object
handler = urllib.request.HTTPHandler()
# Build an opener object from the handler
opener = urllib.request.build_opener(handler)
# Call the opener's open() method
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
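One concrete benefit of going through a handler is that handlers accept options a plain Request cannot express; for example, HTTPSHandler takes a debuglevel parameter that prints the raw request and response headers, which helps when debugging a scraper. A minimal sketch:
import urllib.request

# debuglevel=1 makes urllib print the outgoing request and incoming response headers to stdout
handler = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
print(response.getcode())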
Configuring a proxy in code:
(1) Create the Request object
(2) Create a ProxyHandler object
(3) Build an opener object from the handler
(4) Send the request with opener.open()
import urllib.request
url = 'http://www.baidu.com/s?wd=ip'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# Build the request object
request = urllib.request.Request(url=url,headers=headers)
# Simulate a browser visiting the server; with a proxy we go through an opener instead of calling urlopen() directly
#response = urllib.request.urlopen(request)
proxies = {
'http':'124.71.157.181:8020'
}
handler = urllib.request.ProxyHandler(proxies = proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
# Get the response data
content = response.read().decode('utf-8')
# Save to a local file
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
Note: with a proxy pool, pick one proxy at random
import random

# A list of candidate proxies (a plain dict cannot hold two entries under the same 'http' key)
proxies_pool = [
    {'http': '124.71.157.181:8020'},
    {'http': '121.37.205.253:10001'}
]
proxies = random.choice(proxies_pool)
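The randomly chosen proxy is then wired in exactly as in the single-proxy example above; a minimal sketch, reusing the request object built earlier:
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)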