The requests module
- Concept: a module built on top of network requests
- Purpose: simulates a browser sending requests, which is what a crawler is built on
- Installation: pip install requests
- Coding workflow:
  - specify the URL
  - send the request
  - get the response data
  - persist the data
Examples:
1: Scrape the page source of the Sogou homepage
import requests
# 1. Specify the URL
url = 'https://www.sogou.com/'
# 2. Send the request: get() returns a response object
response = requests.get(url=url)
# 3. Get the response data: .text returns the response body as a string
page_text = response.text
# 4. Persist it
with open('sogou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
2: A simple web collector (making the request parameters dynamic)
url = 'https://www.sogou.com/web'
query = input('Enter a search keyword: ')
params = {
    'query': query
}
response = requests.get(url=url, params=params)
file_name = query + '.html'
page_text = response.text
with open(file_name, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
Problems with the code above:
- Garbled output (wrong encoding)
  - fix: set response.encoding = 'xxx' before reading .text
- Missing data
  - anti-crawl mechanism: UA (User-Agent) detection
  - counter-measure: UA spoofing
# Improved version of the code above
url = 'https://www.sogou.com/web'
query = input('Enter a search keyword: ')
params = {
    'query': query
}
# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
response = requests.get(url=url, params=params, headers=headers)
# Specify the encoding explicitly
response.encoding = 'utf-8'
file_name = query + '.html'
page_text = response.text
with open(file_name, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
3: Dynamically loaded data
Data that is fetched by a separate network request (e.g. an ajax call).
Scrape the dynamically loaded movie data from Douban Movies.
url = 'https://movie.douban.com/j/chart/top_list'
params = {
    'type': '5',
    'interval_id': '100:90',
    'action': '',
    'start': '0',
    'limit': '10'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
response = requests.get(url=url, params=params, headers=headers)
# When the response is known to be a json string, json() returns the deserialized data directly
movie_list = response.json()
for movie in movie_list:
    print(movie['title'], movie['score'])
Summary: when scraping an unfamiliar site, the first thing to determine is whether the target data is dynamically loaded.
- Yes: use a packet-capture tool to find the packet that carries the dynamically loaded data, and extract its URL and request parameters.
- No: simply send a request to the URL shown in the browser's address bar.
How do you tell whether the data is dynamically loaded?
- Do a local search (within the response of the address-bar request) in the packet-capture tool:
  - found: the data is not dynamically loaded
  - not found: the data is dynamically loaded
How do you locate where the dynamically loaded data comes from?
- Do a global search in the packet-capture tool.
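The same local-search check can also be scripted: request the address-bar URL directly and look for a piece of text that is visible on the rendered page. A minimal sketch (the URL and the sample string are placeholders to replace with the target site's values):
import requests
url = 'https://movie.douban.com/chart'  # the URL shown in the address bar (placeholder)
sample = 'a piece of text visible on the rendered page'  # placeholder
page_text = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
if sample in page_text:
    print('found in the raw HTML -> not dynamically loaded')
else:
    print('not found in the raw HTML -> likely loaded by a separate (ajax) request')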
4: Scrape KFC restaurant location data
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
data = {
    'cname': '',
    'pid': '',
    'keyword': '北京',
    'pageIndex': '1',
    'pageSize': '10'
}
response = requests.post(url=url, data=data, headers=headers)
address_dic = response.json()
for address in address_dic['Table1']:
    print(address['cityName'], address['addressDetail'])
5: Requirement: extract the full HTML of the bid-winning announcements from https://www.fjggfw.gov.cn/Website/JYXXNew.aspx (Fujian Provincial Public Resource Trading Center)
Approach:
- Confirm that the target data is dynamically loaded.
- Capture the ajax request on the home page and extract its URL and request parameters from that packet.
- Send a request to the extracted URL and get the response data (json).
- Extract the id of each announcement from the json string.
- Combine each id with the announcement-detail URL, send the requests, and capture the bid-winning data of every announcement.
post_url = 'https://www.fjggfw.gov.cn/Website/AjaxHandler/BuilderHandler.ashx'
# a cookie (copied from the browser) is needed here
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'Cookie': '_qddac=4-3-1.4euvh3.dolhcp.k1hv0g18; ASP.NET_SessionId=zlodpss0z5marc42xbwf3k1z; Hm_lvt_94bfa5b89a33cebfead2f88d38657023=1570540077; __root_domain_v=.fjggfw.gov.cn; _qddaz=QD.xezyl8.p60887.k1hv0fxc; _qdda=4-1.4euvh3; _qddab=4-dolhcp.k1hv0g18; _qddamta_2852155767=4-0; Hm_lpvt_94bfa5b89a33cebfead2f88d38657023=1570540248; _qddagsx_02095bad0b=01ea1a6c5d3a64853ca5827a992c7e8755a27bdf6483c4170cf2f7408bf5160b8be84faf079220f53eb77ffddb8e7a31bb676d8e2335aa55f11f4fd4ea8e3ae123c0a5f18a8ab6b832b0d1b4888af4bdd0787e3a2fbda9234cb86cd2b05adf3e56d7e29aafcb05c7edc7e73de6cb346d19449446dc77234a6fb176cd0c0e4df4'
}
for n in range(1, 6):
    data = {
        'OPtype': 'GetListNew',
        'pageNo': n,
        'pageSize': '10',
        'proArea': '-1',
        'category': 'GCJS',
        'announcementType': '-1',
        'ProType': '-1',
        'xmlx': '-1',
        'projectName': '',
        'TopTime': '2019-07-10 00:00:00',
        'EndTime': '2019-10-08 23:59:59',
        'rrr': '0.5270491290780797'
    }
    post_data = requests.post(url=post_url, data=data, headers=headers).json()
    for i in post_data['data']:
        mid = int(i['M_ID'])
        url = f'https://www.fjggfw.gov.cn/Website/AjaxHandler/BuilderHandler.ashx?OPtype=GetGGInfoPC&ID={mid}&GGTYPE=5&url=AjaxHandler%2FBuilderHandler.ashx'
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        data_dic = response.json()
        with open('作业.text', 'a', encoding='utf-8') as f1:
            f1.write(''.join(data_dic['data']))
            f1.write('\n----------------------------')
6: Scraping images
- with requests
- with urllib
- Difference: urllib's urlretrieve cannot spoof the UA (a workaround is sketched after the two examples below)
# Image scraping with the requests module
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
url = 'http://tva1.sinaimg.cn/mw600/007QUzsKgy1g7qzr59hk7j30cs0gxn82.jpg'
# .content returns the response data as bytes
img_data = requests.get(url, headers=headers).content
with open('ceshi.jpg', 'wb') as f1:
    f1.write(img_data)
# Image scraping with urllib
from urllib import request
url = 'http://tva1.sinaimg.cn/mw600/007QUzsKgy1g7qzr59hk7j30cs0gxn82.jpg'
request.urlretrieve(url, 'ceshi2.jpg')
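As noted above, urlretrieve takes no headers argument, so it cannot spoof the UA. If a site requires a UA and you want to stay inside urllib, one option is to build a Request object with headers and write the bytes yourself. A minimal sketch (the output file name ceshi3.jpg is just an example):
from urllib import request
url = 'http://tva1.sinaimg.cn/mw600/007QUzsKgy1g7qzr59hk7j30cs0gxn82.jpg'
req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# urlopen() accepts a Request object, so the custom headers are sent with it
with request.urlopen(req) as resp:
    with open('ceshi3.jpg', 'wb') as f1:
        f1.write(resp.read())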
7: Anti-crawl: image lazy loading
Requirement: scrape the images from http://sc.chinaz.com/tag_tupian/YaZhouMeiNv.html
import requests
import os
from lxml import etree
# Scrape and persist the images from the first 5 pages of http://sc.chinaz.com/tag_tupian/YaZhouMeiNv.html
file_path = './作业/'
if not os.path.exists(file_path):
    os.mkdir(file_path)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
url = 'http://sc.chinaz.com/tag_tupian/yazhoumeinv_%d.html'
for page in range(1, 6):
    if page == 1:
        new_url = 'http://sc.chinaz.com/tag_tupian/YaZhouMeiNv.html'
    else:
        new_url = url % page
    page_text = requests.get(new_url, headers=headers).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # re-encode the title to repair the garbled characters returned by this site
        title = div.xpath('./p/a/text()')[0].encode('iso-8859-1').decode('utf-8')
        img_path = file_path + '/' + title + '.jpg'
        # src2 is the pseudo attribute that holds the real image address (lazy loading)
        img_url = div.xpath('./div/a/img/@src2')[0]
        img_data = requests.get(img_url, headers=headers).content
        with open(img_path, 'wb') as f1:
            f1.write(img_data)
    print('Page {} done~~'.format(page))
Anti-crawl mechanism: image lazy loading
- the real image address is stored in a pseudo attribute
- only when the image scrolls into the browser's viewport does JS load it (by renaming the pseudo attribute to src)
Counter-strategy:
- scrape the address from the pseudo attribute instead of src (see the helper sketched below)
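The pseudo attribute name differs between sites (src2 on this one; names such as data-src or data-original are common elsewhere and are only assumptions here), so a small fallback helper keeps the extraction robust. A minimal sketch:
def get_img_url(img_element):
    # try the lazy-loading pseudo attributes first, then fall back to the real src
    for attr in ('src2', 'data-src', 'data-original', 'src'):
        value = img_element.xpath('./@' + attr)
        if value:
            return value[0]
    return None
# usage inside the loop above:
# img_url = get_img_url(div.xpath('./div/a/img')[0])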
8: Scraping Pear Video (the video address is loaded dynamically by JS)
import requests
import os
import re
from lxml import etree
# Scrape short videos from pearvideo.com
file_path = './作业3/'
if not os.path.exists(file_path):
    os.mkdir(file_path)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_59'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//li[@class="categoryem "]')
for li in li_list:
    title = li.xpath('./div/a/div[2]/text()')[0]
    movie_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    movie_text = requests.get(movie_url, headers=headers).text
    # the real video address only appears inside the page's JS code, so extract it with a regex
    src = re.findall('srcUrl="(.*?)",vdoUrl', movie_text, re.S)[0]
    movie_data = requests.get(src, headers=headers).content
    movie_path = file_path + title + '.' + src.split('.')[-1]
    with open(movie_path, 'wb') as f1:
        f1.write(movie_data)
    print('One video done~')
print('All done~')
Anti-crawl mechanism:
- the video address is assembled dynamically by JS
Counter-strategy:
- use a regular expression to pull the video address out of the JS code
9: Anti-crawl: IP detection (proxies)
If the error HttpConnectionPool(host:XX) Max retries exceeded with url appears:
- Causes:
  - 1. too many requests were sent to the server in a short period of time
    - fix: add Connection: 'close' to the headers
  - 2. the requesting IP has been banned by the server
    - fix: use a proxy
Anti-crawl mechanism:
- abnormal-IP detection
Counter-strategy:
- use proxies
Working with proxies
- Concept: a proxy server
What does a proxy do?
- it forwards (intercepts) requests and responses
How do proxies relate to crawlers?
- a proxy lets the crawler change the IP address its requests appear to come from
Proxy IP sites
- Xici (西刺)
- Kuaidaili (快代理)
- www.goubanjia.com
- 代理精灵: http://http.zhiliandaili.cn/
Proxy anonymity levels
- elite (high anonymity)
- anonymous
- transparent
Types (see the note on the proxies dict below)
- http
- https
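With requests, the type matters because the keys of the proxies dict are matched against the scheme of the target URL. A minimal sketch (1.2.3.4:8080 is a placeholder proxy address):
import requests
# the key of the proxies dict is matched against the scheme of the target URL
proxies = {
    'http': '1.2.3.4:8080',   # placeholder proxy used for http:// targets
    'https': '1.2.3.4:8080',  # placeholder proxy used for https:// targets
}
# requests.get('https://www.baidu.com/s?wd=ip', proxies=proxies)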
Example 1:
# Using a proxy
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'Connection': 'close'
}
url = 'https://www.baidu.com/s?ie=UTF-8&wd=ip'
page_text = requests.get(url, headers=headers, proxies={'https': '182.85.41.159:41729'}).text
with open('./代理测试.html', 'w', encoding='utf-8') as f1:
    f1.write(page_text)
Example 2:
# Build a free proxy pool
# by scraping xicidaili.com
url = 'https://www.xicidaili.com/nn/{}'
ip_list = []
for page in range(1, 51):
    new_url = url.format(page)
    page_text = requests.get(new_url, headers=headers).text
    tree = etree.HTML(page_text)
    # note: tbody must not appear in the xpath expression
    tr_list = tree.xpath('//table//tr')[1:]
    for tr in tr_list:
        dic = {}
        dic['ip_port'] = tr.xpath('./td[2]/text()')
        dic['agreement'] = tr.xpath('./td[6]/text()')
        ip_list.append(dic)
print(len(ip_list))
# after a few runs of this the IP gets banned by the server,
# so the counter-measure is to go through proxies (see Example 3)
Example 3:
# Build a paid proxy pool
import random
url = 'http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=2&fa=0&fetch_key=&groupid=0&qty=50&time=1&pro=&city=&port=1&format=html&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=15'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
ip_list = tree.xpath('//body//text()')
ips_pool = []
for ip in ip_list:
    ips_pool.append({'https': ip})
url = 'https://www.xicidaili.com/nn/{}'
ip_list = []
for page in range(1, 51):
    new_url = url.format(page)
    try:
        page_text = requests.get(new_url, headers=headers, proxies=random.choice(ips_pool)).text
        tree = etree.HTML(page_text)
        # note: tbody must not appear in the xpath expression
        tr_list = tree.xpath('//table//tr')[1:]
        for tr in tr_list:
            dic = {}
            dic['ip_port'] = tr.xpath('./td[2]/text()')
            dic['agreement'] = tr.xpath('./td[6]/text()')
            ip_list.append(dic)
    except Exception:
        pass
print(len(ip_list))
10: Anti-crawl: cookies
Requirement:
- scrape the news data from www.xueqiu.com
Handling cookies in a crawler
- manually: copy the cookie into headers
- automatically: use a session object
  - get a session object: requests.Session()
  - what it does:
    - a session object can send requests to a URL just like the requests module itself; the difference is that if a request made through the session produces a cookie, the cookie is automatically stored in the session object and carried on subsequent requests.
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=20352414&count=15&category=-1'
page_text = requests.get(url, headers=headers).json()
# the plain request fails with 'error_description': '遇到错误,请刷新页面或者重新登录帐号后再试'
# (roughly: "an error occurred, refresh the page or log in again")
# Fix: handle the cookie with a session object
session = requests.Session()
cookie_url = 'https://xueqiu.com'
# first send a request to https://xueqiu.com with the session object so the cookie it produces is recorded
session.get(cookie_url, headers=headers)
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=20352414&count=15&category=-1'
# this request succeeds only because it carries the corresponding cookie
page_text = session.get(url, headers=headers).json()
print(page_text)
11: Simulated login, captcha recognition and dynamic request parameters
Use an online captcha-recognition platform to recognize the captcha automatically
- Yundama (云打码)
- Chaojiying (超级鹰)
  - register and log in
  - create a "software" entry
  - download the sample code (in the developer docs)
Developer docs:
import requests
from hashlib import md5
class Chaojiying_Client(object):
    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }
    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()
    def ReportError(self, im_id):
        """
        im_id: ID of the incorrectly recognized captcha image
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
Getting the captcha text:
# recognize a captcha image through the platform and return the recognized text
def get_code(img_path, img_type):
    chaojiying = Chaojiying_Client('account', 'password', 'software id')
    im = open(img_path, 'rb').read()
    return chaojiying.PostPic(im, img_type)['pic_str']
Login code:
from lxml import etree
login_url = 'https://so.gushiwen.org/user/login.aspx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'Connection': 'close'
}
session = requests.Session()
login_text = session.get(login_url, headers=headers).text
tree = etree.HTML(login_text)
# captcha recognition: download the captcha image locally, then submit it to the recognition platform
# it turns out the cookie is produced when the captcha image is requested, so fetch it with the session object
code_img_url = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
code_img_data = session.get(code_img_url, headers=headers).content
with open('./验证码.jpg', 'wb') as f1:
    f1.write(code_img_data)
code = get_code('./验证码.jpg', 1004)
# extract the dynamically changing request parameters
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
data = {
    '__VIEWSTATE': __VIEWSTATE,
    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
    'from': 'http://so.gushiwen.org/user/collect.aspx',
    'email': 'account',
    'pwd': 'password',
    'code': code,
    'denglu': '登录'
}
page_text = session.post(login_url, data=data, headers=headers).text
with open('./测试.html', 'w', encoding='utf-8') as f1:
    f1.write(page_text)
How to capture dynamically changing request parameters
- In most cases the dynamically changing request parameters are hidden in the page source (typically as hidden form fields); a generic helper is sketched below.
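A generic way to pick these parameters up is to collect every hidden <input> on the login page and merge the result into the POST data. A minimal sketch (the helper name and the update call are illustrative, not from the original notes):
from lxml import etree
def get_hidden_fields(page_text):
    # collect the name/value pairs of all hidden <input> elements in the page source
    tree = etree.HTML(page_text)
    fields = {}
    for inp in tree.xpath('//input[@type="hidden"]'):
        name = inp.get('name')
        if name:
            fields[name] = inp.get('value') or ''
    return fields
# usage with the login code above:
# data.update(get_hidden_fields(login_text))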
Anti-crawl mechanisms covered so far and the corresponding counter-measures:
- robots.txt: ignore it
- UA detection: UA spoofing
- image lazy loading: read the image address from the pseudo attribute
- IP detection: use proxies
- cookies: use a session object
- captchas: use an online captcha-recognition platform
- dynamically changing request parameters: parse them out of the page source
- dynamically loaded data: use a packet-capture tool and a global search to locate where the data is loaded from
12: Using a thread pool to improve scraping efficiency
Synchronous version:
import time
# synchronous version
def request(url):
    print('Requesting:', url)
    time.sleep(2)
    print('Done:', url)
urls = [
    'www.1.com',
    'www.2.com',
    'www.3.com'
]
start = time.time()
for url in urls:
    request(url)
print('Total time:', time.time() - start)
# about 6 seconds
Asynchronous version based on a thread pool:
import time
from multiprocessing.dummy import Pool  # thread pool
# asynchronous version based on a thread pool
pool = Pool(3)
def request(url):
    print('Requesting:', url)
    time.sleep(2)
    print('Done:', url)
urls = [
    'www.1.com',
    'www.2.com',
    'www.3.com'
]
start = time.time()
pool.map(request, urls)
print('Total time:', time.time() - start)
# about 2 seconds, since the three requests run concurrently
Crawler with a thread pool:
# crawler with a thread pool
import time
import requests
from multiprocessing.dummy import Pool  # thread pool
from lxml import etree
urls = [
    'http://127.0.0.1:5000/hxbs',
    'http://127.0.0.1:5000/index'
]
# send the request
def get_request(url):
    page_text = requests.get(url).text
    return page_text
# parse the data
def parse(page_text):
    tree = etree.HTML(page_text)
    print(tree.xpath('//div[1]//text()'))
pool = Pool(2)
start = time.time()
page_text_list = pool.map(get_request, urls)
print(len(page_text_list))
pool.map(parse, page_text_list)
print('Total time:', time.time() - start)