The love-hate relationship between CAPTCHAs and crawlers
Steps for getting a CAPTCHA recognized (here via the Chaojiying platform):
register an account
log in
call the API
from chaojiying import Chaojiying_Client  # import the client from the Chaojiying SDK file

# the software ID is generated under User Center >> Software ID
chaojiying = Chaojiying_Client('username', 'password', 'software ID')
# path to the local CAPTCHA image file
with open('image path', 'rb') as f:
    im = f.read()  # im holds all the bytes of the image
# 1902 is the CAPTCHA type code; see Official Site >> Pricing
print(chaojiying.PostPic(im, 1902)['pic_str'])  # the recognized CAPTCHA as a string
Coding workflow
Crawl information that is tied to a particular user's data (i.e., pages behind a login)
Clicking the login button fires a POST request
That POST request carries the login credentials entered beforehand
HTTP/HTTPS is stateless: when the second request for the personal homepage is sent, the server has no idea that it belongs to a logged-in session
cookie: lets the server record and identify the client's state
Manual: capture the cookie value with a packet-capture tool and put that value into headers, e.g.:
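A minimal sketch of the manual approach; the URL and the cookie string below are placeholders for values copied from the capture tool:

import requests

headers = {
    "user-agent": "Mozilla/5.0",
    # paste the cookie value captured with the packet tool here (placeholder)
    "cookie": "sessionid=...; other=...",
}
resp = requests.get("https://example.com/profile", headers=headers)  # hypothetical URL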
Automatic:
where the cookie comes from: the server sets it in the response to the login POST
session object: a requests.Session stores cookies from responses and sends them on later requests automatically (a minimal sketch follows)
session = requests.Session()
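A minimal Session sketch; the login endpoint and form field names are hypothetical placeholders:

import requests

session = requests.Session()
# the login POST stores any Set-Cookie from the response inside the session
session.post("https://example.com/login",  # hypothetical endpoint
             data={"username": "...", "password": "..."})
# later requests through the same session automatically carry those cookies
resp = session.get("https://example.com/profile")  # hypothetical logged-in page
print(resp.status_code)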
Log in with selenium, then harvest the cookies:
from fake_useragent import UserAgent as ua  # random User-Agent generator
import requests

def get_cookies(url, name, pwd):
    from selenium.webdriver import Chrome, ChromeOptions
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    option = ChromeOptions()
    option.add_argument("--headless")
    option.add_argument('--disable-gpu')
    option.add_argument(f'user-agent={ua().random}')
    web = Chrome(options=option)
    web.get(url=url)
    web.find_element(By.XPATH, '//*[@id="username"]').send_keys(name)
    web.find_element(By.ID, "password").send_keys(pwd, Keys.ENTER)
    cookies = web.get_cookies()  # list of cookie dicts from the logged-in browser
    web.quit()
    # convert selenium's cookie list into the {name: value} dict requests expects
    cookie = {}
    for i in cookies:
        cookie[i["name"]] = i["value"]
    return cookie

def get_page_source(url, name, password):
    resp = requests.get(url=url, cookies=get_cookies(url, name, password), headers={
        "user-agent": ua().random,
    })
    return resp
What is a proxy: a relay server that forwards requests on the client's behalf
The role of proxies: hide the real client IP and sidestep IP-based rate limits and bans
Building a proxy IP pool (a pool sketch follows the test function below)
Types of proxy IPs: http and https, matching the scheme of the target URL; the function below checks whether a proxy actually takes effect:
from requests import get
from fake_useragent import UserAgent as ua
from pyquery import PyQuery

def test_ip(ip):
    # Baidu echoes the requesting IP when you search for "ip"
    url = "https://www.baidu.com/s?wd=ip"
    resp = get(url=url, headers={
        "user-agent": ua().random,
    }, proxies={
        "https": ip,
        "http": ip,
    }, timeout=10)
    # PyQuery takes CSS selectors, so use .c-gap-right rather than XPath attribute syntax
    doc = PyQuery(resp.text)('tr > td > span.c-gap-right').text().split(":")[-1].strip()
    print(doc)
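A minimal proxy-pool sketch, assuming you already have a list of candidate proxy addresses (the IPs below are placeholders):

import random
from requests import get
from fake_useragent import UserAgent as ua

# placeholder candidates; in practice they come from a provider or a crawl of proxy sites
proxy_pool = ["http://1.2.3.4:8888", "http://5.6.7.8:9999"]

def get_with_proxy(url, retries=3):
    for _ in range(retries):
        if not proxy_pool:
            break
        ip = random.choice(proxy_pool)  # rotate: pick a random proxy per attempt
        try:
            return get(url, headers={"user-agent": ua().random},
                       proxies={"http": ip, "https": ip}, timeout=10)
        except Exception:
            proxy_pool.remove(ip)  # evict the dead proxy and retry with another
    raise RuntimeError("no working proxy")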
Anonymity levels of proxy IPs: transparent, anonymous, and high-anonymity (elite)
Goal: use asynchronous execution in the crawler for high-performance data fetching.
Synchronous crawler:
from time import time
from os import makedirs
from requests import get

urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]

def get_content(url):
    print("Fetching:", url)
    resp = get(url=url).content
    name = url.split("/")[-1]
    if resp:
        with open(f"./img/{name}", "wb") as f:
            f.write(resp)
        print("Download finished")
    else:
        print("Request failed")

makedirs("./img", exist_ok=True)  # make sure the target directory exists
sta = time()
for i in urls:
    get_content(i)
print(f"All downloads finished in {time() - sta} seconds")
Asynchronous crawler:
from time import time
from requests import get
import asyncio

urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]

async def get_content(url):
    print("Start downloading:", url)
    name = url.split("/")[-1]
    loop = asyncio.get_running_loop()
    # run the blocking requests.get in a thread pool so it does not block the event loop
    fut = loop.run_in_executor(None, get, url)
    resp = await fut
    if resp:
        with open(f"./img/{name}", "wb") as f:  # the ./img directory must exist
            f.write(resp.content)
        print("Download finished")
    else:
        print("Request failed")

async def main():
    # wrap the coroutines into tasks and run them concurrently
    await asyncio.gather(*(get_content(i) for i in urls))

sta = time()
asyncio.run(main())
print(f"All downloads finished in {time() - sta} seconds")
Multithreading / multiprocessing:
from time import time
from requests import get
from threading import Thread

class MyThread(Thread):
    def __init__(self, target, args=(), kwargs=None):
        super(MyThread, self).__init__()
        self.daemon = True
        self.target = target
        self.args = args
        self.kwargs = kwargs or {}  # avoid a mutable default argument
        self.start()  # start the thread as soon as it is created

    def run(self):
        self.target(*self.args, **self.kwargs)

urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]

def get_content(url):
    print("Fetching:", url)
    resp = get(url=url).content
    name = url.split("/")[-1]
    if resp:
        with open(f"./img/{name}", "wb") as f:  # the ./img directory must exist
            f.write(resp)
        print("Download finished")
    else:
        print("Request failed")

sta = time()
lis = []
for i in urls:
    mt = MyThread(get_content, args=(i,))
    lis.append(mt)
for i in lis:
    i.join()
print(f"All downloads finished in {time() - sta} seconds")
Thread pool / process pool (a process-pool sketch follows the thread-pool example):
from time import time
from requests import get
from concurrent.futures import ThreadPoolExecutor

urls = [
    "http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/c0b455b1f25dec71d995550b2e9f898e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/0223/8e3674af90cba4a3fcfcfce30ab9e5b3.jpg",
]

def get_content(url):
    print("Fetching:", url)
    resp = get(url=url).content
    name = url.split("/")[-1]
    if resp:
        with open(f"./img/{name}", "wb") as f:  # the ./img directory must exist
            f.write(resp)
        print("Download finished")
    else:
        print("Request failed")

sta = time()
with ThreadPoolExecutor(len(urls)) as t:  # one worker thread per URL
    t.map(get_content, urls)
print(f"All downloads finished in {time() - sta} seconds")
Single thread + coroutines (recommended)
import asyncio

async def request(url):
    print("Requesting:", url)
    print("Request succeeded")
    return url

# calling a function defined with async returns a coroutine object
c = request("www.baidu.com")

# # create an event loop object
# loop = asyncio.get_event_loop()
# # task usage: create a task from the coroutine via the loop
# task = loop.create_task(c)
# print(task)
# # register the coroutine with the loop and run it to completion
# loop.run_until_complete(task)
# print(task)

# loop = asyncio.get_event_loop()
# # ensure_future usage: does not need to go through the loop object
# task = asyncio.ensure_future(c)
# print(task)
# loop.run_until_complete(task)
# print(task)

# binding a callback
def callback(task):
    print(task.result())

loop = asyncio.get_event_loop()
task = asyncio.ensure_future(c)
# attach the callback to the task; it fires when the task finishes
task.add_done_callback(callback)
loop.run_until_complete(task)
import asyncio

# if synchronous, blocking code appears inside a coroutine, the asynchronous
# behavior is lost; hence asyncio.sleep rather than time.sleep
async def request(url):
    print("Downloading:", url)
    await asyncio.sleep(3)
    print("Download finished:", url)
    return url

# task list: modern asyncio.wait expects tasks, so wrap the coroutines first
async def main():
    tasks = [asyncio.ensure_future(request(i)) for i in range(10)]
    return await asyncio.wait(tasks)

done, pending = asyncio.run(main())
print(done)
# ------------------------------
# alternatively, with the lower-level loop API (legacy, pre-Python 3.10 style):
tasks = []
for i in range(10):
    task = asyncio.ensure_future(request(i))
    tasks.append(task)
loop = asyncio.get_event_loop()
done, pending = loop.run_until_complete(asyncio.wait(tasks))
print(done)
aiohttp: a module for making asynchronous network requests
Note: before reading the response data you must suspend manually with await
text(): returns the response body as a string
read(): returns the response body as bytes
json(): returns the response body parsed as a JSON object
requests is synchronous; to keep the crawl asynchronous, the URL requests themselves must be sent with an async network module such as aiohttp
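A minimal aiohttp sketch of the points above, reusing one of the image URLs; the ./img directory is assumed to exist:

import asyncio
import aiohttp

async def download(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            data = await resp.read()  # bytes; resp.text() / resp.json() for str / JSON
    name = url.split("/")[-1]
    with open(f"./img/{name}", "wb") as f:
        f.write(data)

asyncio.run(download("http://kr.shanghai-jiuxin.com/file/mm/20211129/qgenlhwzyvs.jpg"))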
Basic usage of selenium
Question: what is the relationship between the selenium module and crawlers? It can conveniently fetch data that a page loads dynamically, and makes simulated login straightforward.
What is the selenium module? A browser-automation module that drives a real browser from code.
Usage flow:
environment setup: pip install selenium
download the driver executable for your browser
instantiate a browser object
write the browser-automation code (a minimal sketch follows)
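A minimal sketch of the flow above, assuming chromedriver is on the PATH and that Baidu's search box still has id kw:

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

web = Chrome()  # instantiate the browser object (driver must be on PATH)
web.get("https://www.baidu.com")
# locate the search box by id, type a keyword, then read the rendered source
web.find_element(By.ID, "kw").send_keys("selenium")
print(web.page_source[:200])
web.quit()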
If the target element sits inside an iframe, you must switch into the iframe before locating it:
# first locate the iframe via XPath
xf = driver.find_element(By.XPATH, '//*[@id="x-URS-iframe"]')
# then pass the located element to switch_to.frame() (the old switch_to_frame() is deprecated)
driver.switch_to.frame(xf)
driver.switch_to.default_content()  # switch back out of the iframe
from selenium.webdriver.common.action_chains import ActionChains

# instantiate the action chain (web is the browser; div is the slider element located beforehand)
act = ActionChains(web)
# click and hold the slider
act.click_and_hold(div)
for i in range(5):
    act.move_by_offset(17, 0).perform()  # perform() executes the queued actions immediately
# release the mouse button; release() also needs perform() to take effect
act.release().perform()
web.quit()