今天主要讲一下如何处理账号密码类的登录问题。我这边采取的解决办法是 selenium + chrome 方案,内容包括:滑块验证码的破解原理、点选式验证码的破解,以及登录成功后把浏览器中的 cookie 转换成 requests 可用的 cookies(这样做主要是为了提升爬取的效率)。
下面直接上代码:
import random
from time import sleep

from fake_useragent import UserAgent
from selenium import webdriver

# Login script for passport.lagou.com: start Chrome, type the account and
# password into the login form and submit it.
# NOTE: `username` and `password` are not defined in this snippet; the
# surrounding code must define them before it runs.

# All form controls live under this form element.
LOGIN_FORM_XPATH = "/html/body/section/div[2]/div[1]/div[2]/form"

options = webdriver.ChromeOptions()
# Use the Chinese locale.
options.add_argument('lang=zh_CN.UTF-8')
# Send a randomly chosen User-Agent header.
ua = UserAgent()
options.add_argument('user-agent="%s"' % ua.random)
# options.add_argument('--proxy-server=http://'+get_proxy())
# options.add_argument("--headless")
# Raw string: "\c" inside a normal string literal is an invalid escape sequence.
driver = webdriver.Chrome(r"D:\chromedriver.exe", chrome_options=options)
driver.maximize_window()
driver.get("https://passport.lagou.com/login/login.html")
sleep(4)

# Random pauses between the steps so the input does not arrive all at once.
driver.find_element_by_xpath(LOGIN_FORM_XPATH + "/div[1]/input").send_keys(username)
sleep(random.randint(1, 2))
driver.find_element_by_xpath(LOGIN_FORM_XPATH + "/div[2]/input").send_keys(password)
sleep(random.randint(1, 2))
driver.find_element_by_xpath(LOGIN_FORM_XPATH + "/div[5]/input").click()
sleep(random.randint(3, 6))
接下来可能会出现验证码,这里分三种情况来讲述:
1. 没有出现任何验证码,那就直接登录并跳转了。
2. 出现滑块验证码,需要进行识别。
这里先贴代码(这里使用春秋(ch.com)的滑块作为讲解,主要是拉勾的验证码变化无常,原理是一样的):
使用的验证码识别平台是超级鹰。
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from chaojiying import Chaojiying_Client
#coding=utf-8
import sys
import importlib
importlib.reload(sys)
from PIL import Image,ImageDraw,ImageFont
import requests
from get_track import get_track
import random
from selenium.common.exceptions import NoSuchElementException

# Slider-captcha script for the ch.com registration page: fill in the e-mail
# registration form, then repeatedly screenshot the captcha, send it to the
# Chaojiying recognition service and drag the slider to the returned x
# coordinate until the captcha canvas is no longer on the page.

REGISTER_FORM_XPATH = "/html/body/div[3]/div[1]/div/div[3]"
CANVAS_XPATH = "//canvas[@class='geetest_canvas_slice geetest_absolute']"
SLIDER_XPATH = "/html/body/div[6]/div[2]/div[6]/div/div[1]/div[2]/div[2]"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")
# Hand the options to the driver; otherwise "--start-maximized" is never applied.
browser = webdriver.Chrome("D:/chromedriver.exe", chrome_options=chrome_options)
browser.get('https://account.ch.com/NonRegistrations-Regist')
sleep(2)
browser.find_element_by_xpath(REGISTER_FORM_XPATH + "/div[1]/div[2]").click()
sleep(1)
browser.find_element_by_xpath(REGISTER_FORM_XPATH + "/div[3]/div[1]/input").send_keys("[email protected]")
browser.find_element_by_xpath(REGISTER_FORM_XPATH + '/div[3]/div[2]/input').send_keys("h123456")
sleep(3)
browser.find_element_by_xpath(REGISTER_FORM_XPATH + '/div[3]/div[3]/input').send_keys("h123456")
sleep(3)
browser.find_element_by_xpath('//*[@id="emailRegist"]').click()
sleep(6)

while True:
    # A missing canvas is the exit condition of the loop. (A found element is
    # always truthy, so testing `not canvas` could never end the loop.)
    try:
        canvas = browser.find_element_by_xpath(CANVAS_XPATH)
    except NoSuchElementException:
        break

    try:
        # Screenshot the whole page and crop it down to the captcha canvas.
        browser.save_screenshot(r'photo.png')
        left = canvas.location['x']
        print("大背景图的左边的坐标", left)
        top = canvas.location['y']
        right = left + canvas.size['width']
        bottom = top + canvas.size['height']
        with Image.open(r'photo.png') as screenshot:
            captcha = screenshot.crop((left, top, right, bottom)).convert("RGBA")

        # Draw the instruction text onto the image before uploading it
        # (the text asks for a click on the top-left corner of the gap).
        font = ImageFont.truetype('simsun.ttc', 16)
        ImageDraw.Draw(captcha).text((0, 0), u'请点击凹槽左上角', (255, 0, 0), font=font)
        captcha.save("photo2.png")
        sleep(5)

        # Recognition client of the captcha platform; contact their support
        # for details on the arguments.
        chaojiying = Chaojiying_Client('995368208qqcom', 'hs456', '897551')
        with open('photo2.png', 'rb') as image_file:
            image_bytes = image_file.read()
        result = chaojiying.PostPic(image_bytes, 9101)
        pic_str = result["pic_str"]
        pic_id = result["pic_id"]
        print(pic_id)
        print(pic_str)
        if not pic_str:
            # Nothing recognised: take a fresh screenshot and try again.
            continue

        # pic_str is "x,y"; only the x coordinate matters for a horizontal drag.
        gap_x = int(pic_str.split(",")[0])
        slider = browser.find_element_by_xpath(SLIDER_XPATH)
        x_left = slider.size['width']
        print("滑块的左边的坐标", x_left)
        print(type(x_left))
        # 33 is an offset found by trial; it differs from site to site.
        distance = gap_x - 33
        track = get_track(distance)

        ActionChains(browser).click_and_hold(slider).perform()
        for step in track:
            ActionChains(browser).move_by_offset(xoffset=step, yoffset=0).perform()
        # NOTE(review): the original snippet lost its indentation; this pause
        # is assumed to follow the whole drag rather than every single step.
        sleep(0.5)
        ActionChains(browser).release().perform()
        sleep(2)
        sleep(random.randint(1, 3))
    except Exception as exc:
        # Stop retrying on any failure, but report it instead of hiding it.
        print("slider captcha attempt failed:", exc)
        break
这里讲一下主要的方法:超级鹰的验证码识别需要上传整张验证码图片,如下图:
首先用 selenium 对验证码元素进行截图(如何截图可自行百度,很简单),把这张图片发送给验证码平台,然后再根据平台返回的坐标进行滑动。
原点是图中 1 处的左上角拐角,平台识别后返回的是 2 处左上角拐角的坐标。然后利用 selenium 中的 ActionChains(browser).click_and_hold(slider).perform() 按住滑块,再通过 ActionChains(browser).move_by_offset(xoffset=t, yoffset=0).perform() 进行移动,这里的 t 就是每一步的位移。这里有个技巧:总位移需要做减法得到。讲一下我自己的经验:用超级鹰返回的坐标 (x, y) 减去滑块的坐标。滑块的坐标可以参照 selenium 中量取元素坐标的方法,得到元素左上角的坐标以及宽高的像素大小。如果不确定具体应该减去哪个值,可以自己逐个尝试:滑块左边的 x 坐标、滑块中间点的 x 坐标、滑块右边的 x 坐标(拉勾用的是中间点的坐标)。得到移动的位移之后,最后还需要模拟人手拖动的轨迹,这里我采用的是比较简单的先加速、后减速的方案,如果不行可以参照其他方案,基本原理都是一样的!
def get_track(distance):
    '''
    Build a drag track that accelerates first and then decelerates.

    The movement is simulated in time slices of 0.2: acceleration 1.5 over
    the first 4/5 of the distance, acceleration -3 afterwards. Each entry is
    the difference between consecutive rounded positions, and the final
    position is clamped to ``distance``, so the entries add up to exactly
    ``round(distance)`` with no rounding drift and no overshoot.

    :param distance: (Int) distance between the slider and the gap
    :return: (List) integer step sizes; empty when ``distance`` <= 0
    '''
    # Steps handed out so far.
    trace = []
    # Distance covered while still accelerating.
    faster_distance = distance * (4 / 5)
    # Current position, current speed, duration of one time slice.
    start, v0, t = 0, 0, 0.2
    # Whole pixels already emitted into the trace.
    emitted = 0
    while start < distance:
        # Speed up during the first part, brake during the rest.
        a = 1.5 if start < faster_distance else -3
        # Displacement within this slice: s = v0*t + a*t^2/2.
        move = v0 * t + 1 / 2 * a * t * t
        # Speed at the end of the slice becomes the next initial speed.
        v0 = v0 + a * t
        # Never move past the target.
        start = min(start + move, distance)
        # Round the cumulative position rather than each step, so rounding
        # errors do not accumulate.
        step = round(start) - emitted
        emitted += step
        trace.append(step)
    return trace
以上就是滑块的破解原理和思路,如果有疑问可以加我微信:13270870157,我们继续讨论。
3. 最后一种验证码是点选验证码,这里同样采用打码平台进行识别。
具体的识别代码:
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from chaojiying import Chaojiying_Client
import sys
import importlib
importlib.reload(sys)
from PIL import Image,ImageDraw,ImageFont
from fake_useragent import UserAgent
import random
from urllib.request import urlretrieve
from lxml import etree
import pyautogui as pag
def get_proxy(path="get_ip", skip=99):
    '''
    Pick one proxy address at random from a text file.

    The file is read as UTF-8 with one entry per line. Trailing whitespace is
    stripped, blank lines are ignored and duplicates are removed while the
    file order is kept, so ``skip`` drops the first entries of the file
    instead of a subset that changes from run to run.

    :param path: (Str) file holding the proxy list
    :param skip: (Int) number of leading unique entries left out of the draw
    :return: (Str) one proxy entry
    :raises IndexError: when no entry is left after skipping
    '''
    with open(path, 'r', encoding="utf-8") as f:
        stripped = (line.rstrip() for line in f)
        # dict.fromkeys de-duplicates and preserves insertion order.
        proxies = list(dict.fromkeys(entry for entry in stripped if entry))
    candidates = proxies[skip:]
    if not candidates:
        raise IndexError(
            "no proxy left in %r after skipping %d of %d entries"
            % (path, skip, len(proxies))
        )
    return random.choice(candidates)
# Click-captcha script for the lagou.com login page: log in, download the
# captcha image, send it to the Chaojiying recognition service and click
# every returned point on the image.

LOGIN_FORM_XPATH = "/html/body/section/div[2]/div[1]/div[2]/form"
CAPTCHA_IMG_XPATH = "//img[@class='geetest_item_img']"

options = webdriver.ChromeOptions()
# Use the Chinese locale.
options.add_argument('lang=zh_CN.UTF-8')
# Random User-Agent source; only used by the commented-out option below.
ua = UserAgent()
# options.add_argument('user-agent="%s"' % ua.random)
# options.add_argument('--proxy-server=http://'+get_proxy())
# options.add_argument("--headless")
# Raw string: "\c" inside a normal string literal is an invalid escape sequence.
driver = webdriver.Chrome(r"D:\chromedriver.exe", chrome_options=options)
driver.maximize_window()
driver.get("https://passport.lagou.com/login/login.html")
sleep(2)
driver.find_element_by_xpath(LOGIN_FORM_XPATH + "/div[1]/input").send_keys('13270870157')
sleep(random.randint(1, 2))
driver.find_element_by_xpath(LOGIN_FORM_XPATH + "/div[2]/input").send_keys("h456")
sleep(random.randint(1, 2))
driver.find_element_by_xpath(LOGIN_FORM_XPATH + "/div[5]/input").click()
sleep(random.randint(3, 6))

html = etree.HTML(driver.page_source)
img_url = "".join(html.xpath(CAPTCHA_IMG_XPATH + "/@src"))
print(img_url)

# An empty URL means the page shows no click captcha, so there is nothing to solve.
if img_url:
    urlretrieve(img_url, 'img.png')
    chaojiying = Chaojiying_Client('995368208qqcom', 'h23456', '897551')
    with open('img.png', 'rb') as image_file:
        image_bytes = image_file.read()
    result = chaojiying.PostPic(image_bytes, 9004)
    # pic_str is parsed as "x1,y1|x2,y2|..."; an empty answer yields no points.
    locations = [point for point in result["pic_str"].split("|") if point]
    print(locations)

    # The returned coordinates are measured from the top-left corner of the image.
    logo = driver.find_element_by_xpath(CAPTCHA_IMG_XPATH)
    for point in locations:
        offset_x, offset_y = (int(value) for value in point.split(","))
        print(offset_x, offset_y)
        # Click relative to the image element inside the page. pyautogui
        # clicks in screen coordinates, which do not match element positions
        # reported by the browser.
        # NOTE(review): the offsets are taken from the element's top-left
        # corner in the Selenium 3 API this script uses; newer Selenium
        # releases measure from the element centre - confirm the installed
        # version. Also assumes the image is displayed at its native size.
        ActionChains(driver).move_to_element_with_offset(logo, offset_x, offset_y).click().perform()
        sleep(1)
主要的原理也是把验证码图片(用 selenium 截图,或者像上面那样直接下载)发给验证码平台,由平台返回坐标,只不过这里需要做个判定:识别出来的字有几个。
返回坐标的原点就是你上传的图片左上角的那个拐角点,接下来需要用 selenium 来进行点选,因为上面代码里的 pag.click() 已经不能使用,
这里改用:
ActionChains方法列表
move_by_offset(xoffset, yoffset) ——鼠标从当前位置移动到某个坐标
move_to_element(to_element) ——鼠标移动到某个元素
move_to_element_with_offset(to_element, xoffset, yoffset) ——移动到距某个元素(左上角坐标)多少距离的位置
ActionChains(browser).move_to_element_with_offset(to_element, xoffset, yoffset) .perform()
这里的 to_element 就是我们所选取的那张图片所在的 div,后面的参数就是平台返回的坐标,直接传入就可以了,然后就可以实现点选了。
综上所述,到此结束。有问题可以加微信:13270870157