sliderCode.py
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains

# Helper 1: screenshot the captcha area
def shot_img(driver):
    sleep(2)
    # Screenshot the whole browser window
    driver.save_screenshot("./page.png")
    # Load the screenshot
    img = Image.open("./page.png")
    # Cut the captcha area out of the full screenshot
    loc = driver.find_element_by_class_name("geetest_slicebg").location  # position of the captcha area
    size = driver.find_element_by_class_name("geetest_slicebg").size  # size of the captcha area
    print(loc, size)
    # Compute the crop box of the captcha area
    top = loc["y"]
    left = loc["x"]
    right = loc["x"] + size["width"]
    bottom = loc["y"] + size["height"]
    # Crop the captcha image out; the factor of 2 compensates for a device
    # pixel ratio of 2 (screenshots are in physical pixels, element
    # coordinates in CSS pixels)
    code_img = img.crop((left * 2, top * 2, right * 2, bottom * 2))
    # code_img.show()
    return code_img
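# The hard-coded factor of 2 above only works on a display whose device
# pixel ratio is exactly 2. A more portable variant (a sketch, not part of
# the original tutorial) would ask the browser for the real ratio and use
# it both in the crop box above and in get_distance below:
def get_pixel_ratio(driver):
    # Hypothetical helper: query the browser instead of assuming a ratio of 2
    return driver.execute_script("return window.devicePixelRatio;")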
# Helper 2: find the distance to the gap
def get_distance(img1, img2):
    # Compare the screenshot with the gap against the one without it, pixel
    # by pixel; the first pixel that differs marks the left edge of the gap,
    # and its x coordinate is the distance from the start point to the gap
    pix1 = img1.load()
    pix2 = img2.load()
    for i in range(50, img1.size[0]):
        for j in range(img1.size[1]):
            # Read the RGB values
            rgb1 = pix1[i, j]
            rgb2 = pix2[i, j]
            # Per-channel difference between the two images
            r = abs(rgb1[0] - rgb2[0])
            g = abs(rgb1[1] - rgb2[1])
            b = abs(rgb1[2] - rgb2[2])
            # If all three channels differ by more than 60, treat this as the gap
            if r > 60 and g > 60 and b > 60:
                # Divide by 2 to convert back to CSS pixels; subtract 6 to
                # allow for the slice's left-hand margin
                return i / 2 - 6
# Helper 3: generate a track for moving the slider
def get_tracks(distance):
    # Deliberately overshoot the target by 20 pixels, then crawl back
    distance += 20
    v = 0
    t = 0.2
    # Offsets of the forward moves (how far each step slides)
    forwards = []
    # Current position
    current = 0
    # Switch-over point: accelerate for the first 3/5, then decelerate
    mid = distance * 3 / 5
    while current < distance:
        if current < mid:
            a = 2
        else:
            a = -3
        # Kinematics: s = v*t + a*t^2/2 and v' = v + a*t
        s = v * t + 0.5 * a * (t ** 2)
        v = a * t + v
        current += s
        forwards.append(round(s))
    return {"forwards": forwards, "backs": [-3, -3, -2, -2, -3, -2, -2, -1, -1, -1]}
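# Note how the two halves of the track fit together: the fixed pull-back
# offsets sum to exactly -20, cancelling the 20-pixel overshoot, so the
# slider settles near the true gap position. For example:
#   tracks = get_tracks(100)
#   sum(tracks["backs"])  # -20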
# Crack the slider captcha
def crack_code(driver):
    # 1. Work out the slide distance
    # 1) Screenshot with the gap visible
    img1 = shot_img(driver)
    # 2) Hide the gap: one JavaScript statement swaps in the full background
    js = "document.querySelector('.geetest_canvas_slice').style.display='block';document.querySelector('.geetest_canvas_slice').style.zIndex=10;document.querySelector('.geetest_canvas_fullbg').style.display='block';"
    driver.execute_script(js)
    # 3) Screenshot without the gap
    img2 = shot_img(driver)
    # 4) Both screenshots taken; restore the captcha to its original state
    js = "document.querySelector('.geetest_canvas_slice').style.display='block';document.querySelector('.geetest_canvas_slice').style.zIndex=10;document.querySelector('.geetest_canvas_fullbg').style.display='none';"
    driver.execute_script(js)
    # 5) Compute the distance from the start point to the gap from the two screenshots
    distance = get_distance(img1, img2)
    print(distance)
    # 2. Drag the slider with a human-like motion
    # ActionChains simulates mouse actions
    btn = driver.find_element_by_class_name("geetest_slider_button")
    # Press and hold the button
    ActionChains(driver).click_and_hold(btn).perform()
    # Drag along the generated track: forward first...
    tracks = get_tracks(distance)
    for track in tracks["forwards"]:
        ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()
    sleep(0.5)
    # ...then crawl back over the overshoot
    for track in tracks["backs"]:
        ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()
    sleep(0.5)
    # Release the mouse
    ActionChains(driver).release().perform()
# Log in to cnblogs
def login_blogs(name, password):
    # Login page URL
    login_page = "https://account.cnblogs.com/signin"
    driver = webdriver.Chrome(executable_path=r"C:\Users\fanjianbo\Desktop\chromedriver_win32\chromedriver.exe")
    try:
        driver.get(login_page)
        sleep(1)
        # Fill the username and password into the form
        driver.find_element_by_id("LoginName").send_keys(name)
        driver.find_element_by_id("Password").send_keys(password)
        # Click the login button
        driver.find_element_by_class_name("ladda-label").click()
        # Clicking login pops up the slider captcha; crack it
        crack_code(driver)
        sleep(5)
    finally:
        driver.close()

if __name__ == '__main__':
    login_blogs("qwer", "qwer")
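A caveat if you run this code today: the find_element_by_* helpers and the executable_path argument used throughout this post belong to Selenium 3 and were removed in Selenium 4. Under a current Selenium the equivalent calls would look like this (a sketch; the driver path is illustrative):

from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(r"C:\path\to\chromedriver.exe"))
btn = driver.find_element(By.CLASS_NAME, "geetest_slider_button")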
login.py
import sliderCode
from lxml import etree
from time import sleep
from selenium import webdriver

# Log in to Weibo
def login_weibo(url, name, password):
    driver = webdriver.Chrome(executable_path=r"C:\Users\fanjianbo\Desktop\chromedriver_win32\chromedriver.exe")
    driver.get(url)
    sleep(1)
    driver.find_element_by_id("loginName").send_keys(name)
    driver.find_element_by_id("loginPassword").send_keys(password)
    driver.find_element_by_id("loginAction").click()
    sleep(5)
    try:
        driver.find_element_by_class_name("geetest_radar_tip").click()
        sleep(0.5)
        # Check whether a slider captcha came up
        tree = etree.HTML(driver.page_source)
        slices = tree.xpath("//canvas[starts-with(@class,'geetest_canvas_slice')]")
        if len(slices) == 0:
            # Not a slider captcha: leave 10 seconds to solve it by hand
            sleep(10)
        else:
            # It is a slider captcha: crack it
            sliderCode.crack_code(driver)
    except Exception as e:
        print(e)
        print("Already logged in; no further verification needed!")
    # Grab the session cookies from the page
    cookies = driver.get_cookies()
    driver.quit()
    # print(cookies)
    # Assemble the cookies into a single header string
    cookie_list = []
    for cookie in cookies:
        cookie_list.append(str(cookie["name"]) + "=" + str(cookie["value"]))
    return ";".join(cookie_list)
if __name__ == '__main__':
    # Login page URL
    login_page_url = "https://passport.weibo.cn/signin/login?entry=mweibo&r=https%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt="
    # Log in
    cookies = login_weibo(login_page_url, "18610593606", "a1234567890")
    # print(cookies)
    with open("cookies.txt", "w") as fp:
        fp.write(cookies)
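cookies.txt now holds a single line of name=value pairs joined with semicolons, which is exactly the format of an HTTP Cookie header, so the spider below can send it back verbatim. With placeholder values it looks like:

name1=value1;name2=value2;name3=value3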
weiboSpider.py
import requests
from lxml import etree
from time import sleep
import re

# 1. Fetching
def fetch_pages(url, cookies):
    # Request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        "cookie": cookies,
    }
    # Request the first page
    first_page = requests.get(url=url, headers=headers)
    print(first_page.text)
    # Find the link to the next page inside first_page.
    # NOTE: the string literal below was evidently stripped when the post was
    # published (it originally held a prefix to skip); as written, start is 0
    # and the [start:] slices are no-ops
    start = len('')
    print(first_page.text[start:])
    first_page_tree = etree.HTML(first_page.text[start:])
    sleep(1)
    yield first_page.text[start:]
    next_url = "https://weibo.cn" + first_page_tree.xpath("//div[@id='pagelist']//a[1]/@href")[0]
    next_page = requests.get(url=next_url, headers=headers)
    next_tree = etree.HTML(next_page.text[start:])
    yield next_page.text[start:]
    # Follow the "next page" link six more times
    for i in range(6):
        next_url = "https://weibo.cn" + next_tree.xpath("//div[@id='pagelist']//a[1]/@href")[0]
        next_page = requests.get(url=next_url, headers=headers)
        next_tree = etree.HTML(next_page.text[start:])
        sleep(1)
        # print(next_page)
        yield next_page.text[start:]
# 2. Parsing
def analysis_pages(page_list):
    for page in page_list:
        # print(page)
        page_tree = etree.HTML(page)
        # All weibo posts on the page
        weibo_list = page_tree.xpath("//div[@class='c' and @id]")
        # Walk the posts and parse them as one of four kinds:
        #   original without image (YN): author, content, likes, reposts, comments
        #   original with image (YP):    the above plus the image URL
        #   repost without image (ZN):   the above plus the repost reason
        #   repost with image (ZP):      the above plus image URL and repost reason
        for weibo in weibo_list:
            item = {}
            # The number of child divs tells the kinds apart
            div_list = weibo.xpath("./div")
            num = len(div_list)
            if num == 1:
                # Original without image
                item["flag"] = "YN"
                item["author"] = weibo.xpath(".//a[@class='nk']/text()")[0]
                item["content"] = "".join(weibo.xpath(".//span[@class='ctt']//text()"))
                item["dianzan"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//a/text()")[-4])[0]  # likes
                item["zhuanfa"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//a/text()")[-3])[0]  # reposts
                item["pinglun"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//a/text()")[-2])[0]  # comments
            elif num == 2:
                # Two possible kinds here
                item["author"] = weibo.xpath(".//a[@class='nk']/text()")[0]
                item["content"] = "".join(weibo.xpath(".//span[@class='ctt']//text()"))
                item["dianzan"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[2]/a/text()")[-4])[0]
                item["zhuanfa"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[2]/a/text()")[-3])[0]
                item["pinglun"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[2]/a/text()")[-2])[0]
                # Look for an image
                src = weibo.xpath(".//img[@class='ib']/@src")
                if len(src) == 0:
                    # Repost without image
                    item["flag"] = "ZN"
                    item["liyou"] = weibo.xpath(".//div[2]//text()")[1]  # repost reason
                else:
                    # Original with image
                    item["flag"] = "YP"
                    item["pic"] = src[0]
            else:
                # Repost with image
                item["flag"] = "ZP"
                item["author"] = weibo.xpath(".//a[@class='nk']/text()")[0]
                item["content"] = "".join(weibo.xpath(".//span[@class='ctt']//text()"))
                item["dianzan"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[3]/a/text()")[-4])[0]
                item["zhuanfa"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[3]/a/text()")[-3])[0]
                item["pinglun"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[3]/a/text()")[-2])[0]
                item["liyou"] = weibo.xpath(".//div[3]//text()")[1]  # repost reason
                item["pic"] = weibo.xpath(".//img[@class='ib']/@src")[0]
            print(item)
if __name__ == '__main__':
    # Load the cached cookies from the local file
    with open("./cookies.txt", "r") as fp:
        cookies = fp.read()
    print(cookies)
    url = "https://weibo.cn/"
    # Fetch and parse
    page_list = fetch_pages(url=url, cookies=cookies)
    analysis_pages(page_list)
Of course, the parsed items can finally be written out to a CSV file as well.
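A minimal sketch of that last step, assuming analysis_pages is changed to collect the item dicts into a list and return it instead of printing them (save_csv is a hypothetical helper; the field list mirrors the keys used above):

import csv

def save_csv(items, path="weibo.csv"):
    # Union of all keys produced by analysis_pages; restval fills in the
    # fields a given kind of post does not have
    fields = ["flag", "author", "content", "dianzan", "zhuanfa", "pinglun", "liyou", "pic"]
    with open(path, "w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=fields, restval="")
        writer.writeheader()
        writer.writerows(items)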