反爬机制:验证码。需要识别验证码图片中的数据,用于模拟登录操作。
人工肉眼识别(不推荐)
第三方自动识别(推荐)
本文通过OCR技术对验证码图片数据进行识别(推荐)
需求:识别古诗文网登录页面中的验证码
验证码识别封装在 VerificationCode.py 文件里,具体代码如下:
import os  # path handling for the cleaned-image file name
import re  # regular expressions
import time  # pausing between steps

import pytesseract  # image-to-text (OCR)
from PIL import Image  # opening and processing images
class VerificationCode:
    """Recognize the text of a captcha image with OCR.

    Pipeline: convert the image to grayscale and binarize it, clear isolated
    black pixels, save the cleaned image next to the original, then run
    ``pytesseract`` on the cleaned image.
    """

    def __init__(self, img_path):
        """Store the path of the captcha image.

        :param img_path: path of the captcha image file
        """
        self.img_path = img_path

    @staticmethod
    def _denoised_path(img_path):
        """Return the path the cleaned image is saved to: ``<root>_new.jpg``.

        Only the real file extension is stripped, so paths without a
        ``.jpg`` suffix (or with ``.jpg`` inside a directory name) still
        yield a sensible file name.
        """
        root, _ext = os.path.splitext(img_path)
        return root + '_new.jpg'

    @staticmethod
    def _clean_text(raw_text):
        """Drop unwanted characters from OCR output and keep the first four.

        Only CJK characters (U+4E00-U+9FA5), ASCII digits and ASCII letters
        are kept.
        """
        cleaned = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", raw_text)
        return cleaned[0:4]

    def processing_image(self):
        """Load the captcha, convert it to grayscale and binarize it.

        Pixels darker than the threshold become black (0); all other pixels
        become white (255).

        :return: the binarized image in mode ``"L"``
        """
        threshold = 160
        # The context manager releases the file handle; convert() returns a
        # separate image, so it stays usable after the source is closed.
        with Image.open(self.img_path) as image_obj:
            gray = image_obj.convert("L")
        return gray.point(lambda value: 0 if value < threshold else 255)

    def delete_spot(self):
        """Clear isolated black pixels and save the cleaned image.

        An interior black pixel is turned white when none of its four direct
        neighbours (up, left, down, right) is black. Pixels on the image
        border are never modified.

        :return: path of the saved, cleaned image
        """
        images = self.processing_image()
        data = images.getdata()
        w, h = images.size
        # NOTE(review): neighbour values are read through the getdata()
        # result while pixels of the same image are being cleared; confirm
        # whether earlier clears are meant to influence later checks.
        for x in range(1, w - 1):
            for y in range(1, h - 1):
                if data[w * y + x] >= 50:
                    continue  # centre pixel is not black
                neighbours = (
                    data[w * (y - 1) + x],  # top
                    data[w * y + (x - 1)],  # left
                    data[w * (y + 1) + x],  # down
                    data[w * y + (x + 1)],  # right
                )
                if not any(value < 10 for value in neighbours):
                    images.putpixel((x, y), 255)
        # Displays the cleaned image for visual inspection.
        images.show()
        new_img_path = self._denoised_path(self.img_path)
        images.save(new_img_path)
        return new_img_path

    def image_str(self):
        """Run the full pipeline and return the recognized captcha text.

        :return: at most four recognized characters
        """
        new_img_path = self.delete_spot()
        # Close the cleaned image's file handle as soon as OCR is done.
        with Image.open(new_img_path) as image:
            result = pytesseract.image_to_string(image)
        return self._clean_text(result)
if __name__ == '__main__':
    # Demo run: recognize the sample captcha and print the result.
    recognizer = VerificationCode('./result/code.jpg')
    print("识别结果为:", recognizer.image_str())
主程序在 古诗文网验证码识别.py 文件里,具体代码如下:
import requests
from lxml import etree
from VerificationCode import VerificationCode
def getCodeText(imgPath):
    """Recognize the text of a captcha image.

    Thin wrapper that builds a ``VerificationCode`` for the image and
    returns the result of its ``image_str()`` call.

    :param imgPath: path of the captcha image
    :return: the recognized captcha text
    """
    return VerificationCode(imgPath).image_str()
if __name__ == '__main__':
    import os  # local import: only this script entry point needs it

    # UA spoofing: the request headers are kept in a dict.
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    img_path = './result/code.jpg'

    # One session carries cookies from the login page over to the captcha
    # request. NOTE(review): presumably the site ties the captcha to the
    # session cookie; confirm before building the actual login on top.
    with requests.Session() as session:
        page_response = session.get(url=url, headers=headers, timeout=10)
        page_response.raise_for_status()
        page_text = page_response.text

        # Extract the src attribute of the captcha <img> element.
        tree = etree.HTML(page_text)
        src_list = tree.xpath('//*[@id="imgCode"]/@src')
        if not src_list:
            raise RuntimeError("未在登录页面中找到验证码图片")
        code_img_src = 'https://so.gushiwen.cn' + src_list[0]

        img_response = session.get(url=code_img_src, headers=headers, timeout=10)
        img_response.raise_for_status()
        img_data = img_response.content  # raw bytes of the captcha image

    # Save the captcha locally, creating the target directory if needed.
    os.makedirs(os.path.dirname(img_path), exist_ok=True)
    with open(img_path, 'wb') as fp:
        fp.write(img_data)

    # Run OCR on the downloaded captcha.
    code_text = getCodeText(img_path)
    print("识别结果为:", code_text)
如果本文对你有帮助,记得“点赞”哦~