- Development environment: Python 3.8
- Libraries: PIL, matplotlib.pyplot, sklearn, OpenCV, etc.
- IDE: PyCharm
1. Find the URL that serves the CAPTCHA
(From the information shown in the screenshot, we can determine the URL to which the CAPTCHA image request has to be sent.)
2. Scrape the images
import requests

url = "http://117.21.221.123/gnyxy/other/CheckCode.aspx"  # the URL the CAPTCHA request is sent to
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1;Win64;x64;rv:81.0) Gecko/20100101 Firefox/81.0",
    "Cookie": "ASP.NET_SessionId=mzmcjd45dyhbow55al3w4xmm; rootPath=http://117.21.221.123/gnyxy",
    "Referer": "http://117.21.221.123/gnyxy/"
}  # request headers
response = requests.get(url=url, headers=header).content  # raw bytes of the response
with open(path, 'wb') as fp:  # path: where the image should be saved
    fp.write(response)
This way CAPTCHAs can be collected in batches (a loop sketch follows).
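The snippet above saves a single image. To actually collect CAPTCHAs in batches you can call it in a loop; here is a minimal sketch, where the save directory D:/idcodes_raw and the sample count of 1000 are my assumptions, not values from the original setup:
import os
import time
import requests

save_dir = "D:/idcodes_raw"  # assumed directory for the raw CAPTCHA downloads
os.makedirs(save_dir, exist_ok=True)
for num in range(1000):  # assumed number of samples to collect
    resp = requests.get(url=url, headers=header)  # url and header as defined above
    path = os.path.join(save_dir, str(num) + ".jpg")
    with open(path, 'wb') as fp:
        fp.write(resp.content)  # one CAPTCHA per file
    time.sleep(0.2)  # small pause so the server is not hammered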
The CAPTCHAs scraped this way contain many interference lines and dots, and their colors are fairly complex, so the images need to be preprocessed.
(example image)
1. Grayscale conversion
Implementation:
from PIL import Image
import cv2

image = Image.open(path)  # path: the CAPTCHA saved in the previous step
lim = image.convert('L')  # convert to grayscale
pixdata = lim.load()  # pixel access object for reading and writing individual pixel values
or
im = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # OpenCV version; here image is an array read with cv2.imread
2. Binarization
Implementation:
w, h = lim.size  # lim is the grayscaled image object
threshold = 134  # binarization threshold, chosen empirically
# walk over every pixel: below the threshold becomes black, everything else white
for y in range(h):
    for x in range(w):
        if pixdata[x, y] < threshold:
            pixdata[x, y] = 0    # black
        else:
            pixdata[x, y] = 255  # white
(or, more concisely)
lim = lim.point(lambda x: 255 if x > 134 else 0)
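If choosing a threshold such as 134 by hand is inconvenient, an alternative (not the method used above, just an option) is to let OpenCV's Otsu method pick the threshold automatically. A rough sketch, assuming the CAPTCHA was read with cv2.imread:
import cv2

img = cv2.imread(path)  # path: the CAPTCHA saved earlier
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Otsu derives the threshold automatically from the image histogram
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)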
Because only this simple grayscale and threshold processing has been applied, the image still contains many interference dots and lines, which leads to results like the image below.
3. Denoising
Principle:
There are already well-established algorithms for this kind of problem, such as flood fill, which comes in 4-neighborhood and 8-neighborhood variants. Here I use the 8-neighborhood idea: a pixel is kept only if enough of its eight neighbors share its color; otherwise it is treated as noise.
Implementation:
for i in range(0, Z):  # Z: number of denoising passes
    pixdata[0, 0] = 255  # set the top-left corner to white
    pixdata[image.size[0] - 1, image.size[1] - 1] = 255  # set the bottom-right corner to white
    for x in range(1, image.size[0] - 1):
        for y in range(1, image.size[1] - 1):
            nearDots = 0
            L = pixdata[x, y]  # color of the current pixel (0 or 255)
            # 8-neighborhood check: count neighbors with the same color
            if L == pixdata[x - 1, y - 1]:
                nearDots += 1
            if L == pixdata[x - 1, y]:
                nearDots += 1
            if L == pixdata[x - 1, y + 1]:
                nearDots += 1
            if L == pixdata[x, y - 1]:
                nearDots += 1
            if L == pixdata[x, y + 1]:
                nearDots += 1
            if L == pixdata[x + 1, y - 1]:
                nearDots += 1
            if L == pixdata[x + 1, y]:
                nearDots += 1
            if L == pixdata[x + 1, y + 1]:
                nearDots += 1
            if nearDots < N:  # fewer than N of the eight neighbors share this color
                pixdata[x, y] = 255  # treat the pixel as noise and turn it white
The result looks like the image below.
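For reference, the three preprocessing steps could be chained into one helper. This is only a sketch of how I would combine them; the parameter values threshold=134, N=4 and Z=2 are assumptions, not necessarily the exact values used above:
from PIL import Image

def preprocess(path, threshold=134, N=4, Z=2):
    """Grayscale, binarize and denoise one CAPTCHA image (sketch)."""
    image = Image.open(path).convert('L')                       # grayscale
    image = image.point(lambda x: 255 if x > threshold else 0)  # binarize
    pixdata = image.load()
    w, h = image.size
    for _ in range(Z):                                          # denoising passes
        for x in range(1, w - 1):
            for y in range(1, h - 1):
                same = sum(
                    pixdata[x + dx, y + dy] == pixdata[x, y]
                    for dx in (-1, 0, 1) for dy in (-1, 0, 1)
                    if not (dx == 0 and dy == 0)
                )
                if same < N:            # isolated pixel, most likely noise
                    pixdata[x, y] = 255
    return image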
(The image is now clean, but to simplify what the classifier has to deal with, we still need to split it into individual characters.)
Principle: load the image again and read each pixel's value (black or white). Starting from a black pixel, look at its neighbors above, below, left and right for more black pixels; for each one found, repeat the search from it, and so on, until no more black pixels are reachable. The extent of that connected region marks the boundary of one character.
Implementation:
from queue import Queue
import os
import cv2
import matplotlib.pyplot as plt

def get_crop_imgs(im, num, path, pixdata):
    zoneWB = CFS(im, pixdata)  # X-axis range of each character
    cutting_img(im, num, path, zoneWB)  # crop and save each character
def CFS(im, pixdata):
    '''Find the horizontal extent of each character.'''
    xmax = 0  # X coordinate where the previous block ended (initial value)
    zoneWB = []  # list of [start, end] X ranges, one per block
    for i in range(4):  # the CAPTCHA contains four characters
        try:
            x_fd, y_fd = detectFgPix(im, xmax, pixdata)
            xmax, xmin = cfs(x_fd, y_fd, pixdata)
            zoneWB.append([xmin, xmax])
        except TypeError:
            # detectFgPix returned None: no more black pixels to the right
            return zoneWB
    return zoneWB
def detectFgPix(im, xmax, pixdata):
    '''Search for the starting pixel of the next block.'''
    h = im.size[1]
    w = im.size[0]
    for x_fd in range(xmax + 1, w):
        for y_fd in range(h):
            if pixdata[x_fd, y_fd] == 0:  # first black pixel to the right of the previous block
                return x_fd, y_fd
def cfs(x_fd, y_fd, pixdata):
    '''Flood the connected region using a queue and a visited set
    instead of plain recursion, to avoid going too deep.'''
    xaxis = []
    visited = set()
    q = Queue()
    q.put((x_fd, y_fd))
    visited.add((x_fd, y_fd))
    offsets = [(1, 0), (0, 1), (-1, 0), (0, -1)]  # 4-neighborhood
    while not q.empty():
        x, y = q.get()
        for xoffset, yoffset in offsets:
            x_neighbor = x + xoffset
            y_neighbor = y + yoffset
            if (x_neighbor, y_neighbor) in visited:
                continue  # already visited
            visited.add((x_neighbor, y_neighbor))
            try:
                if pixdata[x_neighbor, y_neighbor] == 0:  # black neighbor belongs to the same character
                    xaxis.append(x_neighbor)
                    q.put((x_neighbor, y_neighbor))
            except IndexError:
                pass  # neighbor lies outside the image
    if len(xaxis) == 0:
        xmax = x_fd + 1
        xmin = x_fd
    else:
        xmax = max(xaxis)
        xmin = min(xaxis)
    return xmax, xmin
def cutting_img(img, num, path, zoneWB):
    img_arr = plt.imread(path)  # read the full CAPTCHA as an array
    for i in range(4):
        image_child_name = str(num) + '_' + str(i) + '.jpg'
        image_path = 'D://idcodes/' + image_child_name
        x_min_postion = zoneWB[i][0] - 1
        x_max_postion = zoneWB[i][1] + 1
        cropped = img_arr[0:22, x_min_postion:x_max_postion]  # slice out one character: full height, its X range
        cv2.imwrite(image_path, cropped)  # save the character crop to the target directory
    if os.path.exists(path):
        os.remove(path)  # the full CAPTCHA is no longer needed
    else:
        print('File not found')
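A sketch of how the splitting functions above might be called for one preprocessed CAPTCHA; the file path and the running index num are placeholders:
from PIL import Image

num = 0  # assumed running index used in the output file names
path = 'D:/idcodes_raw/0.jpg'  # assumed path of one downloaded CAPTCHA
im = Image.open(path).convert('L')
im = im.point(lambda x: 255 if x > 134 else 0)  # same binarization as above
pixdata = im.load()
get_crop_imgs(im, num, path, pixdata)  # writes four character crops into D://idcodes/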
If images like the ones below appear after splitting, the segmentation worked.
Here I use the KNN algorithm (a classification algorithm) to recognize and classify each CAPTCHA character.
Pitfall: the segmentation above produces images of different sizes, so the arrays returned by cv2.imread have different shapes, and samples with different shapes cannot be put into the same training set. Does that mean KNN is unusable here?
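To make the problem concrete (the shapes below are made up for illustration), two crops of different widths give arrays of different shapes, so their flattened vectors have different lengths and cannot be stacked into one feature matrix:
import cv2

a = cv2.imread('D://idcodes/0_0.jpg')  # e.g. shape (22, 11, 3)
b = cv2.imread('D://idcodes/0_1.jpg')  # e.g. shape (22, 14, 3)
print(a.shape, b.shape)  # different shapes, so a.reshape(-1) and b.reshape(-1) differ in length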
Solution:
1. Resize all images to a uniform size
from PIL import Image
import os

img_ls = ["2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "U", "W", "X", "Y", "Z"]
for name in img_ls:
    path = os.path.join(img_path, name)  # img_path: root directory with one sub-folder per character
    img_names = os.listdir(path)
    for i in range(len(img_names)):
        path1 = os.path.join(path, img_names[i])
        img = Image.open(path1)
        out = img.resize((13, 22), Image.ANTIALIAS)  # resize to a uniform 13x22 with high quality
        os.remove(path1)
        out.save(path1, "png")  # overwrite the original file with the resized image
2. Collect the feature values and target values
import os
import cv2

data_sum = []
labels = []
img_path = "D:/img_src"  # root directory of the resized character images
img_ls = os.listdir(img_path)
for name in img_ls:
    path = os.path.join(img_path, name)
    img_names = os.listdir(path)
    for i in range(len(img_names)):
        path1 = os.path.join(path, img_names[i])
        image = cv2.imread(path1)
        image = image.reshape(-1)  # flatten the pixels into a one-dimensional feature vector
        data_sum.append(image)  # feature values
        labels.append(name)  # target value: the folder name, i.e. the character itself
3. Turn the features and targets into arrays and split them into training and test sets:
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

y = LabelBinarizer().fit_transform(labels)  # one-hot encode the character labels
x = np.array(data_sum, dtype=object)
y = np.array(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
4. Train the model:
from sklearn import neighbors

# train the KNN classifier
model = neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)
5. Test and save the model
import joblib  # assumption: the standalone joblib package is used for persistence
from sklearn.metrics import classification_report

dirs = 'D:/testModel'
if not os.path.exists(dirs):
    os.makedirs(dirs)
joblib.dump(model, dirs + '/LR.pkl')  # persist the trained model
# print the evaluation results
pre_y_train = model.predict(x_train)
pre_y_test = model.predict(x_test)
class_name = ['class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'classa', 'classb', 'classc', 'classd', 'classe', 'classf', 'classg', 'classh', 'classj', 'classk', 'classl', 'classm', 'classn', 'classp', 'classq', 'classr', 'classs', 'classt', 'classu', 'classw', 'classx', 'classy', 'classz']
print(classification_report(y_train, pre_y_train, target_names=class_name))
print(classification_report(y_test, pre_y_test, target_names=class_name))
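Once saved, the model can be reloaded to recognize a newly segmented character. A minimal sketch, assuming the crop was resized to 13x22 like the training data and that labels is the list built in step 2; the file paths are placeholders:
import cv2
import joblib
from sklearn.preprocessing import LabelBinarizer

model = joblib.load('D:/testModel/LR.pkl')  # the KNN model saved above
lb = LabelBinarizer().fit(labels)  # labels: the list built in step 2
img = cv2.imread('D://idcodes/0_0.jpg')  # assumed path of one segmented, resized character
pred = model.predict(img.reshape(1, -1))  # same flattening as during training
print(lb.inverse_transform(pred)[0])  # map the one-hot prediction back to a character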
6. Screenshot of the results
Possible improvement: grid search and cross-validation can be used to raise the accuracy, as sketched below.
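A sketch of what that could look like with sklearn's GridSearchCV; the parameter grid is only an example, not tuned values:
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': [3, 5, 7, 9]}  # example grid of neighbor counts
search = GridSearchCV(neighbors.KNeighborsClassifier(), param_grid, cv=5)  # 5-fold cross-validation
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)
model = search.best_estimator_  # continue with the best model found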
With these steps you end up with a reasonably accurate model.