使用模块: pytesseract
依赖: tesseract-ocr和语言包
在不经过额外训练的情况下,使用pytesseract就可以识别简单的验证码,比如常见的数字、字母验证码。
此类验证码只需使用Image.convert进行简单的灰度化处理即可识别(convert用于转换图片模式,'L'表示灰度图)。
举例:
# Load the captcha image from disk.
im = Image.open('image.png')
# Convert to mode 'L' (grayscale) so every pixel becomes a single intensity value.
imgry = im.convert('L')
上图中左图为imgry.show()的效果,右图为原图。获取图片每个像素点的值:
In [18]: print list(imgry.getdata())
[218, 231, 233, 207, 228, 228, 226, 183, 196, 215, 234, 225, 218, 204, 191, 223, 220, 232, 231, 229, 225, 226, 229, 222, 225, 224, 246, 212, 198, 214,
239, 230, 225, 236, 220, 230, 229, 232, 183, 222, 226, 237, 181, 219, 239, 217, 231, 224, 226, 225, 196, 221, 177, 229, 190, 217, 223, 191, 228, 220,
226, 193, 215, 225, 231, 218, 230, 227, 178, 207, 186, 176, 194, 218, 223, 189, 198, 216, 173, 226, 207, 238, 230, 229, 225, 225, 202, 235, 193, 227,
215, 217, 229, 201, 198, 188, 175, 174, 241, 189, 223, 177, 239, 194, 229, 230, 218, 224, 227, 220, 216, 194, 228, 194, 220, 200, 227, 208, 195, 220,
220, 222, 196, 231, 232, 175, 217, 228, 231, 202, 215, 235, 226, 198, 180, 190, 217, 229, 203, 228, 237, 221, 219, 184, 232, 229, 237, 218, 222, 186,
237, 231, 193, 244, 211, 236, 230, 227, 195, 182, 186, 196, 190, 205, 172, 239, 185, 231, 218, 235, 224, 220, 185, 189, 172, 197, 224, 241, 192, 220,
235, 218, 212, 182, 225, 237, 176, 198, 173, 188, 182, 238, 221, 181, 231, 225, 195, 182, 183, 186, 202, 176, 220, 242, 183, 220, 231, 238, 225, 183,
222, 211, 214, 180, 228, 220, 231, 229, 220, 228, 187, 231, 224, 212, 239, 229, 190, 219, 216, 222, 227, 227, 242, 213, 202, 180, 195, 230, 205, 192,
179, 226, 240, 216, 197, 206, 211, 220, 243, 220, 197, 219, 209, 195, 228, 227, 217, 221, 213, 225, 214, 232, 193, 176, 199, 185, 168, 238, 234, 176,
197, 234, 228, 197, 234, 215, 226, 231, 212, 240, 184, 185, 222, 176, 230, 214, 198, 227, 228, 243, 205, 238, 222, 210, 224, 195, 209, 208, 233, 197,
233, 170, 231, 224, 189, 239, 202, 192, 232, 220, 183, 234, 234, 191, 227, 219, 227, 202, 206, 229, 229, 194, 186, 202, 178, 211, 186, 181, 180, 186,
194, 179, 220, 221, 195, 223, 179, 233, 222, 216, 192, 238, 195, 240, 169, 202, 181, 186, 228, 176, 233, 214, 225, 241, 227, 210, 203, 161, 198, 182,
237, 209, 196, 241, 206, 184, 200, 191, 212, 238, 75, 91, 88, 63, 229, 227, 224, 244, 214, 180, 248, 226, 205, 51, 50, 51, 52, 181, 193, 202, 173, 23
8, 181, 242, 215, 194, 228, 169, 52, 41, 47, 203, 178, 233, 213, 201, 198, 185, 186, 73, 69, 75, 68, 210, 218, 243, 203, 183, 187, 191, 221, 242, 179,
220, 234, 188, 183, 228, 187, 87, 83, 75, 81, 83, 86, 222, 221, 233, 174, 234, 183, 44, 69, 52, 64, 34, 58, 63, 176, 183, 194, 185, 189, 184, 191, 22
7, 189, 198, 35, 35, 37, 229, 234, 181, 233, 225, 169, 241, 75, 52, 66, 67, 70, 59, 243, 210, 184, 223, 185, 191, 230, 224, 219, 203, 172, 192, 190, 2
31, 74, 90, 191, 211, 200, 224, 84, 87, 219, 221, 195, 237, 213, 46, 63, 216, 214, 205, 172, 60, 51, 185, 189, 188, 191, 186, 206, 199, 185, 37, 43, 3
4, 36, 189, 213, 196, 200, 206, 196, 69, 56, 243, 185, 172, 233, 72, 63, 225, 183, 224, 239, 195, 224, 235, 227, 208, 198, 222, 189, 187, 88, 77, 196,
190, 185, 226, 77, 62, 201, 191, 178, 181, 65, 55, 168, 234, 239, 199, 199, 226, 53, 42, 233, 175, 181, 204, 199, 183, 33, 33, 188, 32, 24, 201, 181,
197, 172, 215, 224, 56, 73, 175, 197, 192, 221, 63, 51, 237, 201, 224, 212, 197, 222, 225, 223, 228, 223, 213, 190, 219, 77, 75, 174, 211, 201, 162,
90, 96, 178, 229, 231, 219, 49, 48, 204, 183, 229, 233, 200, 185, 48, 64, 190, 230, 212, 222, 214, 181, 50, 37, 230, 44, 44, 192, 199, 179, 229, 165,
76, 65, 183, 189, 189, 186, 186, 225, 79, 61, 223, 233, 196, 224, 226, 224, 221, 232, 230, 201, 189, 234, 196, 91, 91, 68, 227, 84, 76, 227, 226, 189,
214, 237, 48, 42, 218, 189, 187, 186, 217, 186, 54, 53, 216, 190, 204, 226, 230, 42, 24, 245, 181, 30, 34, 183, 188, 200, 192, 239, 67, 57, 239, 165,
197, 196, 232, 195, 47, 63, 215, 231, 179, 225, 226, 239, 230, 220, 231, 220, 200, 182, 197, 178, 79, 68, 83, 95, 173, 189, 223, 189, 182, 199, 46, 5
7, 64, 224, 186, 199, 190, 44, 59, 58, 231, 223, 217, 181, 35, 33, 197, 176, 234, 28, 36, 200, 183, 212, 187, 182, 65, 67, 224, 240, 190, 163, 224, 24
1, 62, 69, 201, 231, 180, 206, 223, 229, 226, 219, 232, 227, 226, 192, 84, 75, 80, 217, 62, 78, 99, 80, 235, 222, 205, 219, 193, 36, 51, 44, 56, 41, 6
0, 64, 40, 60, 186, 179, 180, 238, 48, 22, 194, 201, 174, 49, 44, 192, 227, 186, 233, 185, 71, 66, 210, 246, 179, 252, 177, 226, 56, 73, 188, 219, 187
, 195, 224, 216, 222, 225, 230, 222, 234, 183, 76, 76, 194, 168, 203, 183, 77, 77, 215, 221, 188, 211, 238, 238, 53, 55, 48, 65, 40, 224, 73, 18, 195,
210, 236, 16, 49, 26, 37, 32, 41, 37, 44, 8, 53, 227, 222, 235, 63, 61, 200, 178, 180, 234, 194, 182, 80, 49, 187, 200, 185, 180, 225, 228, 230, 214,
226, 236, 216, 85, 86, 191, 189, 196, 187, 182, 207, 73, 85, 188, 193, 188, 230, 205, 221, 187, 183, 232, 198, 175, 52, 59, 195, 175, 185, 47, 26, 49
, 40, 33, 37, 40, 38, 48, 28, 235, 226, 217, 77, 63, 221, 231, 187, 191, 220, 188, 58, 75, 226, 212, 201, 183, 221, 231, 221, 229, 231, 193, 218, 82,
75, 232, 186, 199, 180, 190, 195, 86, 79, 186, 185, 195, 198, 216, 225, 230, 181, 192, 181, 52, 47, 195, 184, 194, 180, 240, 220, 182, 200, 179, 229,
39, 28, 201, 219, 230, 233, 207, 241, 65, 65, 221, 198, 182, 225, 59, 72, 222, 223, 220, 198, 181, 233, 180, 190, 212, 223, 188, 219, 85, 92, 77, 181,
182, 204, 196, 67, 80, 87, 229, 232, 207, 191, 43, 186, 197, 194, 211, 38, 67, 48, 224, 194, 213, 185, 180, 186, 203, 203, 233, 193, 32, 54, 186, 225
, 219, 236, 194, 226, 61, 69, 219, 187, 232, 229, 49, 71, 225, 222, 231, 217, 194, 223, 226, 187, 199, 182, 194, 188, 184, 54, 109, 76, 90, 70, 72, 89
, 81, 220, 227, 215, 240, 227, 66, 44, 49, 37, 57, 53, 50, 222, 195, 226, 229, 224, 243, 193, 172, 236, 229, 208, 44, 30, 213, 207, 189, 184, 180, 198
, 212, 63, 82, 62, 49, 79, 69, 212, 231, 222, 221, 200, 180, 218, 240, 227, 180, 230, 222, 185, 190, 198, 79, 75, 79, 93, 82, 79, 187, 232, 221, 230,
220, 212, 219, 69, 39, 67, 39, 235, 205, 182, 185, 175, 236, 224, 227, 209, 198, 219, 188, 216, 47, 42, 190, 182, 195, 224, 226, 175, 210, 217, 59, 72
, 62, 60, 243, 212, 222, 231, 229, 209, 196, 218, 230, 222, 232, 188, 237, 178, 192, 216, 204, 187, 219, 190, 224, 181, 197, 195, 179, 194, 192, 233,
220, 218, 234, 218, 228, 223, 213, 245, 185, 205, 172, 187, 192, 185, 201, 227, 239, 194, 212, 236, 221, 233, 190, 222, 226, 237, 212, 211, 198, 237,
210, 173, 223, 189, 233, 217, 232, 235, 227, 230, 216, 242, 217, 208, 191, 182, 230, 187, 208, 193, 229, 190, 197, 190, 183, 221, 232, 227, 175, 201,
190, 231, 213, 223, 235, 227, 231, 214, 229, 217, 204, 232, 207, 198, 188, 176, 188, 195, 194, 221, 210, 244, 209, 188, 189, 196, 184, 204, 204, 209,
177, 231, 189, 232, 186, 203, 215, 219, 225, 227, 217, 209, 237, 240, 239, 169, 198, 235, 196, 189, 167, 224, 178, 241, 189, 187, 169, 198, 206, 204,
193, 188, 232, 218, 246, 216, 211, 235, 229, 214, 223, 181, 242, 223, 182, 228, 215, 233, 208, 190, 215, 209, 220, 193, 230, 182, 187, 253, 179, 201,
214, 199, 233, 167, 199, 212, 208, 200, 208, 227, 191, 230, 235, 209, 212, 206, 177, 182, 190, 185, 238, 186, 234, 187, 187, 204, 183, 189, 180, 196,
192, 189, 188, 194, 177, 222, 242, 221, 216, 239, 224, 190, 218, 226, 193, 187, 237, 206, 237, 228, 215, 230, 230, 187, 226, 221, 190, 170, 234, 185,
191, 187, 221, 190, 187, 202, 203, 195, 190, 227, 195, 221, 224, 231, 221, 173, 201, 190, 222, 232, 192, 223, 197, 187, 219, 221, 225, 199, 193, 185,
196, 186, 191, 230, 224, 210, 240, 227, 224, 226, 224, 225, 192, 215, 189, 197, 192, 223, 227, 219, 231, 201, 219, 197, 226, 211, 234, 194, 217, 221,
198, 184, 189, 194, 188, 192, 186, 173, 202, 224, 210, 202, 222, 228, 198, 182, 191, 189, 226, 228, 210, 208, 211, 183, 208, 229, 232, 216, 212, 196,
215, 208, 198, 194, 193, 188, 226, 221, 224, 228, 226, 220, 194, 233, 220, 221, 190, 195, 186, 201, 219, 208, 200, 205, 202, 200, 224, 184, 197, 199,
186, 181, 187, 189, 183, 197, 195, 190, 228]
然后根据像素点颜色的深浅过滤掉噪点,至于阈值只能不断尝试,
上图为flag分别为10、50、80、120、160和200时的图片
# Binarize the grayscale image and display it: pixels darker than `flag` map to 0,
# all others map to 1, and the result is requested as a mode '1' (bilevel) image.
# `flag` is the threshold being tuned by trial and error (e.g. 10, 50, 80, 120, 160, 200).
imgry.point(lambda x: 0 if x < flag else 1, '1').show()
导入pytesseract后即可识别图片,flag在110-175都可以识别出来
In [29]: pytesseract.image_to_string(imgry.point(lambda x: 0 if x < 140 else 1, '1'))
Out[29]: u'8940'
此类验证码需要去除干扰线,可以按照颜色过滤。可以观察到左图验证码的数字为红色,事实上我在登录界面尝试多次,数字都是红色,于是:
左图为原验证码,右图为按颜色过滤后的图片,代码如下:
# Keep only the red captcha digits: every pixel that is not "red enough"
# is painted white, which removes the differently coloured interference lines.
image = Image.open('image.jpg')
# np.array() copies the pixel data, so the array can be modified in place.
# Assumes a 3-channel RGB image, i.e. shape (height, width, 3) -- TODO confirm for other sources.
image_array = np.array(image)
red = image_array[..., 0]
green = image_array[..., 1]
# A pixel is treated as background when its red channel is weak (r < 100)
# or, failing that, when its green channel is strong (g > 125).
background = (red < 100) | (green > 125)
# Assigning a scalar through the boolean mask sets every channel of the
# selected pixels to 255, i.e. pure white.
image_array[background] = 255
ff = Image.fromarray(image_array)
至于r和g的范围,则是根据取色器得出的rgb调试出来的,然后识别:
In [21]: pytesseract.image_to_string(ff)
Out[21]: u'2480'
此类验证码没有颜色规律,去除噪点需要根据相邻点的值决定该点的颜色,
# Binarize the captcha: pixels brighter than the threshold become white (255),
# everything else becomes black (0).
# `content` holds the raw image bytes (e.g. an HTTP response body) and is
# expected to be defined by the surrounding code.
image = Image.open(BytesIO(content))
# Assumes a single-channel image so that each element is one pixel value -- TODO confirm.
image_array = np.array(image)
filter_value = 100
# Compute the mask once, before any pixel is overwritten, so both assignments
# are based on the original values.
bright = image_array > filter_value
image_array[bright] = 255
image_array[~bright] = 0
im = Image.fromarray(image_array)
发现有噪点,然后再将孤立的像素点设置为白色
# Remove isolated noise: a black pixel whose surrounding pixels are almost all
# white is repainted white. The outermost row/column of pixels is skipped so
# that every examined pixel has a full set of 8 neighbours.
w, h = im.size
# (dx, dy) offsets of the 8 neighbours, excluding the pixel itself.
neighbours = [(dx, dy)
              for dx in (-1, 0, 1)
              for dy in (-1, 0, 1)
              if (dx, dy) != (0, 0)]
for j in range(1, h - 1):
    for i in range(1, w - 1):
        if im.getpixel((i, j)) == 0:
            result = sum(im.getpixel((i + dx, j + dy)) for dx, dy in neighbours)
            # On a 0/255 binarized image, a sum above 6 * 255 means at least
            # 7 of the 8 neighbours are white, so this black pixel is noise.
            if result > 6 * 255:
                # The image is updated in place, so pixels cleared here are
                # already white when later neighbours are examined.
                im.putpixel((i, j), 255)
效果看似不错,然后识别:
In [42]: pytesseract.image_to_string(im)
Out[42]: u'5729'
至此,我所遇到的能识别出来的就这么多,当然有些验证码的识别率不高但是用于爬虫是够了的,一般尝试10次都是可以通过的。
如果比较难的就只能用Tesseract训练或者是用Tensorflow训练了。