我们首先需要获取足够的验证码样本。因为从简单做起,所以本次的验证码都是由数字组成且都比较规则;验证码地址:http://smart.gzeis.edu.cn:8081/Content/AuthCode.aspx

  1. # - * - 编码:utf-8 - * -
  2. 导入 请求
  3. 进口 时间
  4. #文件下载,主要下载训练集
  5. def download_pics (pic_name)
  6. url = 'http: //smart.gzeis.edu.cn:8081/Content/ AuthCode.aspx'
  7. res = requests.get(url,stream = True
  8. 开放( u'J:/数据分析学习/蟒/机器学习之验证码识别/pics/%s.jpg” %(pic_name), 把wb F:
  9. 对于 组块 res.iter_content(CHUNK_SIZE = 1024 ):
  10. 如果 大块:
  11. f.write(块)
  12. f.flush()
  13. f.close()
  14. if __name__ == '__main__':
  15. for i in xrange( 100):
  16. pic_name = int(time.time()* 1000000)
  17. download_pics(pic_name)



  1. # -*- coding: utf-8 -*-
  2. from PIL import Image,ImageEnhance
  3. from PIL import *
  4. import time
  5. # 图片切割
  6. def segment(im):
  7. s = 12
  8. w = 40
  9. h = 81
  10. t = 0
  11. im_new = []
  12. for i in range( 4):
  13. im1 = im.crop((s+w*i,t,s+w*(i+ 1),h))
  14. im_new.append(im1)
  15. return im_new
  16. # 图片预处理,二值化,图片增强
  17. def imgTransfer(f_name):
  18. im = Image.open(f_name)
  19. im = im.filter(ImageFilter.MedianFilter())
  20. #enhancer = ImageEnhance.Contrast(im)
  21. #im = enhancer.enhancer(1)
  22. im = im.convert( 'L')
  23. return im
  24. def cutPictures(img):
  25. im = imgTransfer(img)
  26. pics = segment(im)
  27. for pic in pics:
  28. pic.save( u'J:/数据分析学习/python/机器学习之验证码识别/test/%s.jpeg'%(int(time.time()* 1000000)), 'jpeg')
  29. # 读取某文件夹下的所有图片
  30. import os
  31. def getAllImages(folder):
  32. assert os.path.exists(folder)
  33. assert os.path.isdir(folder)
  34. imageList = os.listdir(folder)
  35. imageList = [os.path.abspath(item) for item in imageList if os.path.isfile(os.path.join(folder, item))]
  36. return imageList
  37. if __name__ == '__main__':
  38. files_name = getAllImages( u'J:/数据分析学习/python/机器学习之验证码识别/pics//')
  39. for i in files_name:
  40. #cutPictures()
  41. files = i.replace( '\\', '/')
  42. s = files.split( '/')
  43. name = ''
  44. for j in s[: -1]:
  45. name = name + j + '/'
  46. name = name + 'pics/' + s[ -1]
  47. cutPictures(name)


这里值得一提的是,首先你得知道 Windows 下文件夹的命名不可以出现特殊符号(\ / : * ? " < > |)等,而机器识别可能会把切割后的图片识别成这些特殊字符,所以得加上一个判断;然后机器的识别正确率大概是50%,所以后面还需自己人工分类,变成准确的分类。(有个奇怪的现象:1200张图片,竟然没有一个9)

  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Thu Mar 23 14:19:13 2017
  4. 对切割后的图片进行分类,及0-9
  5. @author: onlyyo
  6. """
  7. import sys
  8. sys.path.append( 'C:\Users\onlyyo\Desktop\pytesseract-0.1.6\src')
  9. sys.path.append( 'C:\Python27\Lib\site-packages\pytesser')
  10. from pytesser import *
  11. from pytesseract import *
  12. import pytesseract
  13. from PIL import Image
  14. import os
  15. import shutil
  16. #ocr图像识别
  17. def ocr(img):
  18. try:
  19. img = Image.open(img)
  20. rs = image_to_string(img)
  21. except:
  22. return 'none'
  23. return rs
  24. #使用ocr进行训练的预分类
  25. def category(originfile,dirs,filename):
  26. if not os.path.exists(dirs):
  27. os.makedirs(dirs)
  28. shutil.copyfile(originfile,dirs+filename)
  29. if __name__ == '__main__':
  30. dirs = u'J:/数据分析学习/python/机器学习之验证码识别/test/'
  31. # 将ocr识别的文件按照数组编号存放在相应的文件夹中
  32. for fr in os.listdir(dirs):
  33. f = dirs+fr
  34. if f.rfind( u'.DS_Store') == -1:
  35. rs = ocr(f)
  36. if '|' not in rs and '*' not in rs :
  37. if '?' not in rs and '<' not in rs and '>' not in rs:
  38. category(f, u'J:/数据分析学习/python/机器学习之验证码识别/category/%s/'%rs,fr)



  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Thu Mar 23 15:46:59 2017
  4. @author: onlyyo
  5. 批量将切割后并且已经分好类的图像,得到的图片进行二值化处理,变成像素值,然后保存在TXT文件下
  6. """
  7. from PIL import Image
  8. import numpy as np
  9. import os
  10. # 特征提取,获取图像二值化数学值
  11. def getBinaryPix(im):
  12. im = Image.open(im)
  13. img = np.array(im)
  14. rows,cols = img.shape
  15. for i in range(rows):
  16. for j in range(cols):
  17. if (img[i,j]<= 128):
  18. img[i,j] = 0
  19. else:
  20. img[i,j] = 1
  21. binpix = np.ravel(img)
  22. return binpix
  23. def getfiles(dirs):
  24. fs = []
  25. for fr in os.listdir(dirs):
  26. f = dirs + fr
  27. if f.rfind( u'.DS_Store') == -1:
  28. fs.append(f)
  29. return fs
  30. def writeFile(content):
  31. with open( u'J:/数据分析学习/python/机器学习之验证码识别/traindata/train_data.txt', 'a+') as f:
  32. f.write(content)
  33. f.write( '\n')
  34. f.close()
  35. if __name__ == '__main__':
  36. dirs = u'J:/数据分析学习/python/机器学习之验证码识别/category/%s/'
  37. for i in range( 9):
  38. for f in getfiles(dirs %(i)):
  39. pixs = getBinaryPix(f).tolist()
  40. pixs.append(i)
  41. pixs = [str(i) for i in pixs]
  42. content = ','.join(pixs)
  43. writeFile(content)


  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Thu Mar 23 16:18:39 2017
  4. @author: onlyyo
  5. 根据之前得到的验证码像素数组,以及他们的标签,用SVM对其进行模型训练
  6. """
  7. from sklearn.svm import SVC
  8. from sklearn import grid_search
  9. import numpy as np
  10. import numpy as np
  11. from sklearn import cross_validation as cs
  12. from sklearn.externals import joblib
  13. from picPreprocessing import loadPredict
  14. import warnings
  15. import time
  16. def load_data():
  17. dataset = np.loadtxt( u'J:/数据分析学习/python/机器学习之验证码识别/traindata/train_data.txt',delimiter= ',')
  18. return dataset
  19. # 交叉验证
  20. def cross_validation():
  21. dataset = load_data()
  22. row,col = dataset.shape
  23. X = dataset[:,:col -1]
  24. Y = dataset[:, -1]
  25. clf = SVC(kernel= 'rbf',C= 1000)
  26. clf.fit(X,Y)
  27. scores = cs.cross_val_score(clf,X,Y,cv= 5)
  28. print "Accuracy: %0.2f (+- %0.2f)" % (scores.mean(),scores.std())
  29. return clf
  30. t0 = time.time()
  31. cross_validation()
  32. #print "fit time:",round(time.time()-t0,3),"s"
  33. def searchBestParameter():
  34. parameters = { 'kernel':( 'linear', 'poly', 'rbf', 'sigmoid'), 'C':[ 1, 100]}
  35. dataset = load_data()
  36. row,col = dataset.shape
  37. X = dataset[:,:col -1]
  38. Y = dataset[:, -1]
  39. svr = SVC()
  40. clf = grid_search.GridSearchCV(svr,parameters)
  41. clf.fit(X,Y)
  42. print clf.best_params_
  43. #searchBestParameter()
  44. print "fit time:",round(time.time()-t0, 3), "s"



  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Thu Mar 23 17:20:09 2017
  4. @author: onlyyo
  5. 最后一步,对于要测试的验证码处理,然后进行预测,输出结果
  6. """
  7. from split_pic import *
  8. from write_img import *
  9. import os
  10. from cross_svc import cross_validation
  11. def cutPictures2(name):
  12. im = imgTransfer(name)
  13. pics = segment(im)
  14. for pic in pics:
  15. pic.save( u'J:/数据分析学习/python/机器学习之验证码识别/test_picture/%s.jpeg'%(int(time.time()* 1000000)), 'jpeg')
  16. def load_Predict(name):
  17. #
  18. cutPictures2(name) #切割图片
  19. dirs = u'J:/数据分析学习/python/机器学习之验证码识别/test_picture/'
  20. fs = os.listdir(dirs) # 获取图片名称
  21. clf = cross_validation()
  22. predictValue = []
  23. for fname in fs:
  24. fn = dirs + fname
  25. binpix = getBinaryPix(fn)
  26. predictValue.append(clf.predict(binpix))
  27. predictValue = [str(int(i)) for i in predictValue]
  28. 打印 “图片编号为:” “”. join(predictValue)
  29. name = u'J:/数据分析学习/ python /机器学习之验证码识别/8473.jpg'
  30. load_Predict(名称)






