Images are preprocessed with the Image module from the PIL library, the classifier is trained with sklearn's MLPClassifier, and the trained model is persisted with joblib from sklearn.externals. The network uses one hidden layer of 100 neurons, a learning rate of 0.001, and up to 2000 iterations, trained on 62,000 handwritten character images (62 classes: digits plus upper- and lower-case letters). The full code follows.
First the images are preprocessed: binarization, denoising, and segmentation (the segmentation here uses x-axis projection; drop-fall segmentation is another option for touching characters).
import requests
from os import listdir
from PIL import Image
import numpy as np

# Fetch captcha images from the academic-affairs site
def getpic(num):
    for i in range(num):
        url = 'http://jwxt.nwu.edu.cn/(4l4pgh5532jmfv45f03zkcjs)/CheckCode.aspx'
        r = requests.get(url)
        path = 'C:/file/机器学习/课程数据/手写数字/yanzhenma/验证码图片/' + str(i) + '.jpg'
        with open(path, 'wb') as fd:  # the with-block closes the file automatically
            fd.write(r.content)
    return 0
# Image binarization
def picchange(path):
    im = Image.open(path)
    Lim = im.convert('L')  # convert to grayscale
    name1 = 'C:/file/机器学习/课程数据/手写数字/yanzhenma/二值化图片/1.jpg'
    Lim.save(name1)
    threshold = 185  # gray levels above this become white, the rest black
    label = []
    for a in range(256):
        if a <= threshold:
            label.append(0)
        else:
            label.append(1)
    bim = Lim.point(label, '1')  # apply the 256-entry lookup table, mode '1' = bilevel
    name2 = 'C:/file/机器学习/课程数据/手写数字/yanzhenma/二值化图片/2.jpg'
    bim.save(name2)
    return 0

picchange("C:/file/机器学习/课程数据/手写数字/yanzhenma/验证码图片/2.jpg")
# Image denoising: keep a pixel black only if its 3x3 neighborhood
# contains at least two dark pixels; otherwise treat it as noise
def deletdot(path):
    im = Image.open(path)
    pix = im.load()
    c = Image.new("RGB", (72, 27))
    flag = 0
    for x in range(0, 72):
        for y in range(0, 27):
            if 1 <= x <= 70 and 1 <= y <= 25:  # skip the 1-pixel border
                for i in range(x - 1, x + 2):
                    for j in range(y - 1, y + 2):
                        if pix[i, j] <= 40:  # count dark neighbors
                            flag += 1
                if flag >= 2:
                    c.putpixel([x, y], (0, 0, 0))
                else:
                    c.putpixel([x, y], (255, 255, 255))
                flag = 0
            else:
                c.putpixel([x, y], (255, 255, 255))
    name = 'C:/file/机器学习/课程数据/手写数字/yanzhenma/降噪/1.jpg'
    c.save(name)
    return 0

deletdot("C:/file/机器学习/课程数据/手写数字/yanzhenma/二值化图片/1.jpg")
# Image segmentation by x-axis projection: count black pixels per column,
# then cut wherever a run of non-empty columns ends
def piccut(path):
    data = []
    blackdot = []
    flag = []
    im = Image.open(path)
    pix = im.load()
    a, b = im.size
    for x in range(a):
        for y in range(b):
            if max(pix[x, y]) <= 100:  # dark pixel
                data.append(1)
            else:
                data.append(0)
    data = np.array(data).reshape(a, b)
    for i in range(a):
        blackdot.append(sum(data[i]))  # black-pixel count of column i
    # leftmost non-empty column
    for i in range(a):
        if blackdot[i] != 0:
            num1 = i
            flag.append(num1)
            break
    # rightmost non-empty column
    for i in range(a - 1, -1, -1):
        if blackdot[i] != 0:
            num2 = i
            break
    # columns where a character run ends
    for i in range(num1, num2):
        if blackdot[i] == 0 and blackdot[i - 1] != 0:
            flag.append(i)
    flag.append(num2)
    if len(flag) == 5:  # exactly 4 characters found
        for i in range(4):
            img = im.crop((flag[i], 0, flag[i + 1] + 1, 27))
            img.save('C:/file/机器学习/课程数据/手写数字/yanzhenma/分割/' + str(i) + '.jpg')
        return data, blackdot, flag
    else:
        return data, blackdot

print(piccut("C:/file/机器学习/课程数据/手写数字/yanzhenma/降噪/1.jpg"))
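Chaining the steps above for one fresh captcha would look roughly like the sketch below; it assumes the intermediate file paths hard-coded in the functions above (each step writes its result to a fixed file that the next step reads back).

# Hypothetical end-to-end run of the preprocessing pipeline
getpic(1)  # downloads one captcha as 验证码图片/0.jpg
picchange('C:/file/机器学习/课程数据/手写数字/yanzhenma/验证码图片/0.jpg')  # writes 二值化图片/1.jpg and 2.jpg
deletdot('C:/file/机器学习/课程数据/手写数字/yanzhenma/二值化图片/1.jpg')  # writes 降噪/1.jpg
piccut('C:/file/机器学习/课程数据/手写数字/yanzhenma/降噪/1.jpg')  # writes 分割/0.jpg ... 3.jpg when 4 characters are found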
Training stage (time-consuming; a KNN classifier was considered later as an alternative)
import numpy as np
from os import listdir
from sklearn.neural_network import MLPClassifier
from PIL import Image
from sklearn.externals import joblib

# Convert each training image to a text file of 0/1 pixels;
# the class label comes from the last character of the folder name
def changepic(path):
    filelist = listdir(path)
    num = reclass(path[-1])  # folder name's last character encodes the class
    for i in range(1000):
        print(filelist[i])
        im = Image.open(path + '/' + filelist[i])
        im = im.rotate(270)
        im = im.transpose(Image.FLIP_LEFT_RIGHT)
        pix = im.load()
        a, b = im.size
        path2 = 'C:/file/机器学习/result/' + str(num) + '_' + str(i) + '.txt'
        with open(path2, 'a+') as fd:
            for x in range(a):
                for y in range(b):
                    if pix[x, y] >= 100:
                        fd.write('1')
                    else:
                        fd.write('0')
                fd.write('\n')
    return 0

def allpic(path):
    filelist = listdir(path)
    for i in range(len(filelist)):
        path2 = path + '/' + filelist[i]
        changepic(path2)
    return 0
# Build the model inputs
# Read one 28x28 text image back into a 784-dim vector
def readpic(path):
    data = np.zeros([784], int)
    fd = open(path)
    lines = fd.readlines()
    for i in range(28):
        for j in range(28):
            data[i * 28 + j] = int(lines[i][j])
    return data

# Load all samples and build one-hot labels over the 62 classes
def readdata(path):
    filelist = listdir(path)
    num = len(filelist)
    data = np.zeros([num, 784], int)
    datalabel = np.zeros([num, 62])
    for i in range(num):
        print(i)
        filepath = filelist[i]
        digit = int(filepath.split('_')[0])  # class index encoded in the file name
        datalabel[i][digit] = 1.0
        data[i] = readpic(path + '/' + filepath)
    return data, datalabel
# Build labels: map a character to its class index,
# '0'-'9' -> 0-9, 'A'-'Z' -> 10-35, 'a'-'z' -> 36-61
def reclass(ch):
    if ord(ch) <= 57:
        return ord(ch) - 48
    elif ord(ch) >= 97:
        return ord(ch) - 61
    else:
        return ord(ch) - 55

# Read labels back: class index to its character
def reclass2(num):
    if 0 <= num <= 9:
        return chr(num + 48)
    elif 36 <= num:
        return chr(num + 61)
    else:
        return chr(num + 55)
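A quick sanity check of the mapping; the expected values follow directly from the ASCII offsets above:

# Round-trip examples of the label mapping
print(reclass('0'), reclass('A'), reclass('a'))  # 0 10 36
print(reclass2(9), reclass2(35), reclass2(61))  # 9 Z z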
# Training (takes roughly 2 h)
# allpic('C:/file/机器学习/手写字母/Img2')
# allpic('C:/Users/邱星晨/Downloads/157670424Img2/Img2')
# traindata,trainlabel=readdata("C:/file/机器学习/result")
# clf=MLPClassifier(hidden_layer_sizes=(100,),\
#                   activation='logistic',solver='adam',\
#                   learning_rate_init=0.001,max_iter=2000)
# clf.fit(traindata,trainlabel)
# joblib.dump(clf, 'C:/file/机器学习/课程数据/手写数字/model/model2.m')
# Prediction with the saved model
clf = joblib.load("C:/file/机器学习/课程数据/手写数字/model/model2.m")
data,label=readdata('C:/file/机器学习/data')
result=clf.predict(data)
print(result)
for i in range(62):
    if result[0][i] == 1:
        print(reclass2(i))
        break
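The loop above decodes only the first sample. A sketch that decodes every predicted row with numpy's argmax (equivalent for clean one-hot outputs) could look like this:

# Decode every predicted one-hot row; assumes result is the 2-D array from clf.predict
for row in result:
    print(reclass2(int(np.argmax(row))))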
# The full training run overflowed this machine's memory; no workaround has been found yet
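One untried workaround for the memory overflow would be incremental training: MLPClassifier exposes partial_fit for its stochastic solvers, so samples can be fed in chunks instead of one giant fit call. A minimal sketch, assuming the readdata format above and integer labels derived from the one-hot rows (the chunk count is an arbitrary choice here):

# Incremental-training sketch to bound peak memory during fitting
clf2 = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic',
                     solver='adam', learning_rate_init=0.001)
traindata, trainlabel = readdata("C:/file/机器学习/result")
y = np.argmax(trainlabel, axis=1)  # one-hot rows -> integer class indices
chunks = 10  # arbitrary; smaller chunks lower the per-call memory cost
n = len(traindata)
for part in range(chunks):
    lo, hi = part * n // chunks, (part + 1) * n // chunks
    if part == 0:
        clf2.partial_fit(traindata[lo:hi], y[lo:hi], classes=np.arange(62))
    else:
        clf2.partial_fit(traindata[lo:hi], y[lo:hi])

Note this still loads the full array up front; if loading rather than fitting is what overflows, readdata itself would also need to stream from disk.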
Some notes from earlier study:
# Boston housing data (array interface)
# from sklearn.datasets import load_boston
# data,target=load_boston(return_X_y=True)
# print(data.shape)
# print(target.shape)
# Boston housing data (Bunch interface)
# from sklearn.datasets import load_boston
# boston=load_boston()
# print(boston.data.shape)
# Iris dataset
# from sklearn.datasets import load_iris
# iris=load_iris()
# print(iris.data.shape)
# print(iris.target.shape)
# print(list(iris.target_names))
# Handwritten digit recognition (sklearn digits dataset)
# from sklearn.datasets import load_digits
# import matplotlib.pyplot as plt
# digits=load_digits()
# print(digits.data.shape)
# print(digits.target.shape)
# print(digits.images.shape)
# plt.matshow(digits.images[1000])
# plt.show()
# KMeans algorithm (clustering cities by spending)
# import numpy as np
# from sklearn.cluster import KMeans
# def loaddata(path):
#     fd=open(path,'r+')
#     lines=fd.readlines()
#     cdata=[]
#     cname=[]
#     for line in lines:
#         items=line.strip().split(",")
#         cname.append(items[0])
#         cdata.append([float(items[i]) for i in range(1,len(items))])  # one feature row per city
#     return cdata,cname
# path='C:/file/机器学习/课程数据/聚类/city.txt'
# print(loaddata(path))
# data,cityname=loaddata(path)
# km=KMeans(n_clusters=7)
# print(km)
# label=km.fit_predict(data)
# print(label)
# expenses=np.sum(km.cluster_centers_,axis=1)
# print(expenses)
# city=[[],[],[],[],[],[],[]]
# for i in range(len(cityname)):
#     print(i)
#     city[label[i]].append(cityname[i])
# for i in range(len(city)):
#     print(city[i],expenses[i])
# DBSCAN algorithm (clustering online-time records)
# import numpy as np
# import sklearn.cluster as sky
# from sklearn import metrics
# from sklearn.cluster import DBSCAN
# import matplotlib.pyplot as plt
# def onlinedata(path):
#     mac2=dict()
#     onlinetimes=[]
#     fd=open(path,'r+',encoding='utf-8')
#     lines=fd.readlines()
#     for line in lines:
#         mac=line.split(',')[2]
#         onlinetime=int(line.split(',')[6])
#         starttime=int(line.split(',')[4].split(' ')[1].split(':')[0])
#         if mac not in mac2:
#             mac2[mac]=len(onlinetimes)
#             onlinetimes.append((starttime,onlinetime))
#         else:
#             onlinetimes[mac2[mac]]=(starttime,onlinetime)  # keep a flat tuple so the reshape below works
#     return mac2,onlinetimes
# mac2,onlinetimes=onlinedata("C:/file/机器学习/课程数据/聚类/TestData.txt")
# print(mac2,onlinetimes)
# realx=np.array(onlinetimes).reshape((-1,2))
# x=realx[:,0:1]
# print(realx,x)
# db=sky.DBSCAN(eps=0.01,min_samples=20).fit(x)
# print(db)
# labels=db.labels_
# print(labels)
# ratio=len(labels[labels[:]==-1])/len(labels)
# print('noise ratio:',format(ratio,'.2%'))
# print(len(set(labels)))
# n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# print(n_clusters_)
# print('Estimated number of clusters: {0}'.format(n_clusters_))
# num=metrics.silhouette_score(x, labels)
# print("Silhouette Coefficient: {0:.3f}".format(num))
# for i in range(n_clusters_):
#     print('Cluster',i,':')
#     print(list(x[labels == i].flatten()))
# plt.hist(x,24)
# plt.show()
# PCA algorithm (2-D projection of iris)
# import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA
# from sklearn.datasets import load_iris
# data=load_iris()
# print(data)
# y=data.target
# x=data.data
# print(x,y)
# pca=PCA(n_components=2)  # number of principal components
# reduced=pca.fit_transform(x)  # data after dimensionality reduction
# print(reduced)
# redx,redy=[],[]
# bluex,bluey=[],[]
# greenx,greeny=[],[]
# for i in range(len(reduced)):
#     if y[i]==0:
#         redx.append(reduced[i][0])
#         redy.append(reduced[i][1])
#     elif y[i]==1:
#         bluex.append(reduced[i][0])
#         bluey.append(reduced[i][1])
#     else:
#         greenx.append(reduced[i][0])
#         greeny.append(reduced[i][1])
# plt.scatter(redx,redy,c='r',marker='x')
# plt.scatter(bluex,bluey,c='b',marker='D')
# plt.scatter(greenx,greeny,c='g',marker='.')
# plt.show()
# NMF algorithm (Olivetti faces decomposition)
# import matplotlib.pyplot as plt
# from sklearn import decomposition  # provides both PCA and NMF
# from sklearn.datasets import fetch_olivetti_faces
# from numpy.random import RandomState
# import numpy as np
# n_row,n_col=2,3
# n_components=n_row*n_col
# image_shape=(64,64)
# dataset=fetch_olivetti_faces(shuffle=True,random_state=RandomState(0))
# faces=dataset.data
# print(faces)
# def plot1(title,images,n_col=n_col,n_row=n_row):
#     plt.figure(figsize=(2*n_col,2.26*n_row))
#     plt.suptitle(title,size=16)
#     for i,comp in enumerate(images):
#         plt.subplot(n_row,n_col,i+1)
#         vmax=max(comp.max(),-comp.min())
#         plt.imshow(comp.reshape(image_shape),cmap=plt.cm.gray,
#                    interpolation='nearest',vmin=-vmax,vmax=vmax)
#         plt.xticks(())
#         plt.yticks(())
#     plt.subplots_adjust(left=0.2,bottom=0.2,right=0.8,top=0.8,hspace=0.2,wspace=0.3)
# plot1("First centered Olivetti faces", faces[:n_components])
# estimators = [('Eigenfaces - PCA using randomized SVD',
#                decomposition.PCA(n_components=6,whiten=True)),
#               ('Non-negative components - NMF',
#                decomposition.NMF(n_components=6,init='nndsvda',tol=5e-3))]
# print(type(estimators))
# for name,estimator in estimators:
#     print(n_components, name)
#     print(faces.shape)
#     estimator.fit(faces)
#     components_ = estimator.components_
#     plot1(name,components_[:n_components])
# plt.show()
# Image segmentation with KMeans on RGB values
# import numpy as np
# import PIL.Image as image
# from sklearn.cluster import KMeans
# def loaddata(path):
#     fd=open(path,"rb")
#     data=[]
#     img=image.open(fd)
#     m,n=img.size
#     for i in range(m):
#         for j in range(n):
#             x,y,z=img.getpixel((i,j))
#             data.append([x/256.0,y/256.0,z/256.0])
#     fd.close()
#     return np.mat(data),m,n
# imgdata,x,y=loaddata("C:/file/机器学习/课程数据/基于聚类的整图分割/2.jpg")
# km=KMeans(n_clusters=3)
# label=km.fit_predict(imgdata)
# label=label.reshape([x,y])
# newpic=image.new('L',(x,y))
# x1=[]
# for i in range(x):
#     for j in range(y):
#         num=256/(label[i][j]+1)  # map cluster index to a gray level
#         newpic.putpixel((i,j),int(num))
#         if num not in x1:
#             x1.append(num)
# print(x1)
# newpic.save('C:/file/机器学习/课程数据/基于聚类的整图分割/person(1).jpg','JPEG')
# Supervised learning: KNN classifier
# from sklearn.neighbors import KNeighborsClassifier
# x=[[0],[1],[2],[3]]
# y=[0,0,1,1]
# neigh=KNeighborsClassifier(n_neighbors=3)
# neigh.fit(x,y)
# print(neigh.predict([[1.5]]))
# Decision tree classifier
# from sklearn.datasets import load_iris
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import cross_val_score
# clf=DecisionTreeClassifier()
# iris=load_iris()
# print(cross_val_score(clf,iris.data,iris.target,cv=10))
# Naive Bayes: a generative method suited to small-scale tasks
# import numpy as np
# from sklearn.naive_bayes import GaussianNB
# x=np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
# y=np.array([1,1,1,2,2,2,])
# clf=GaussianNB(priors=None)
# clf.fit(x,y)
# print(clf.predict([[-0.8,-1]]))
# Stock-index prediction (SSE) with an SVM
# import numpy as np
# import pandas as pd
# from sklearn import svm
# from sklearn.model_selection import train_test_split
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=196)
# # read the data
# data=pd.read_csv("C:/file/机器学习/课程数据/分类/stock/000777.csv",encoding='gbk',parse_dates=[0],index_col=0)
# # sort by date
# data.sort_index(ascending=True,inplace=True)
# # build features: 150 days of (close, high, low, open, volume) plus the next day's open
# dayfeature=150
# featurenum=5*dayfeature
# x=np.zeros((data.shape[0]-dayfeature,featurenum+1))
# y=np.zeros((data.shape[0]-dayfeature))
# for i in range(0,data.shape[0]-dayfeature):
#     x[i,0:featurenum]=np.array(data[i:i+dayfeature][[u'收盘价',u'最高价',u'最低价',u'开盘价',u'成交量']]).reshape((1,featurenum))
#     x[i,featurenum]=data.iloc[i+dayfeature][u'开盘价']
# # label: 1 if the day closes above its open, else 0
# for i in range(0,data.shape[0]-dayfeature):
#     if data.iloc[i+dayfeature][u'收盘价']>=data.iloc[i+dayfeature][u'开盘价']:
#         y[i]=1
#     else:
#         y[i]=0
# clf=svm.SVC(kernel='rbf')
# result=[]
# for i in range(5):
#     a,b,c,d=train_test_split(x,y,test_size=0.2)
#     clf.fit(a,c)
#     result.append(np.mean(d==clf.predict(b)))
# print(result)
# Linear regression (house price vs. area)
# from sklearn import linear_model
# import matplotlib.pyplot as plt
# import numpy as np
# x=[]
# y=[]
# fd=open("C:/file/机器学习/课程数据/回归/prices.txt",'r')
# lines=fd.readlines()
# for line in lines:
#     items=line.strip().split(',')
#     x.append(int(items[0]))
#     y.append(int(items[1]))
# length=len(x)
# x=np.array(x).reshape([length,1])
# y=np.array(y)
# print(x,y)
# xmin=min(x)
# xmax=max(x)
# X=np.arange(xmin,xmax).reshape([-1,1])
# # LinearRegression: fit_intercept adds an intercept term; normalize=True would standardize features first
# linear=linear_model.LinearRegression()
# linear.fit(x,y)
# print('coefficients:',linear.coef_)
# print('intercept:',linear.intercept_)
# plt.scatter(x,y,color='red')
# plt.plot(X,linear.predict(X),color='blue')
# plt.xlabel('Area')
# plt.ylabel('Price')
# plt.show()
# Ridge regression (alpha is the α in the loss function; fit_intercept adds an intercept; solver selects the optimizer), for non-linear data
# import pandas as pd
# import numpy as np
# # ridge regression
# from sklearn.linear_model import Ridge
# # train/test split
# from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt
# # polynomial feature construction
# from sklearn.preprocessing import PolynomialFeatures
# data=np.genfromtxt("C:/file/机器学习/课程数据/回归/岭回归.txt",delimiter = ',',dtype = None)
# # attributes: columns 0-3
# x=data[:,:4]
# # target: traffic volume
# y=data[:,5]
# # polynomial features up to degree 6
# poly=PolynomialFeatures(6)
# x=poly.fit_transform(x)
# a,b,c,d=train_test_split(x,y,test_size=0.3,random_state=0)
# # create the ridge regressor
# clf=Ridge(alpha=1.0,fit_intercept=True)
# # train
# clf.fit(a,c)
# # goodness of fit (R^2), at most 1
# num=clf.score(b,d)
# print(num)
# num1=clf.predict(x)
# time=np.arange(200,300)
# plt.plot(time,y[200:300],'b',label='real')
# plt.plot(time,num1[200:300],'r',label='predict')
# # legend position
# plt.legend(loc='upper left')
# plt.show()