《机器学习技法》作业需要用到决策树。一直以为要用matlab自己实现一个,但今早起床的时候突然想起,不是有个叫sklearn的东西,是否可以直接拿来用呢?
下午做了一些实验:使用作业里提供的数据,并参考sklearn官网提供的示例代码:
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt

# Load the training set straight into a numpy.ndarray. The last column is used
# as the label and every preceding column as a feature.
data = np.loadtxt('hw3_train.dat')
train_x = data[:, :-1]
train_y = data[:, -1]

clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_x, train_y)

# The min/max of the first two features (padded by 0.1) bound the plotting
# grid; the same extremes could also be used for normalization.
x_min, x_max = train_x[:, 0].min() - 0.1, train_x[:, 0].max() + 0.1
y_min, y_max = train_x[:, 1].min() - 0.1, train_x[:, 1].max() + 0.1
plot_step = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

# ravel() flattens each grid; np.c_ pairs them into (x, y) rows so the
# classifier can predict a label for every grid point.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the training points, one colour per label (-1 -> green, 1 -> red).
# A boolean mask selects the rows of each class directly.
for point_label, point_color in zip((-1, 1), 'gr'):
    mask = train_y == point_label
    plt.scatter(train_x[mask, 0], train_x[mask, 1], c=point_color)

plt.axis("tight")
plt.suptitle("Decision surface of a decision tree")
plt.show()

# Training is automatic; you only need to understand the parameter settings.
# Visualization result:
还可以把树用.dot格式存储起来(.txt存起来内容也是一样的),图形的文字型表示,如下图:
还可以用pydot图形化(关于pydot的安装见上一篇博客),并存储为pdf格式:
import pydot

# `tree` and `clf` are the sklearn module and the fitted classifier from the
# training snippet earlier in this post.
# With out_file=None, export_graphviz returns the DOT source as a string, so
# no StringIO buffer is needed (sklearn.externals.six, which used to provide
# one, is no longer shipped with current scikit-learn releases).
dot_data = tree.export_graphviz(clf, out_file=None)

# Depending on the pydot version, graph_from_dot_data returns either a single
# graph or a list of graphs; normalise to one graph object.
graph = pydot.graph_from_dot_data(dot_data)
if isinstance(graph, (list, tuple)):
    graph = graph[0]

graph.write_pdf('ex.pdf')
此外,还可以保存为图片:
# Render the pydot graph to JPEG bytes and write them to disk. The with-block
# guarantees the file handle is closed even if rendering or writing fails.
with open('a.jpg', 'wb') as jpg_file:
    jpg_file.write(graph.create_jpg())
画子图的方法与matlab类似,例如plt.subplot(1,2,1)。
作业代码:
from __future__ import division

import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt


def _plot_decision_surface(model, features, labels, title,
                           padding=0.1, plot_step=0.02):
    """Draw the model's decision regions and the samples on the current axes.

    The grid spans the min/max of the first two feature columns, widened by
    ``padding`` and sampled every ``plot_step``. Samples labelled -1 are drawn
    in green and samples labelled 1 in red.
    """
    x_min = features[:, 0].min() - padding
    x_max = features[:, 0].max() + padding
    y_min = features[:, 1].min() - padding
    y_max = features[:, 1].max() + padding
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    # ravel() flattens each grid; np.c_ pairs them into (x, y) rows so the
    # model can predict a label for every grid point.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    # A boolean mask selects the rows belonging to each class.
    for point_label, point_color in zip((-1, 1), 'gr'):
        mask = labels == point_label
        plt.scatter(features[mask, 0], features[mask, 1], c=point_color)
    plt.title(title)


def _error_rate(model, features, labels):
    """Return the fraction of samples whose prediction differs from the label."""
    return np.mean(model.predict(features) != labels)


# Load the data straight into numpy.ndarray form. The last column is used as
# the label and every preceding column as a feature.
data = np.loadtxt('hw3_train.dat')
train_x = data[:, :-1]
train_y = data[:, -1]

test_data = np.loadtxt('hw3_test.dat')
test_x = test_data[:, :-1]
test_y = test_data[:, -1]

# Train the tree.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_x, train_y)

# ---------- E_in / E_out ----------
# Computed and reported before plt.show(), which blocks until the figure
# window is closed.
err_in = _error_rate(clf, train_x, train_y)
err_out = _error_rate(clf, test_x, test_y)
print('E_in = %.4f, E_out = %.4f' % (err_in, err_out))

# ---------- Left panel: decision regions + training points ----------
plt.subplot(1, 2, 1)
_plot_decision_surface(clf, train_x, train_y, 'train')

# ---------- Right panel: decision regions + test points ----------
plt.subplot(1, 2, 2)
_plot_decision_surface(clf, test_x, test_y, "test")

plt.suptitle('Decision Tree')
plt.show()