sklearn decision-tree实验

《机器学习技法》作业需要用到决策树。一直以为要用matlab自己实现一个,但今早起床的时候突然想起,不是有个叫sklearn的东西,是否可以直接拿来用呢?

下午用作业里提供的数据做了一些实验,代码参考了sklearn官网提供的示例:

import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt

# Train a decision tree on the homework data and draw its decision surface
# together with the training points.

# Load the whole file as a numpy.ndarray; the last column is used as the
# label and all preceding columns as features.
data = np.loadtxt('hw3_train.dat')
train_x = data[:, :-1]
train_y = data[:, -1]

clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_x, train_y)

# Min/max of the two plotted features, padded by 0.1. These could also be
# used for normalization; here they define the meshgrid plotting range.
x_min, x_max = train_x[:, 0].min() - 0.1, train_x[:, 0].max() + 0.1
y_min, y_max = train_x[:, 1].min() - 0.1, train_x[:, 1].max() + 0.1
plot_step = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

# ravel() flattens each grid; np.c_ pairs them into (x, y) rows so every grid
# node is classified, then the predictions are folded back to the grid shape.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

# Draw the training points, one color per class label. A boolean mask yields
# plain 1-D coordinate arrays (indexing with the tuple returned by np.where
# produced arrays with an extra leading axis).
for class_label, class_color in zip([-1, 1], 'gr'):
    mask = train_y == class_label
    plt.scatter(train_x[mask, 0], train_x[mask, 1], c=class_color)

plt.axis("tight")
# The original title ended with a dangling "using"; keep the complete phrase.
plt.suptitle("Decision surface of a decision tree")
plt.show()
自动训练,只需要了解参数设置。可视化结果:

sklearn decision-tree实验_第1张图片

还可以把树用.dot格式存储起来(存成.txt内容也是一样的),它是图形的文本表示,如下图:

sklearn decision-tree实验_第2张图片

还可以用pydot图形化(关于pydot的安装见上一篇博客),并存储为pdf格式:

# sklearn.externals.six was removed in scikit-learn 0.23; fall back to the
# standard-library StringIO when the bundled copy is unavailable.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
import pydot

# Export the fitted tree as Graphviz DOT source into an in-memory buffer.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)

# pydot >= 1.2 returns a list of graphs, older releases a single graph;
# normalize to one graph object so write_pdf works with either.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]
graph.write_pdf('ex.pdf')
sklearn decision-tree实验_第3张图片

此外,还可以保存为图片:

# Render the graph to JPEG bytes and save them; the with-block guarantees the
# file handle is flushed and closed even if rendering raises.
with open('a.jpg', 'wb') as image_file:
    image_file.write(graph.create_jpg())

加入测试数据,对比观察:

sklearn decision-tree实验_第4张图片

这里用plt.subplot(1,2,1)画子图,用法类似于matlab的subplot。


作业代码:

from __future__ import division 
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt
 
def _plot_decision_panel(model, points, labels, title, plot_step=0.02):
    """Draw the model's decision regions and the given points on the current axes.

    model: fitted classifier exposing predict() on two-column input.
    points: 2-D array whose first two columns are the plotted features.
    labels: 1-D array of class labels (-1 or 1), aligned with points.
    title: title of the current subplot.
    plot_step: spacing of the grid on which the model is evaluated.
    """
    # Min/max of each feature, padded by 0.1, define the meshgrid range.
    x_min, x_max = points[:, 0].min() - 0.1, points[:, 0].max() + 0.1
    y_min, y_max = points[:, 1].min() - 0.1, points[:, 1].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    # ravel() flattens the grids; np.c_ pairs them into (x, y) rows.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    # One scatter call per class; a boolean mask gives 1-D coordinate arrays.
    for class_label, class_color in zip([-1, 1], 'gr'):
        mask = labels == class_label
        plt.scatter(points[mask, 0], points[mask, 1], c=class_color)
    plt.title(title)


# Load the data directly as numpy.ndarray; the last column is the label.
data = np.loadtxt('hw3_train.dat')
train_x = data[:, :-1]
train_y = data[:, -1]

test_data = np.loadtxt('hw3_test.dat')
test_x = test_data[:, :-1]
test_y = test_data[:, -1]

# Train the tree.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_x, train_y)

# ---------- E_in / E_out ----------
# Fraction of misclassified samples on the training and test sets. Computed
# and reported before plt.show(), which blocks until the window is closed.
err_in = np.mean(clf.predict(train_x) != train_y)
err_out = np.mean(clf.predict(test_x) != test_y)
print('E_in = %f, E_out = %f' % (err_in, err_out))

# ---------- Left panel: training data; right panel: test data ----------
plt.subplot(1, 2, 1)
_plot_decision_panel(clf, train_x, train_y, 'train')

plt.subplot(1, 2, 2)
_plot_decision_panel(clf, test_x, test_y, "test")

plt.suptitle('Decision Tree')
plt.show()

ok.









你可能感兴趣的:(sklearn decision-tree实验)