# Decision tree (iris dataset)
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 28 15:35:48 2019
@author: Administrator
"""
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
PROJECT_ROOT_DIR = r'C:\Users\wuxian\Desktop\handson-ml-master\handson-ml-master\datasets\decision_trees'

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, fig_id + '.png')
    print('Saving figure', fig_id)
    if tight_layout:
        plt.tight_layout()  # compact the figure layout
    plt.savefig(path, format='png', dpi=300)
iris = load_iris()
type(iris)
iris.keys()
print('keys of iris_dataset:\n{}'.format(iris.keys()))
# The value of the DESCR key is a short description of the dataset.
# We print it here (you can read through the rest of it yourself):
print(iris['DESCR']+'\n')
iris.target_names
iris.feature_names
iris.values()
type(iris.data)
iris.data.shape[0]
iris.data
iris.data[:5]
pd.DataFrame(iris.data)
iris.target.shape[0]
{n:v for n,v in zip(iris.target_names,np.bincount(iris.target))}
pd.DataFrame(iris.target).values
iris.target_names
X = iris.data[:,2:]
y = iris.target
tree_clf = DecisionTreeClassifier(max_depth=2,random_state=42)
tree_clf.fit(X,y)
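# Aside (an addition; assumes scikit-learn >= 0.21, which added
# sklearn.tree.export_text): a plain-text view of the fitted tree,
# no graphviz needed.
from sklearn.tree import export_text
print(export_text(tree_clf, feature_names=iris.feature_names[2:]))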
# To render the exported tree as a PNG (requires graphviz), run in a shell:
# $ dot -Tpng iris_tree.dot -o iris_tree.png
#
#from sklearn.tree import export_graphviz
#export_graphviz(
#        tree_clf,
#        out_file=os.path.join(PROJECT_ROOT_DIR, 'iris_tree.dot'),
#        feature_names=iris.feature_names[2:],
#        class_names=iris.target_names,
#        rounded=True,
#        filled=True,
#        )
# =============================================================================
#
# =============================================================================
X_1 = iris.data[:,:]
y_1 = iris.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_1,y_1,test_size=0.2,random_state=42)
len(X_train)
type(X_train)
X_train.shape
# Label the data columns using the strings in iris.feature_names
# (bind to a new name so `iris` keeps pointing at the Bunch object)
iris_df = pd.DataFrame(X_train,columns = iris.feature_names)
# Create a scatter matrix from the DataFrame, colored by y_train
# (pd.scatter_matrix was removed in newer pandas; use pd.plotting.scatter_matrix)
grr = pd.plotting.scatter_matrix(iris_df,c=y_train,figsize=(15,15),marker='o',hist_kwds={'bins':20},s=60,alpha=.8)
# Build a first model: k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
# Fit the model
knn.fit(X_train,y_train)
# Make a prediction for a new sample
X_new = np.array([[5,2.9,1,0.2]])
prediction = knn.predict(X_new)
iris['target_names'][prediction]
# Evaluate the model on the test set
y_pred = knn.predict(X_test)
print('Test set predictions:\n{}'.format(y_pred))
print('Test set score:\n{:.2f}'.format(np.mean(y_pred == y_test)))
print('Test set score:\n{:.2f}'.format(knn.score(X_test,y_test)))
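# A quick side check (an addition, not in the original script): sweep a few
# values of n_neighbors to see how test accuracy moves; k=1 is not necessarily best.
for k in (1, 3, 5, 7):
    acc = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    print('k={}: test accuracy={:.2f}'.format(k, acc))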
# =============================================================================
#
# =============================================================================
from matplotlib.colors import ListedColormap
def plot_decision_boundary(clf, X, y, axes=[0, 7.5, 0, 3], iris=True, legend=False, plot_training=True):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0', '#9898ff', '#a0faa0'])
    plt.contourf(x1, x2, y_pred, cmap=custom_cmap)
    if not iris:
        custom_cmap2 = ListedColormap(['#7d7d58', '#4c4c7f', '#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    if plot_training:
        plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], 'yo', label='Iris-Setosa')
        plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], 'bs', label='Iris-Versicolor')
        plt.plot(X[:, 0][y == 2], X[:, 1][y == 2], 'g^', label='Iris-Virginica')
        plt.axis(axes)
    if iris:
        plt.xlabel('Petal length', fontsize=14)
        plt.ylabel('Petal width', fontsize=14)
    else:
        plt.xlabel(r'$x_1$', fontsize=18)
        plt.ylabel(r'$x_2$', fontsize=18, rotation=0)
    if legend:
        plt.legend(loc='lower right', fontsize=14)
plt.figure(figsize=(8,4))
plot_decision_boundary(tree_clf,X,y)
plt.plot([2.45, 2.45], [0, 3], "k-", linewidth=2)
plt.plot([2.45, 7.5], [1.75, 1.75], "k--", linewidth=2)
plt.plot([4.95, 4.95], [0, 1.75], "k:", linewidth=2)
plt.plot([4.85, 4.85], [1.75, 3], "k:", linewidth=2)
plt.text(1.4,1.0,'Depth=0',fontsize=15)
plt.text(3.2,1.8,'Depth=1',fontsize=13)
plt.text(4.05,0.5,'(Depth=2)',fontsize=11)
save_fig('decision_tree_decision_boundaries_plot')
plt.show()
# Predicting classes and class probabilities
tree_clf.predict_proba([[5,1.5]])
tree_clf.predict([[5,1.5]])
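# The probabilities above are simply the class fractions in the leaf that the
# point [5, 1.5] falls into. A small sketch (an addition, using only names
# defined above) mapping the prediction back to its class name:
print(iris.target_names[tree_clf.predict([[5, 1.5]])])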
# Sensitivity to training set details
X[(X[:,1]==X[:,1][y==1].max())&(y==1)]
not_widest_versicolor = (X[:,1]!=1.8)|(y==2)
X_tweaked = X[not_widest_versicolor]
y_tweaked = y[not_widest_versicolor]
tree_clf_tweaked = DecisionTreeClassifier(max_depth=2,random_state=40)
tree_clf_tweaked.fit(X_tweaked,y_tweaked)
plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf_tweaked, X_tweaked, y_tweaked, legend=False)
plt.plot([0, 7.5], [0.8, 0.8], "k-", linewidth=2)
plt.plot([0, 7.5], [1.75, 1.75], "k--", linewidth=2)
plt.text(1.0, 0.9, "Depth=0", fontsize=15)
plt.text(1.0, 1.80, "Depth=1", fontsize=13)
save_fig("decision_tree_instability_plot")
plt.show()
from sklearn.datasets import make_moons
Xm,ym = make_moons(n_samples=100,noise=0.25,random_state=53)
deep_tree_clf1 = DecisionTreeClassifier(random_state=42)
deep_tree_clf2 = DecisionTreeClassifier(min_samples_leaf=4,random_state=42)
deep_tree_clf1.fit(Xm,ym)
deep_tree_clf2.fit(Xm,ym)
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_decision_boundary(deep_tree_clf1,Xm,ym,axes=[-1.5,2.5,-1,1.5],iris=False)
plt.title('No restrictions',fontsize=16)
plt.subplot(122)
plot_decision_boundary(deep_tree_clf2,Xm,ym,axes=[-1.5,2.5,-1,1.5],iris=False)
plt.title('min_samples_leaf={}'.format(deep_tree_clf2.min_samples_leaf),fontsize=14)
save_fig('min_samples_leaf_plot')
plt.show()
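# Aside (an addition, not in the original script): min_samples_leaf is only one
# of several regularization knobs; max_depth, min_samples_split and
# max_leaf_nodes behave similarly. A sketch with arbitrary illustrative values:
reg_tree_clf = DecisionTreeClassifier(max_depth=6, min_samples_split=10,
                                      max_leaf_nodes=20, random_state=42)
reg_tree_clf.fit(Xm, ym)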
angle = np.pi/180*20
rotation_matrix = np.array([[np.cos(angle),-np.sin(angle)],[np.sin(angle),np.cos(angle)]])
Xr = X.dot(rotation_matrix)
tree_clf_r = DecisionTreeClassifier(random_state=42)
tree_clf_r.fit(Xr,y)
plt.figure(figsize=(8,3))
plot_decision_boundary(tree_clf_r,Xr,y,axes=[0.5,7.5,-1,1],iris=False)
plt.show()
np.random.seed(6)
Xs = np.random.rand(100,2)-0.5
ys = (Xs[:,0]>0).astype(np.float32)*2
angle = np.pi/4
rotation_matrix = np.array([[np.cos(angle),-np.sin(angle)],[np.sin(angle),np.cos(angle)]])
Xsr = Xs.dot(rotation_matrix)
tree_clf_s = DecisionTreeClassifier(random_state=42)
tree_clf_s.fit(Xs,ys)
tree_clf_sr = DecisionTreeClassifier(random_state=42)
tree_clf_sr.fit(Xsr,ys)
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_decision_boundary(tree_clf_s,Xs,ys,axes=[-0.7,0.7,-0.7,0.7],iris=False)
plt.subplot(122)
plot_decision_boundary(tree_clf_sr,Xsr,ys,axes=[-0.7,0.7,-0.7,0.7],iris=False)
save_fig('sensitivity_to_rotation_plot')
plt.show()
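# One common mitigation for this rotation sensitivity (sketched here as an
# aside, not part of the original script) is to rotate the data with PCA
# before fitting the tree:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
pca_tree_clf = Pipeline([
    ('pca', PCA(n_components=2)),
    ('tree', DecisionTreeClassifier(random_state=42)),
])
pca_tree_clf.fit(Xsr, ys)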
# Regression trees
# Quadratic training set + noise
np.random.seed(42)
m = 200
X = np.random.rand(m,1)
y = 4*(X-0.5)**2
y = y+np.random.randn(m,1)/10
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(max_depth=2,random_state=42)
tree_reg.fit(X,y)
tree_reg1 = DecisionTreeRegressor(random_state=42,max_depth=2)
tree_reg2 = DecisionTreeRegressor(random_state=42,max_depth=3)
tree_reg1.fit(X,y)
tree_reg2.fit(X,y)
def plot_regression_predictions(tree_reg, X, y, axes=[0, 1, -0.2, 1], ylabel='$y$'):
    x1 = np.linspace(axes[0], axes[1], 500).reshape(-1, 1)
    y_pred = tree_reg.predict(x1)
    plt.axis(axes)
    plt.xlabel('$x_1$', fontsize=18)
    if ylabel:
        plt.ylabel(ylabel, fontsize=18, rotation=0)
    plt.plot(X, y, 'b.')
    plt.plot(x1, y_pred, 'r.-', linewidth=2, label=r'$\hat{y}$')
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_regression_predictions(tree_reg1,X,y)
for split, style in ((0.1973, 'k-'), (0.0917, 'k--'), (0.7718, 'k--')):
    plt.plot([split, split], [-0.2, 1], style, linewidth=2)
plt.text(0.21,0.65,'Depth=0',fontsize=15)
plt.text(0.01,0.2,'Depth=1',fontsize=13)
plt.text(0.65,0.8,'Depth=1',fontsize=13)
plt.legend(loc='upper center',fontsize=18)
plt.title('max_depth=2',fontsize=14)
plt.subplot(122)
plot_regression_predictions(tree_reg2,X,y,ylabel=None)
for split, style in ((0.1973, 'k-'), (0.0917, 'k--'), (0.7718, 'k--')):
    plt.plot([split, split], [-0.2, 1], style, linewidth=2)
for split in (0.0458, 0.1298, 0.2873, 0.9040):
    plt.plot([split, split], [-0.2, 1], 'k:', linewidth=1)
plt.text(0.3,0.5,'Depth=2',fontsize=13)
plt.title('max_depth=3',fontsize=14)
save_fig('tree_regression_plot')
plt.show()
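# Note (an addition): the flat value predicted in each region is the mean of
# the training targets falling in that leaf, which minimizes the MSE.
print(tree_reg1.predict([[0.6]]))  # average y of the training samples in that region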
#export_graphviz(
#        tree_reg1,
#        out_file=os.path.join(PROJECT_ROOT_DIR, 'regression_tree.dot'),
#        feature_names=['x1'],
#        rounded=True,
#        filled=True,
#        )
tree_reg1 = DecisionTreeRegressor(random_state=42)
tree_reg2 = DecisionTreeRegressor(random_state=42,min_samples_leaf=10)
tree_reg1.fit(X,y)
tree_reg2.fit(X,y)
x1 = np.linspace(0,1,500).reshape(-1,1)
y_pred1 = tree_reg1.predict(x1)
y_pred2 = tree_reg2.predict(x1)
plt.figure(figsize=(11,4))
plt.subplot(121)
plt.plot(X,y,'b.')
plt.plot(x1,y_pred1,'r.-',linewidth=2,label=r'$\hat{y}$')
plt.axis([0,1,-0.2,1.1])
plt.xlabel('$x_1$',fontsize=18)
plt.ylabel('$y$',fontsize=18,rotation=0)
plt.legend(loc='upper center',fontsize=18)
plt.title('No restrictions',fontsize=14)
plt.subplot(122)
plt.plot(X,y,'b.')
plt.plot(x1,y_pred2,'r.-',linewidth=2,label=r'$\hat{y}$')
plt.axis([0,1,-0.2,1.1])
plt.xlabel('$x_1$',fontsize=18)
plt.title('min_samples_leaf={}'.format(tree_reg2.min_samples_leaf),fontsize=14)
save_fig('tree_regression_regularization_plot')
plt.show()
# Exercise 7: train and fine-tune a decision tree for the moons dataset.
from sklearn.datasets import make_moons
X,y = make_moons(n_samples=10000,noise=0.4,random_state=42)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
from sklearn.model_selection import GridSearchCV
params = {'max_leaf_nodes':list(range(2,100)),'min_samples_split':[2,3,4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),params,n_jobs=-1,verbose=1)
grid_search_cv.fit(X_train,y_train)
grid_search_cv.best_estimator_
from sklearn.metrics import accuracy_score
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test,y_pred)
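# For reference (an addition), the winning hyperparameters and the
# cross-validated score are available on the fitted search object:
print(grid_search_cv.best_params_)
print(grid_search_cv.best_score_)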
# Exercise 8: grow a forest.
from sklearn.model_selection import ShuffleSplit
n_trees = 1000
n_instances = 100
mini_sets = []
rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))
from sklearn.base import clone
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]
accuracy_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))
np.mean(accuracy_scores)
Y_pred = np.empty([n_trees,len(X_test)],dtype=np.uint8)
for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)
from scipy.stats import mode
y_pred_majority_votes, n_votes = mode(Y_pred,axis=0)
accuracy_score(y_test,y_pred_majority_votes.reshape([-1]))
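# For comparison (an aside, not part of the exercise): scikit-learn's built-in
# RandomForestClassifier bags decision trees in a similar spirit and should
# land in the same accuracy neighborhood:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
print(rnd_clf.score(X_test, y_test))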