【机器学习实战】6_decision_trees代码《Hands-On Machine Learning with Scikit-Learn&TensorFlow》

决策树(iris_datasets)

# -*- coding: utf-8 -*-
"""
Created on Mon Jan 28 15:35:48 2019

@author: Administrator
"""

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt 
import os
import pandas as pd
# Directory that save_fig() joins figure file names onto (absolute, machine-specific Windows path).
PROJECT_ROOT_DIR =r'C:\Users\wuxian\Desktop\handson-ml-master\handson-ml-master\datasets\decision_trees'

def save_fig(fig_id,tight_layout=True):
    """Save the current matplotlib figure as ``<fig_id>.png`` under PROJECT_ROOT_DIR.

    Parameters
    ----------
    fig_id : str
        File name of the image, without the ``.png`` extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before the figure is written.
    """
    # savefig() does not create missing directories, so make sure the target exists.
    os.makedirs(PROJECT_ROOT_DIR,exist_ok=True)
    path = os.path.join(PROJECT_ROOT_DIR,fig_id + ".png" )
    print('Saving figure',fig_id)
    if tight_layout:
        plt.tight_layout()  # compact layout for the saved picture
    plt.savefig(path,format = 'png',dpi = 300)

# Load the iris dataset and look at its contents.
# NOTE: the bare expressions in this section discard their result when the file is run as a
# script; they only show something when executed interactively.
iris = load_iris()
type(iris)
iris.keys()
print('keys of iris_dataset:\n{}'.format(iris.keys()))
# The 'DESCR' entry holds the textual description of the dataset;
# print it followed by an extra newline.
print(iris['DESCR']+'\n')
iris.target_names
iris.feature_names
iris.values()
type(iris.data)
iris.data.shape[0]
iris.data
iris.data[:5]
pd.DataFrame(iris.data)
iris.target.shape[0]
# Map each target name to the number of samples carrying that label.
{n:v for n,v in zip(iris.target_names,np.bincount(iris.target))}

pd.DataFrame(iris.target).values
iris.target_names

# Keep only the feature columns from index 2 onward (iris.feature_names[2:]) as model input.
X = iris.data[:,2:]
y = iris.target

# Depth-limited decision tree fitted on those columns; random_state fixed for repeatability.
tree_clf = DecisionTreeClassifier(max_depth=2,random_state=42)
tree_clf.fit(X,y)

#$ dot -Tpng iris_tree.dot -o iris_tree.png
#
#from sklearn.tree import export_graphviz
#export_graphviz(
#        tree_clf,out_file=image_path('iris_tree.dot'),
#        feature_names=iris.feature_names[2:],
#        class_names=iris.target_names,
#        rounded=True,
#        filled=True        
#        )
# =============================================================================
# 
# =============================================================================
# k-nearest-neighbours example on every feature column of the iris data.
X_1 = iris.data[:,:]
y_1 = iris.target
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_1,y_1,test_size=0.2,random_state=42)
len(X_train)
type(X_train)
X_train.shape
# Label the training columns with the strings in iris.feature_names.
# The frame gets its own name: rebinding `iris` here would make the
# iris['target_names'] lookup further down fail with a KeyError.
iris_dataframe = pd.DataFrame(X_train,columns = iris.feature_names)
# Scatter-plot matrix of the training data, coloured by y_train.
# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix alias no longer exists.
grr = pd.plotting.scatter_matrix(iris_dataframe,c=y_train,figsize=(15,15),marker='o',hist_kwds={'bins':20},s=60,alpha=.8)

# Build the first model: k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
# Fit the model
knn.fit(X_train,y_train)
# Make a prediction for one new sample
X_new = np.array([[5,2.9,1,0.2]])
prediction = knn.predict(X_new)
iris['target_names'][prediction]
# Evaluate the model: the two scores below are computed from the same test split
y_pred = knn.predict(X_test)
print('Test set predictions:\n{}'.format(y_pred))
print('Test set score:\n{:.2f}'.format(np.mean(y_pred == y_test)))
print('Test set score:\n{:.2f}'.format(knn.score(X_test,y_test)))

# =============================================================================
# 
# =============================================================================

from matplotlib.colors import ListedColormap
def plot_decision_boundary(clf,X,y,axes=[0,7.5,0,3],iris=True,legend=False,plot_training=True):
    """Shade the regions predicted by ``clf`` on a 100x100 grid and optionally overlay the samples.

    Parameters
    ----------
    clf : object with a ``predict`` method accepting an (n, 2) array
    X, y : samples (columns 0 and 1 are plotted) and their labels
    axes : [x_min, x_max, y_min, y_max] range of the prediction grid
    iris : when true the axes are labelled 'Petal length' / 'Petal width';
        otherwise contour lines are added and the axes are labelled $x_1$ / $x_2$
    legend : when true a legend is placed at the lower right
    plot_training : when true the samples labelled 0, 1 and 2 are drawn and the
        view is limited to ``axes``
    """
    grid_x,grid_y = np.meshgrid(
        np.linspace(axes[0],axes[1],100),
        np.linspace(axes[2],axes[3],100),
    )
    grid_points = np.c_[grid_x.ravel(),grid_y.ravel()]
    grid_pred = clf.predict(grid_points).reshape(grid_x.shape)

    fill_colours = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(grid_x,grid_y,grid_pred,cmap=fill_colours)
    if not iris:
        line_colours = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(grid_x,grid_y,grid_pred,cmap=line_colours,alpha=0.8)

    if plot_training:
        # One marker style and legend entry per label value.
        class_styles = (
            (0,'yo','Iris-Setosa'),
            (1,'bs','Iris-Versicolor'),
            (2,'g^','Iris-Virginica'),
        )
        for class_value,marker,class_name in class_styles:
            members = y==class_value
            plt.plot(X[:,0][members],X[:,1][members],marker,label=class_name)
        plt.axis(axes)

    if iris:
        x_caption,y_caption,label_size,y_extra = 'Petal length','Petal width',14,{}
    else:
        x_caption,y_caption,label_size,y_extra = r'$x_1$',r'$x_2$',18,{'rotation':0}
    plt.xlabel(x_caption,fontsize=label_size)
    plt.ylabel(y_caption,fontsize=label_size,**y_extra)

    if legend:
        plt.legend(loc='lower right',fontsize=14)
        
# Decision regions of tree_clf with hand-placed boundary segments and depth captions.
plt.figure(figsize=(8,4))
plot_decision_boundary(tree_clf,X,y)
# (x coordinates, y coordinates, line style) of each segment, drawn in this order.
boundary_segments = (
    ([2.45, 2.45], [0, 3], "k-"),
    ([2.45, 7.5], [1.75, 1.75], "k--"),
    ([4.95, 4.95], [0, 1.75], "k:"),
    ([4.85, 4.85], [1.75, 3], "k:"),
)
for seg_x,seg_y,seg_style in boundary_segments:
    plt.plot(seg_x,seg_y,seg_style,linewidth=2)
# (x, y, caption, font size) of each annotation.
depth_captions = (
    (1.4,1.0,'Depth=0',15),
    (3.2,1.8,'Depth=1',13),
    (4.05,0.5,'(Depth=2)',11),
)
for cap_x,cap_y,cap_text,cap_size in depth_captions:
    plt.text(cap_x,cap_y,cap_text,fontsize=cap_size)

save_fig('decision_tree_decision_boundaries_plot')
plt.show()
       
##Predicting Classes and class probabilities
# Both calls query tree_clf for the single sample [5, 1.5]; the results are discarded
# unless the lines are run interactively.
tree_clf.predict_proba([[5,1.5]])    

tree_clf.predict([[5,1.5]])    

#sensitivity to training set details
# Rows labelled 1 whose second column equals the largest second-column value among label-1 rows.
X[(X[:,1]==X[:,1][y==1].max())&(y==1)]  

# Keep a row when its second column differs from 1.8 or its label is 2,
# i.e. drop the rows that have second column 1.8 and a label other than 2.
not_widest_versicolor = (X[:,1]!=1.8)|(y==2)   
X_tweaked = X[not_widest_versicolor] 
y_tweaked = y[not_widest_versicolor]  

# Same depth limit as tree_clf, fitted on the reduced data with a different random_state.
tree_clf_tweaked = DecisionTreeClassifier(max_depth=2,random_state=40)   
tree_clf_tweaked.fit(X_tweaked,y_tweaked)      

# Decision regions of the tree fitted on the tweaked data, with two horizontal guide lines.
plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf_tweaked, X_tweaked, y_tweaked, legend=False)
for level, level_style in ((0.8, "k-"), (1.75, "k--")):
    plt.plot([0, 7.5], [level, level], level_style, linewidth=2)
for note_y, note_text, note_size in ((0.9, "Depth=0", 15), (1.80, "Depth=1", 13)):
    plt.text(1.0, note_y, note_text, fontsize=note_size)

save_fig("decision_tree_instability_plot")
plt.show()

from sklearn.datasets import make_moons
Xm,ym = make_moons(n_samples=100,noise=0.25,random_state=53)

# Two trees on the same moons data: default settings vs. min_samples_leaf=4.
deep_tree_clf1 = DecisionTreeClassifier(random_state=42)
deep_tree_clf2 = DecisionTreeClassifier(min_samples_leaf=4,random_state=42)
for moons_clf in (deep_tree_clf1,deep_tree_clf2):
    moons_clf.fit(Xm,ym)

plt.figure(figsize=(11,4))
# (subplot position, classifier, title, title font size) for each panel.
moons_panels = (
    (121,deep_tree_clf1,'No restrictions',16),
    (122,deep_tree_clf2,'min_samples_leaf={}'.format(deep_tree_clf2.min_samples_leaf),14),
)
for panel_pos,panel_clf,panel_title,panel_title_size in moons_panels:
    plt.subplot(panel_pos)
    plot_decision_boundary(panel_clf,Xm,ym,axes=[-1.5,2.5,-1,1.5],iris=False)
    plt.title(panel_title,fontsize=panel_title_size)

save_fig('min_sample_leaf_plot')
plt.show()

# Multiply the samples in X by a 20-degree rotation matrix and fit an unrestricted tree on the result.
angle = np.pi/180*20
cos_a,sin_a = np.cos(angle),np.sin(angle)
rotation_matrix = np.array([
    [cos_a,-sin_a],
    [sin_a,cos_a],
])
Xr = X.dot(rotation_matrix)

tree_clf_r = DecisionTreeClassifier(random_state=42)
tree_clf_r.fit(Xr,y)

plt.figure(figsize=(8,3))
plot_decision_boundary(tree_clf_r,Xr,y,axes=[0.5,7.5,-1,1],iris=False)
plt.show()

# Sensitivity to rotation: the same uniformly sampled points, once as generated
# and once multiplied by a 45-degree rotation matrix.
np.random.seed(6)
Xs = np.random.rand(100,2)-0.5
# Label is 2 where the first coordinate is positive, 0 otherwise.
# plot_decision_boundary only draws training points labelled 0, 1 or 2, so the
# scale factor must be 2; with 32 every positive sample was left off both panels.
ys = (Xs[:,0]>0).astype(np.float32)*2

angle = np.pi/4
rotation_matrix = np.array([[np.cos(angle),-np.sin(angle)],[np.sin(angle),np.cos(angle)]])
Xsr = Xs.dot(rotation_matrix)

# One unrestricted tree per version of the data.
tree_clf_s = DecisionTreeClassifier(random_state=42)
tree_clf_s.fit(Xs,ys)
tree_clf_sr = DecisionTreeClassifier(random_state=42)
tree_clf_sr.fit(Xsr,ys)

# Left panel: original points; right panel: rotated points.
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_decision_boundary(tree_clf_s,Xs,ys,axes=[-0.7,0.7,-0.7,0.7],iris=False)
plt.subplot(122)
plot_decision_boundary(tree_clf_sr,Xsr,ys,axes=[-0.7,0.7,-0.7,0.7],iris=False)

save_fig('sensitivity_to_rotation_plot')
plt.show()

# Regression trees
# Quadratic training set plus Gaussian noise (X is drawn before the noise).
np.random.seed(42)
m = 200
X = np.random.rand(m,1)
noise = np.random.randn(m,1)/10
y = 4*(X-0.5)**2 + noise

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(max_depth=2,random_state=42)
tree_reg.fit(X,y)

# Two regressors that differ only in max_depth.
tree_reg1 = DecisionTreeRegressor(random_state=42,max_depth=2)
tree_reg2 = DecisionTreeRegressor(random_state=42,max_depth=3)
for depth_limited_reg in (tree_reg1,tree_reg2):
    depth_limited_reg.fit(X,y)

def plot_regression_predictions(tree_reg,X,y,axes=[0,1,-0.2,1],ylabel='$y$'):
    """Plot the samples and the predictions of ``tree_reg`` over 500 evenly spaced inputs.

    Parameters
    ----------
    tree_reg : object with a ``predict`` method accepting a (500, 1) array
    X, y : samples drawn as blue dots
    axes : [x_min, x_max, y_min, y_max]; the first two bound the prediction grid
        and the whole list sets the view limits
    ylabel : y-axis label; nothing is set when it is falsy
    """
    grid = np.linspace(axes[0],axes[1],500)[:,np.newaxis]
    grid_pred = tree_reg.predict(grid)
    plt.axis(axes)
    plt.xlabel('$x_1$',fontsize=18)
    if ylabel:
        plt.ylabel(ylabel,fontsize=18,rotation=0)
    plt.plot(X,y,'b.')
    plt.plot(grid,grid_pred,'r.-',linewidth=2,label=r'$\hat{y}$')

# Side-by-side predictions of tree_reg1 and tree_reg2 with hand-placed vertical guide lines.
plt.figure(figsize=(11,4))

# Left panel: (x position, line style, line width) of each guide line.
plt.subplot(121)
plot_regression_predictions(tree_reg1,X,y)
left_guides = ((0.1973,'k-',2),(0.0917,'k--',2),(0.7718,'k--',2))
for guide_x,guide_style,guide_width in left_guides:
    plt.plot([guide_x,guide_x],[-0.2,1],guide_style,linewidth=guide_width)
left_notes = ((0.21,0.65,'Depth=0',15),(0.01,0.2,'Depth=1',13),(0.65,0.8,'Depth=1',13))
for note_x,note_y,note_text,note_size in left_notes:
    plt.text(note_x,note_y,note_text,fontsize=note_size)
plt.legend(loc='upper center',fontsize=18)
plt.title('max_depth=2',fontsize=14)

# Right panel: the same three guide lines followed by four thinner dotted ones.
plt.subplot(122)
plot_regression_predictions(tree_reg2,X,y,ylabel=None)
right_guides = (
    (0.1973,'k-',2),(0.0917, "k--",2), (0.7718, "k--",2),
    (0.0458,'k:',1),(0.1298,'k:',1),(0.2873,'k:',1),(0.9040,'k:',1),
)
for guide_x,guide_style,guide_width in right_guides:
    plt.plot([guide_x,guide_x],[-0.2,1],guide_style,linewidth=guide_width)
plt.text(0.3,0.5,'Depth=2',fontsize=13)
plt.title('max_depth=3',fontsize=14)

save_fig('tree_regression_plot')
plt.show()

#export_graphviz(
#        tree_reg1,
#        out_file=image_path('regression_tree.dot'),
#        feature_names=['x1'],
#        rounded=True,
#        filled=True
#        )

# Unrestricted regressor vs. one requiring at least 10 samples per leaf, on the same data.
tree_reg1 = DecisionTreeRegressor(random_state=42)
tree_reg2 = DecisionTreeRegressor(random_state=42,min_samples_leaf=10)
for leaf_reg in (tree_reg1,tree_reg2):
    leaf_reg.fit(X,y)

x1 = np.linspace(0,1,500).reshape(-1,1)
y_pred1 = tree_reg1.predict(x1)
y_pred2 = tree_reg2.predict(x1)

plt.figure(figsize=(11,4))

# (subplot position, predictions, title, whether the panel gets a y label and legend)
regularization_panels = (
    (121,y_pred1,'No restrictions',True),
    (122,y_pred2,'min_sample_leaf={}'.format(tree_reg2.min_samples_leaf),False),
)
for panel_pos,panel_pred,panel_title,is_left_panel in regularization_panels:
    plt.subplot(panel_pos)
    plt.plot(X,y,'b.')
    plt.plot(x1,panel_pred,'r.-',linewidth=2,label=r'$\hat{y}$')
    plt.axis([0,1,-0.2,1.1])
    plt.xlabel('$x_1$',fontsize=18)
    if is_left_panel:
        plt.ylabel('$y$',fontsize=18,rotation=0)
        plt.legend(loc='upper center',fontsize=18)
    plt.title(panel_title,fontsize=14)

save_fig('tree_regression_regularization_plot')
plt.show()

# Exercise 7: train and fine-tune a Decision Tree for the moons dataset.
# NOTE: X, y and the train/test split variables are rebound here and reused by the code that follows.
from sklearn.datasets import make_moons
X,y = make_moons(n_samples=10000,noise=0.4,random_state=42)

# Hold out 20% of the samples as the test set.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Grid over max_leaf_nodes in 2..99 combined with min_samples_split in {2, 3, 4}.
from sklearn.model_selection import GridSearchCV
params = {'max_leaf_nodes':list(range(2,100)),'min_samples_split':[2,3,4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),params,n_jobs=-1,verbose=1)

grid_search_cv.fit(X_train,y_train)

# Bare expression: the selected estimator is only displayed when run interactively.
grid_search_cv.best_estimator_

# Accuracy of the fitted search object's predictions on the held-out test set (result discarded in a script).
from sklearn.metrics import accuracy_score
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test,y_pred)

# Exercise 8: grow a forest
from sklearn.model_selection import ShuffleSplit
n_trees = 1000
n_instances = 100

# n_trees random subsets of the training set, each holding n_instances samples.
rs =ShuffleSplit(n_splits=n_trees,test_size=len(X_train)-n_instances,random_state=42)
mini_sets = [
    (X_train[subset_index],y_train[subset_index])
    for subset_index,_ in rs.split(X_train)
]

# One unfitted copy of the best grid-search estimator per subset.
from sklearn.base import clone
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

# Fit each copy on its own subset and record its accuracy on the full test set.
accuracy_scores = []
for member,(X_subset,y_subset) in zip(forest,mini_sets):
    member.fit(X_subset,y_subset)
    accuracy_scores.append(accuracy_score(y_test,member.predict(X_test)))

np.mean(accuracy_scores)

# Row i holds the test-set predictions of tree i.
Y_pred = np.empty([n_trees,len(X_test)],dtype=np.uint8)
for prediction_row,member in zip(Y_pred,forest):
    prediction_row[:] = member.predict(X_test)

# Most frequent prediction per test sample across all trees, then its accuracy.
from scipy.stats import mode
y_pred_majority_votes, n_votes = mode(Y_pred,axis=0)

accuracy_score(y_test,y_pred_majority_votes.reshape([-1]))

你可能感兴趣的:(python)