# Create a noisy sine curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(1)  # fix the random seed for reproducibility
x = np.sort(5*rng.rand(80,1), axis=0)  # 80 random values in [0, 5), sorted
y = np.sin(x).ravel()  # sine values, flattened to 1-D
y[::5] += 3*(0.5 - rng.rand(16))  # add noise to every 5th point (16 points in total)
# Instantiate and train the models
regr1 = DecisionTreeRegressor(max_depth=2)
regr2 = DecisionTreeRegressor(max_depth=5)
regr1 = regr1.fit(x, y)
regr2 = regr2.fit(x, y)
# Feed a test grid into the models and predict
xtest = np.arange(0, 5, 0.01)[:, np.newaxis]
y_1 = regr1.predict(xtest)
y_2 = regr2.predict(xtest)
# Plot the results
plt.figure()
plt.scatter(x, y, s=20, edgecolors='black', c='darkorange', label='data')
plt.plot(xtest, y_1, color='cornflowerblue', label='max_depth=2', linewidth=2)
plt.plot(xtest, y_2, color='yellowgreen', label='max_depth=5', linewidth=2)
plt.legend()
plt.title("DecisionTreeRegressor")
plt.show()
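The depth-2 tree captures the broad shape of the sine curve, while the depth-5 tree also chases the noisy points. To see exactly which splits each tree learned, the rules can be dumped as text; here is a minimal sketch using scikit-learn's export_text (the feature name "x" is just a label supplied here, not something from the code above):

# Print the learned split rules as text (assumes regr1 and regr2 from above)
from sklearn.tree import export_text
print(export_text(regr1, feature_names=["x"]))  # a few coarse splits
print(export_text(regr2, feature_names=["x"]))  # many fine splits that follow the noise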
1. Import the required packages and load the data (downloaded from Kaggle)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
data = pd.read_csv(r".\titanic\train.csv")
data.info()
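data.info() shows the non-null count of every column; Age, Cabin, and Embarked contain missing values. As an optional extra check (not part of the original steps), the missing values can be counted directly:

# Count missing values per column (optional sanity check)
print(data.isnull().sum())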
3. Preprocess the data
# Drop redundant columns
data_new = data.drop(["Name","Ticket","Cabin"], axis=1)
# axis=1 means the operation applies to columns
# Fill missing ages with the column mean
data_new["Age"] = data_new["Age"].fillna(data_new["Age"].mean())
# Drop the rows where Embarked is missing
# (note: dropna returns a new DataFrame, so it must be assigned back)
data_new = data_new.dropna(axis=0)
data_new.info()
# Encode the Sex and Embarked fields as numbers; two different approaches are shown
labels = data_new["Embarked"].unique().tolist()
data_new["Embarked"] = data_new["Embarked"].apply(lambda x: labels.index(x))  # map each category to its index
data_new["Sex"] = (data_new["Sex"] == "male").astype("int")  # boolean comparison cast to 0/1
# Basic preprocessing is now complete
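Both encodings above replace the text categories with integers in place. An equivalent alternative, shown here only as a sketch, is one-hot encoding with pandas, which avoids implying an order among the Embarked categories:

# Alternative: one-hot encode Embarked instead of integer codes (illustrative only)
data_onehot = pd.get_dummies(data_new, columns=["Embarked"])
data_onehot.head()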
4. Separate the features from the label with boolean indexing, then split into training and test sets
x = data_new.iloc[:, data_new.columns != "Survived"]
y = data_new.iloc[:, data_new.columns == "Survived"]
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)
# Reset the indices so each split is numbered 0..n-1
for i in [xtrain, xtest, ytrain, ytest]:
    i.index = range(i.shape[0])
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(xtrain,ytrain)
score = clf.score(xtest,ytest)
score
Without tuning any parameters, the prediction score is 0.7469611848825333, which is not ideal, so we use cross-validation to validate the result further.
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(xtrain,ytrain)
score_mean = cross_val_score(clf,x,y,cv=10).mean()
score_mean
This time score_mean is only 0.7469611848825333, so the parameters need tuning. The first candidate is max_depth: plot a hyperparameter curve to find a suitable value.
# The cross-validation score is no higher than the single-split score, so we tune parameters, starting with max_depth
score_test = []
score_train = []
for i in range(10):
    clf = DecisionTreeClassifier(random_state=25, max_depth=i+1)
    clf = clf.fit(xtrain, ytrain)
    score_tr = clf.score(xtrain, ytrain)                 # training-set score
    score_te = cross_val_score(clf, x, y, cv=10).mean()  # 10-fold CV score
    score_train.append(score_tr)
    score_test.append(score_te)
print(max(score_test))
# Plot the hyperparameter curves
plt.plot(range(1,11), score_train, color="red", label="train")
plt.plot(range(1,11), score_test, color="blue", label="test")
plt.legend()
plt.xticks(range(1,11))
plt.show()
The curves show that once max_depth > 3 the training score is far above the test score, i.e. the model overfits, so max_depth=3 is a reasonable choice.
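To see what a depth-3 tree actually learns, it can be drawn directly; a minimal sketch assuming a recent scikit-learn (plot_tree was added in 0.21) and the xtrain/ytrain split from above (the class names are labels supplied here for readability):

# Visualize a depth-3 tree fitted on the training split (illustrative sketch)
from sklearn.tree import plot_tree
clf3 = DecisionTreeClassifier(random_state=25, max_depth=3).fit(xtrain, ytrain)
plt.figure(figsize=(12, 6))
plot_tree(clf3, feature_names=list(x.columns), class_names=["died", "survived"], filled=True)
plt.show()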
# Switch criterion to "entropy" and watch how the curves change
score_test = []
score_train = []
for i in range(10):
    clf = DecisionTreeClassifier(random_state=25, max_depth=i+1, criterion="entropy")
    clf = clf.fit(xtrain, ytrain)
    score_tr = clf.score(xtrain, ytrain)
    score_te = cross_val_score(clf, x, y, cv=10).mean()
    score_train.append(score_tr)
    score_test.append(score_te)
print(max(score_test))
# Plot the curves
plt.plot(range(1,11),score_train,color="red",label="train")
plt.plot(range(1,11),score_test,color="blue",label="test")
plt.xticks(range(1,11))
plt.legend()
plt.show()
The plot shows that criterion="entropy" brings the train and test scores closer together at max_depth=3, and the best score improves slightly.
6. Use grid search to find suitable values for several parameters at once
# parameters: a dict mapping each parameter name to the range of values we want grid search to try
parameters = {
    "splitter": ("best", "random"),
    "min_samples_leaf": [*range(1, 20, 5)],
    "min_impurity_decrease": [*np.linspace(0, 0.5, 20)],
}
clf = DecisionTreeClassifier(random_state=25, max_depth=3, criterion="entropy")
GS = GridSearchCV(clf,parameters,cv=10)
GS = GS.fit(xtrain,ytrain)
GS.best_score_
GS.best_params_
The best score has now risen to 0.824, the best result obtained with the classification tree so far; later posts will try other algorithms on this problem.
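As a final check (an extra step beyond the walkthrough above), GridSearchCV refits the best parameter combination on its training data by default (refit=True), so the tuned model can be scored directly on the held-out split:

# Score the refitted best estimator on the held-out test split
print(GS.best_estimator_)
print(GS.score(xtest, ytest))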