Regression Tree

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
# Build a noisy 1-D sine dataset: 80 points in [0, 5), with noise added to every 5th target
rng = np.random.RandomState(1)
x = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(x).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))

# Fit two regression trees of different depths to contrast under- and over-fitting
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_1.fit(x, y)
regr_2.fit(x, y)

# Predict on a dense grid so the step-wise predictions plot smoothly
x_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_1 = regr_1.predict(x_test)
y_2 = regr_2.predict(x_test)
# Plot the noisy samples and both fitted trees
plt.figure()
plt.scatter(x, y, s=20, edgecolors="black", c="darkorange", label="data")
plt.plot(x_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(x_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
```
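
A regression tree can also be scored with k-fold cross-validation on a real regression dataset. A minimal sketch of that idea, using scikit-learn's fetch_california_housing purely as an illustrative dataset (it is not used anywhere else in this section):

```python
# Sketch only: cross-validate a regression tree on a real dataset.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

housing = fetch_california_housing()
regressor = DecisionTreeRegressor(random_state=0)

# 10-fold cross-validation; the default scoring for a regressor is R^2,
# pass scoring="neg_mean_squared_error" to evaluate MSE instead.
scores = cross_val_score(regressor, housing.data, housing.target, cv=10)
print(scores.mean())
```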

Decision Tree Example

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
# Load the Titanic training data and drop columns that are hard to use directly
data = pd.read_csv(r"train.csv")
data.drop(['Cabin', 'Name', 'Ticket'], inplace=True, axis=1)

# Fill missing ages with the mean, then drop any remaining rows with missing values
data['Age'] = data['Age'].fillna(data['Age'].mean())
data = data.dropna()

# Encode the categorical columns as integers
labels = data['Embarked'].unique().tolist()
print(labels)
data['Embarked'] = data['Embarked'].apply(lambda x: labels.index(x))
data['Sex'] = (data['Sex'] == 'male').astype('int')
# Split features and target, then hold out 30% of the rows for testing
x = data.iloc[:, data.columns != 'Survived']
y = data.iloc[:, data.columns == 'Survived']
xtrain, xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.3)

# Reset the row indices after the shuffle/split
for i in [xtrain, xtest, Ytrain, Ytest]:
    i.index = range(i.shape[0])
# Baseline accuracy from a single train/test split
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(xtrain, Ytrain)
score = clf.score(xtest, Ytest)

# Baseline from 10-fold cross-validation on the test split
clf = DecisionTreeClassifier(random_state=25)
score1 = cross_val_score(clf, xtest, Ytest, cv=10).mean()
# Learning curve over max_depth: compare training accuracy with cross-validated accuracy
train = []
test = []
for i in range(10):
    clf = DecisionTreeClassifier(random_state=25,
                                 max_depth=i + 1,
                                 criterion="entropy")
    clf = clf.fit(xtrain, Ytrain)
    score_train = clf.score(xtrain, Ytrain)
    score_test = cross_val_score(clf, xtest, Ytest, cv=10).mean()
    train.append(score_train)
    test.append(score_test)
print(max(test))
# Plot the learning curve
plt.plot(range(1, 11), train, color="red", label="train")
plt.plot(range(1, 11), test, color="blue", label="test")
plt.xticks(range(1, 11))
plt.legend()
plt.show()
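
# Not shown in the original: the depth with the best cross-validated score can also
# be reported directly from the "test" list rather than read off the plot.
print("best max_depth:", test.index(max(test)) + 1)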
# Grid search over the main pruning parameters (this grid is large, so the search can take a while)
gini_thresholds = np.linspace(0, 0.5, 50)
parameters = {"criterion": ("gini", "entropy"),
              "splitter": ("best", "random"),
              "max_depth": [*range(1, 10)],
              "min_samples_leaf": [*range(1, 50, 5)],
              "min_impurity_decrease": [*gini_thresholds]}
# Run the grid search and report the best parameter combination found
clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10)
GS = GS.fit(xtrain, Ytrain)
print(GS.best_params_)
print(GS.best_score_)
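
# Not in the original: one possible follow-up is to score the tuned model on the
# held-out split. GridSearchCV refits best_estimator_ on the training data by
# default (refit=True), so it can be evaluated directly.
print(GS.best_estimator_.score(xtest, Ytest))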
```

Titanic dataset (train.csv): message me privately and I will send it.