method:
decision_path(X):返回样本 X 在决策树中经过的决策路径(节点指示矩阵)
# decision_tree
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris,fetch_20newsgroups,load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
def decision():
    """Train a decision tree on the Titanic data and export its structure.

    Downloads the Titanic passenger table, uses the ``pclass``, ``age`` and
    ``sex`` columns as features and ``survived`` as the target, fills missing
    ages with the column mean, encodes the records with a ``DictVectorizer``,
    fits a ``DecisionTreeClassifier(max_depth=10)``, prints its score on a
    25% hold-out split and writes the fitted tree to ``./tree.dot``.

    Returns:
        None
    """
    # Fetch the data.
    titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Select features and target. Copy the feature columns so the imputation
    # below writes to an independent frame instead of a slice of `titan`.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']

    # Missing-value handling: assign the filled column back rather than using
    # inplace=True on a column selection (chained assignment).
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient='records'))
    # Only transform the test set: re-fitting here would rebuild the feature
    # mapping from test data, so columns could differ from the training matrix.
    x_test = vectorizer.transform(x_test.to_dict(orient='records'))

    # Decision tree; max_depth affects the shape of the tree.
    dec = DecisionTreeClassifier(max_depth=10)
    dec.fit(x_train, y_train)
    print(dec.score(x_test, y_test))

    # Export the tree structure, labelling nodes with the vectorizer's real
    # feature names instead of a hard-coded list of six placeholders.
    export_graphviz(dec, out_file='./tree.dot', feature_names=vectorizer.feature_names_)
    return None
if __name__ == "__main__":
    # Run the decision-tree demo only when this file is executed as a script.
    decision()
# Export the decision tree structure.
# NOTE(review): this looks like a repeated excerpt of the call made inside
# decision(); in the visible code `dec` is only assigned inside that function,
# so it would be undefined at module level — confirm intent.
export_graphviz(dec,out_file='./tree.dot',feature_names=['1','2','3','4','5','6'])
# random_forest (grid search + cross-validation)
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris,fetch_20newsgroups,load_boston
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier
def decision():
    """Tune a random forest on the Titanic data with grid search.

    Downloads the Titanic passenger table, uses the ``pclass``, ``age`` and
    ``sex`` columns as features and ``survived`` as the target, fills missing
    ages with the column mean, encodes the records with a ``DictVectorizer``,
    then runs ``GridSearchCV`` (2-fold) over ``n_estimators`` and
    ``max_depth`` for a ``RandomForestClassifier``. Prints the score on a 25%
    hold-out split and the best parameter combination found.

    Returns:
        None
    """
    # Fetch the data.
    titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Select features and target. Copy the feature columns so the imputation
    # below writes to an independent frame instead of a slice of `titan`.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']

    # Missing-value handling: assign the filled column back rather than using
    # inplace=True on a column selection (chained assignment).
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient='records'))
    # Only transform the test set: re-fitting here would rebuild the feature
    # mapping from test data, so columns could differ from the training matrix.
    x_test = vectorizer.transform(x_test.to_dict(orient='records'))

    # Random forest.
    rf = RandomForestClassifier()
    param = {
        'n_estimators': [120, 200, 300, 500, 800, 1200],
        'max_depth': [5, 8, 15, 25, 30],
    }

    # Grid search with cross-validation.
    gc = GridSearchCV(rf, param_grid=param, cv=2)
    gc.fit(x_train, y_train)
    print('准确率', gc.score(x_test, y_test))
    print('查看选择模型', gc.best_params_)
    return None
if __name__ == "__main__":
    # Run the random-forest grid-search demo only when executed as a script.
    decision()