泰坦尼克号(决策树)

# encoding: utf-8
# 导入模块
import pandas as pd
import graphviz
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer


# Load the Titanic passenger data set.
path = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt'
titanic = pd.read_csv(path)


# Select the feature columns and the target column.
# .copy() makes `x` an independent frame, so filling 'age' below never writes
# into a slice of `titanic` (the cause of pandas' SettingWithCopyWarning).
x = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']


# Preprocess: replace missing ages with the mean age.
# Assign the result back instead of a chained `inplace=True` call, which is
# not guaranteed to modify `x`.
x['age'] = x['age'].fillna(x['age'].mean())
# Convert each row to a dict (one dict per passenger) for DictVectorizer.
# The orient must be spelled 'records'; misspellings raise ValueError in
# current pandas.
x = x.to_dict(orient='records')
# Split into training and test sets (train_test_split defaults).
x_train,x_test,y_train,y_test = train_test_split(x,y)


# Dict feature extraction: learn the feature mapping from the training
# records only, then apply the same mapping to the test records.
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)


# Build the model: a decision tree that splits on entropy.
# fit() returns the estimator itself, so construction and training chain.
model = DecisionTreeClassifier(criterion='entropy').fit(x_train, y_train)
y_predict = model.predict(x_test)
# Element-wise comparison of predictions against the true labels.
print('预测值与真实值比对:\n', y_predict == y_test)
# Accuracy on the held-out test set.
score = model.score(x_test, y_test)
print('准确率:\n', score)
# Visualize the decision tree.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on versions that lack it.
if hasattr(transfer, 'get_feature_names_out'):
    feature_names = list(transfer.get_feature_names_out())
else:
    feature_names = transfer.get_feature_names()
# out_file=None makes export_graphviz return the DOT source as a string.
image = export_graphviz(
    model,
    out_file=None,
    feature_names=feature_names,
    filled=True,
    node_ids=True,
    rounded=True)
# As the last expression of a notebook cell this renders the graph inline.
graphviz.Source(image)
d:\env_py36\py36\lib\site-packages\pandas\core\generic.py:6245: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


预测值与真实值比对:
 914     False
369      True
675      True
1107     True
181     False
        ...  
706      True
718      True
1220     True
553      True
552      True
Name: survived, Length: 329, dtype: bool
准确率:
 0.7811550151975684

你可能感兴趣的:(泰坦尼克号(决策树))