Decision Tree and Random Forest Example: Titanic Passenger Survival Prediction

1. Data Screenshot

(Screenshot of the Titanic training data, train.csv)
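If the screenshot is not legible, the same overview can be reproduced directly from the CSV. A small sketch, assuming train.csv is the standard Kaggle Titanic training file used in the code below:

import pandas as pd

titanic = pd.read_csv("train.csv")
print(titanic.head())  # first few rows of the raw data
titanic.info()         # column names, dtypes and missing-value counts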

2. Code

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz


def titanic():
    # 1. Load the data
    path = "train.csv"
    titanic = pd.read_csv(path)  # ,nrows=100)
    # print(titanic)
    # The dataset has the following columns:
    # PassengerId  Survived  Pclass  Name  Sex  Age  SibSp  Parch  Ticket  Fare  Cabin  Embarked
    # passenger id, survived (0/1), ticket class, name, sex, age,
    # siblings/spouses aboard, parents/children aboard, ticket number,
    # fare, cabin number, port of embarkation
    # Missing-value handling, option 1: drop every row containing NaN
    # titanic.dropna(axis=0, how="any", inplace=True)
    # Select the feature columns (.copy() avoids pandas' SettingWithCopyWarning
    # when Age is filled in below)
    x = titanic[["Pclass", "Sex", "Age"]].copy()
    # Target column
    y = titanic["Survived"]
    # print(x)
    # 2. Data preprocessing
    # 0) Check for missing values (each column can be checked first)
    # x_Pclass_exist_nan = x["Pclass"].isnull().values.any()
    # x_Sex_exist_nan = x["Sex"].isnull().values.any()
    # x_Age_exist_nan = x["Age"].isnull().values.any()
    # y_exist_nan = y.isnull().values.any()
    # print("Does Pclass contain NaN:", x_Pclass_exist_nan,
    #       "\nDoes Sex contain NaN:", x_Sex_exist_nan,
    #       "\nDoes Age contain NaN:", x_Age_exist_nan)
    # print("Does the target contain NaN:", y_exist_nan)
    # Missing-value handling, option 2: fill missing ages with the mean age
    x["Age"] = x["Age"].fillna(x["Age"].mean())
    # print(x)
    # Convert the DataFrame to a list of dicts (one dict per row)
    x = x.to_dict(orient="records")
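    # Each element of x is now a plain dict, e.g. roughly
    # {"Pclass": 3, "Sex": "male", "Age": 22.0}, which is the input
    # format DictVectorizer expects.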
    # 3. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
    # 4. Dictionary feature extraction
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
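    # DictVectorizer one-hot encodes the string feature Sex (Sex=female /
    # Sex=male) and passes the numeric features Age and Pclass through
    # unchanged; transfer.get_feature_names() (get_feature_names_out() in
    # newer scikit-learn versions) lists the resulting columns.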
    # 5. Random forest estimator, with grid search and cross-validation
    estimator = RandomForestClassifier()

    param_dict = {"n_estimators": [120, 200, 300, 500, 800, 1200],
                  "max_depth": [5, 8, 15, 25, 30]}
    estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
    estimator.fit(x_train, y_train)
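    # GridSearchCV evaluates every combination in param_dict with 3-fold
    # cross-validation and, with the default refit=True, refits the best
    # combination on the full training set, so estimator can be used
    # directly for prediction below.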


    # 6. Model evaluation
    # 1) Compare predictions with the true labels
    y_predict = estimator.predict(x_test)
    print("Predictions vs. true labels:\n", y_test == y_predict)
    # 2) Accuracy on the whole test set
    score = estimator.score(x_test, y_test)
    print("Accuracy:\n", score)

    # Results of the grid search and cross-validation
    print("Best parameters:\n", estimator.best_params_)
    print("Best cross-validation score:\n", estimator.best_score_)  # score on the validation folds
    print("Best estimator:\n", estimator.best_estimator_)
    print("Cross-validation results:\n", estimator.cv_results_)

    # 7. Visualize a tree (export_graphviz needs a single decision tree,
    # so export the first tree of the best random forest)
    export_graphviz(estimator.best_estimator_.estimators_[0],
                    out_file="./titanic.tree.dot", feature_names=transfer.get_feature_names())


if __name__ == '__main__':
    titanic()
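The script only writes the DOT file (titanic.tree.dot); to view the tree you still need Graphviz. A minimal sketch for rendering it to PNG, assuming the graphviz Python package and the Graphviz binaries are installed:

import graphviz

with open("./titanic.tree.dot") as f:
    dot_source = f.read()

# writes titanic_tree.png next to the script
graphviz.Source(dot_source).render("titanic_tree", format="png", cleanup=True)

Alternatively, the command line `dot -Tpng titanic.tree.dot -o titanic_tree.png` produces the same image.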

3. Results

Predictions vs. true labels:
 816    False
789     True
869    False
235    False
473     True
       ...  
174     True
723     True
350     True
399     True
194     True
Name: Survived, Length: 223, dtype: bool
Accuracy:
 0.7713004484304933
Best parameters:
 {'max_depth': 5, 'n_estimators': 120}
Best cross-validation score:
 0.8159011028966185
Best estimator:
 RandomForestClassifier(max_depth=5, n_estimators=120)
Cross-validation results:
 {'mean_fit_time': array([0.13299497, 0.21445537, 0.34839487, 0.54510999, 0.88557641,
       1.42996311, 0.14926664, 0.24255347, 0.38314478, 0.60953983,
       1.00982849, 1.51456213, 0.15878431, 0.26461275, 0.39483476,
       0.66195901, 1.05658213, 1.58990987, 0.16172369, 0.26080561,
       0.39532431, 0.66376098, 1.06954074, 1.62762157, 0.16098619,
       0.26103052, 0.3728656 , 0.65319649, 1.0090692 , 1.5897665 ]), 'std_fit_time': array([0.0020517 , 0.00352949, 0.04416501, 0.02231389, 0.03516453,
       0.08532633, 0.00261708, 0.00313477, 0.0252167 , 0.01115087,
       0.01610297, 0.02811479, 0.00479312, 0.00383677, 0.00866105,
       0.01563949, 0.0052264 , 0.00333935, 0.00379794, 0.00522534,
       0.01015787, 0.01360942, 0.02900932, 0.05176689, 0.00399215,
       0.01144514, 0.0095812 , 0.0154222 , 0.0168415 , 0.07855189]), 'mean_score_time': array([0.01129794, 0.01760912, 0.0269297 , 0.04188768, 0.06783303,
       0.12831823, 0.01164651, 0.01913222, 0.02795378, 0.0453968 ,
       0.07681274, 0.11271127, 0.01163594, 0.0202926 , 0.02926898,
       0.04853662, 0.078336  , 0.11386808, 0.01196702, 0.02012944,
       0.02891843, 0.04953392, 0.07670363, 0.12533696, 0.01163022,
       0.01925429, 0.02759449, 0.04856451, 0.07310772, 0.1094408 ]), 'std_score_time': array([1.21565231e-03, 4.64426949e-04, 1.40815497e-03, 8.77806426e-07,
       8.14279704e-04, 2.85228987e-02, 4.79448361e-04, 2.90959658e-04,
       7.97307765e-04, 6.99545003e-04, 8.38448756e-04, 1.41999678e-03,
       9.41111034e-04, 1.68392675e-03, 1.67678405e-03, 1.88199732e-03,
       1.20660599e-03, 2.76808664e-03, 8.13614765e-04, 2.29377756e-04,
       8.14112007e-04, 3.08379231e-03, 1.06598089e-03, 9.68575782e-03,
       4.79575098e-04, 4.60008011e-04, 1.24437035e-03, 1.69949754e-03,
       2.60600291e-03, 2.12582777e-03]), 'param_max_depth': masked_array(data=[5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 15, 15, 15, 15, 15,
                   15, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[120, 200, 300, 500, 800, 1200, 120, 200, 300, 500, 800,
                   1200, 120, 200, 300, 500, 800, 1200, 120, 200, 300,
                   500, 800, 1200, 120, 200, 300, 500, 800, 1200],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 5, 'n_estimators': 120}, {'max_depth': 5, 'n_estimators': 200}, {'max_depth': 5, 'n_estimators': 300}, {'max_depth': 5, 'n_estimators': 500}, {'max_depth': 5, 'n_estimators': 800}, {'max_depth': 5, 'n_estimators': 1200}, {'max_depth': 8, 'n_estimators': 120}, {'max_depth': 8, 'n_estimators': 200}, {'max_depth': 8, 'n_estimators': 300}, {'max_depth': 8, 'n_estimators': 500}, {'max_depth': 8, 'n_estimators': 800}, {'max_depth': 8, 'n_estimators': 1200}, {'max_depth': 15, 'n_estimators': 120}, {'max_depth': 15, 'n_estimators': 200}, {'max_depth': 15, 'n_estimators': 300}, {'max_depth': 15, 'n_estimators': 500}, {'max_depth': 15, 'n_estimators': 800}, {'max_depth': 15, 'n_estimators': 1200}, {'max_depth': 25, 'n_estimators': 120}, {'max_depth': 25, 'n_estimators': 200}, {'max_depth': 25, 'n_estimators': 300}, {'max_depth': 25, 'n_estimators': 500}, {'max_depth': 25, 'n_estimators': 800}, {'max_depth': 25, 'n_estimators': 1200}, {'max_depth': 30, 'n_estimators': 120}, {'max_depth': 30, 'n_estimators': 200}, {'max_depth': 30, 'n_estimators': 300}, {'max_depth': 30, 'n_estimators': 500}, {'max_depth': 30, 'n_estimators': 800}, {'max_depth': 30, 'n_estimators': 1200}], 'split0_test_score': array([0.79820628, 0.79820628, 0.79820628, 0.79820628, 0.79820628,
       0.79820628, 0.75784753, 0.77130045, 0.75784753, 0.76233184,
       0.75784753, 0.75784753, 0.78026906, 0.78026906, 0.78026906,
       0.78475336, 0.78475336, 0.78475336, 0.79372197, 0.78026906,
       0.78475336, 0.78475336, 0.78026906, 0.78026906, 0.78475336,
       0.78026906, 0.78026906, 0.78475336, 0.78026906, 0.78475336]), 'split1_test_score': array([0.81165919, 0.81165919, 0.81165919, 0.81165919, 0.81165919,
       0.81165919, 0.79820628, 0.80269058, 0.80269058, 0.80269058,
       0.79372197, 0.79372197, 0.79820628, 0.78026906, 0.78475336,
       0.78026906, 0.78026906, 0.78026906, 0.78026906, 0.78026906,
       0.78026906, 0.78026906, 0.78026906, 0.78026906, 0.78026906,
       0.78026906, 0.78026906, 0.78026906, 0.78026906, 0.78026906]), 'split2_test_score': array([0.83783784, 0.83783784, 0.83333333, 0.83783784, 0.83783784,
       0.81081081, 0.85135135, 0.82432432, 0.83783784, 0.83333333,
       0.83783784, 0.84684685, 0.81081081, 0.82432432, 0.82882883,
       0.82882883, 0.84684685, 0.82882883, 0.85135135, 0.84684685,
       0.82882883, 0.82882883, 0.84234234, 0.84234234, 0.82432432,
       0.81531532, 0.82432432, 0.82882883, 0.84684685, 0.82882883]), 'mean_test_score': array([0.8159011 , 0.8159011 , 0.8143996 , 0.8159011 , 0.8159011 ,
       0.80689209, 0.80246839, 0.79943845, 0.79945865, 0.79945192,
       0.79646911, 0.79947212, 0.79642872, 0.79495415, 0.79795042,
       0.79795042, 0.80395642, 0.79795042, 0.80844746, 0.80246165,
       0.79795042, 0.79795042, 0.80096015, 0.80096015, 0.79644892,
       0.79195114, 0.79495415, 0.79795042, 0.80246165, 0.79795042]), 'std_test_score': array([0.0164552 , 0.0164552 , 0.01447089, 0.0164552 , 0.0164552 ,
       0.00615156, 0.03829156, 0.02176871, 0.03273577, 0.02907656,
       0.03271363, 0.03656061, 0.01253181, 0.02076785, 0.02191095,
       0.02191095, 0.03038331, 0.02191095, 0.03083075, 0.03138507,
       0.02191095, 0.02191095, 0.02926163, 0.02926163, 0.01979572,
       0.01652096, 0.02076785, 0.02191095, 0.03138507, 0.02191095]), 'rank_test_score': array([ 1,  1,  5,  1,  1,  7,  9, 17, 15, 16, 25, 14, 27, 28, 18, 18,  8,
       18,  6, 10, 18, 18, 12, 12, 26, 30, 28, 18, 10, 18])}
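The raw cv_results_ dictionary above is hard to read. A small sketch for turning it into a sorted table, assuming access to the fitted GridSearchCV object (called estimator in the code above):

import pandas as pd

results = pd.DataFrame(estimator.cv_results_)
cols = ["param_max_depth", "param_n_estimators",
        "mean_test_score", "std_test_score", "rank_test_score"]
print(results[cols].sort_values("rank_test_score").head(10))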
