Python: A Prediction Model for the Titanic Dataset

Preface: I spent the whole of June on statistics and finally passed the CDA Level 1 exam, so I can at last get started on machine learning properly.
I began with Tang Yudi's video course 《Python数据分析与机器学习实战》 (Python Data Analysis and Machine Learning in Practice). The videos were recorded in 2017 with Python 2.7, so some of the code no longer runs; I have updated it for Python 3.6 and the current scikit-learn API.

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re

#1. Load the data
pd.set_option('display.max_columns', None)  # show all columns when printing
titanic = pd.read_csv('titanic_train.csv')
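# A quick check of column types and missing values motivates the cleaning step below (a sketch):
# print(titanic.info())  # shows the missing Age and Embarked values that step 2 fills in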

#2. Clean the data
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())  # fill missing ages with the median age
# print(titanic['Sex'].unique())
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1
# print(titanic['Embarked'].unique())
# print(titanic['Embarked'].value_counts())
titanic['Embarked'] = titanic['Embarked'].fillna('S')  # 'S' is the most common embarkation port
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
# print(titanic)
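# A more idiomatic alternative to the .loc assignments above in recent pandas is map()
# (a sketch, equivalent result):
# titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
# titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})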

#3. Linear regression (accuracy: 0.78338)
from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.model_selection import KFold  # K-fold cross-validation

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
# Recent scikit-learn only accepts random_state together with shuffle=True; the folds must stay
# unshuffled anyway so the concatenated predictions line up with the original row order.
kf = KFold(n_splits=3).split(titanic)
predictions = []
for train, test in kf:
    train_predictors = titanic[predictors].iloc[train, :]
    # print(train_predictors)
    train_target = titanic['Survived'].iloc[train]
    # print(train_target)
    alg.fit(train_predictors, train_target)  # train on this fold's training rows
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])  # predict the held-out rows
    # print(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)
predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1  # threshold the regression output at 0.5
predictions[predictions <= .5] = 0
accuracy = (predictions == titanic['Survived']).mean()  # fraction of correct out-of-fold predictions
print(accuracy)

#4. Logistic regression (accuracy: 0.78787)
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression  # logistic regression

alg = LogisticRegression(random_state=1, solver='liblinear')
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())

# Predicting the test set (the test set has no labels, so we can only generate predictions, not score them)
# titanic_test = pd.read_csv('test.csv')
# titanic_test['Age'] = titanic_test['Age'].fillna(titanic_test['Age'].median())
# titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())
# titanic_test.loc[titanic_test['Sex'] == 'male', 'Sex'] = 0
# titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1
# titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
# titanic_test.loc[titanic_test['Embarked'] == 'S', 'Embarked'] = 0
# titanic_test.loc[titanic_test['Embarked'] == 'C', 'Embarked'] = 1
# titanic_test.loc[titanic_test['Embarked'] == 'Q', 'Embarked'] = 2
# alg.fit(titanic[predictors], titanic['Survived'])
# test_value = alg.predict(titanic_test[predictors])
# print(test_value)
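# A sketch of writing a Kaggle submission file from those predictions (commented out like the
# block above; it assumes the standard test.csv, which has a PassengerId column):
# submission = pd.DataFrame({'PassengerId': titanic_test['PassengerId'], 'Survived': test_value})
# submission.to_csv('submission.csv', index=False)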

#5. Random forest (accuracy: 0.82379)
from sklearn.ensemble import RandomForestClassifier

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=3)
kf = model_selection.KFold(n_splits=3).split(titanic)  # unshuffled folds, as in step 3
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
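# The forest's parameters above were presumably tuned by hand; a sketch of searching them
# automatically with GridSearchCV (the grid values here are illustrative assumptions, not from the course):
# from sklearn.model_selection import GridSearchCV
# param_grid = {'n_estimators': [25, 50, 100], 'min_samples_split': [2, 4, 8], 'min_samples_leaf': [1, 3, 5]}
# search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
# search.fit(titanic[predictors], titanic['Survived'])
# print(search.best_params_, search.best_score_)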

#6. Create new features
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']  # number of relatives aboard
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))  # length of the passenger's name

def get_title(name):
    # A title is a run of letters followed by a period, e.g. 'Mr.' or 'Mrs.'
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''

titles = titanic['Name'].apply(get_title)
title_mapping = {
    'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6, 'Major': 7, 'Col': 7,
    'Mlle': 8, 'Mme': 9, 'Lady': 10, 'Countess': 11, 'Capt': 12, 'Don': 13, 'Sir': 14,
    'Jonkheer': 15, 'Ms': 16}
for k, v in title_mapping.items():
    titles[titles == k] = v  # replace each title string with its numeric code
# print(titles.value_counts())
titanic['Title'] = titles

#7. Re-select features (some were ultimately dropped)
from sklearn.feature_selection import SelectKBest, f_classif

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'Title', 'NameLength']
selector = SelectKBest(f_classif, k=5)  # keep the 5 features with the strongest relationship to survival
selector.fit(titanic[predictors], titanic['Survived'])
scores = -np.log10(selector.pvalues_)  # convert p-values so that higher bars mean more predictive features
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
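# A sketch of retraining on only the selected features; get_support() reads the k best columns
# off the fitted selector:
best_features = [p for p, keep in zip(predictors, selector.get_support()) if keep]
scores = model_selection.cross_val_score(
    RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=3),
    titanic[best_features], titanic['Survived'], cv=3)
print(best_features, scores.mean())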

#8. Ensemble of gradient boosting and logistic regression (accuracy: 0.82154)
from sklearn.ensemble import GradientBoostingClassifier
ensemble_predictors = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ensemble_predictors],
    [LogisticRegression(random_state=1, solver='liblinear'), ensemble_predictors]
]
kf = KFold(n_splits=3).split(titanic)  # unshuffled folds keep predictions in row order
predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    for alg, predictor_list in algorithms:
        alg.fit(titanic[predictor_list].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(titanic[predictor_list].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2  # average the two models' predicted probabilities
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)
predictions = np.concatenate(predictions, axis=0)
accuracy = (predictions == titanic['Survived']).mean()  # fraction of correct out-of-fold predictions
print(accuracy)
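# To submit the ensemble, titanic_test would first need the same preprocessing as the training
# set, including the FamilySize, NameLength and Title features; a sketch under that assumption:
# full_predictions = []
# for alg, predictor_list in algorithms:
#     alg.fit(titanic[predictor_list], titanic['Survived'])
#     full_predictions.append(alg.predict_proba(titanic_test[predictor_list].astype(float))[:, 1])
# final = ((full_predictions[0] + full_predictions[1]) / 2 > .5).astype(int)
# pd.DataFrame({'PassengerId': titanic_test['PassengerId'], 'Survived': final}).to_csv('submission.csv', index=False)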
