建模之泰坦尼克生存预测

作者:北有鸣鹿
个人辛苦之作,请勿随意转载,必追究责任。如需转载,请联系我

#https://www.kaggle.com/c/titanic
#本次建模的模型的最高准确率0.9665,AUC为0.9666(注:以 gender_submission.csv 中的基准答案作为测试标签计算,并非 Kaggle 真实测试集成绩)
#1. Load the data sets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# train.csv carries the Survived label; test.csv does not (Kaggle layout).
train = pd.read_csv("all/train.csv", header=0)
test = pd.read_csv("all/test.csv")

# 2. Overall inspection of both data sets
#
# Data dictionary:
#   Survived  target             0 = no, 1 = yes
#   Pclass    ticket class       1 = 1st, 2 = 2nd, 3 = 3rd
#   Name      passenger name
#   Sex       sex
#   Age       age in years
#   SibSp     siblings / spouses aboard
#   Parch     parents / children aboard
#   Ticket    ticket number
#   Fare      passenger fare
#   Cabin     cabin number
#   Embarked  port: C = Cherbourg, Q = Queenstown, S = Southampton

# Column dtypes, non-null counts and memory usage (prints directly).
for frame in (train, test):
    frame.info()
# Peek at the first rows of each frame.
for frame in (train, test):
    frame.head()
# Summary statistics for the numeric columns.
for frame in (train, test):
    frame.describe()
# Explicit dtype listing per column.
for frame in (train, test):
    frame.dtypes
# Missing-value counts per column.
for frame in (train, test):
    frame.isnull().sum()

# Check the 0/1 balance of the target to see whether the data is imbalanced.
# FIX: pd.value_counts() was deprecated in pandas 1.x and removed in 2.0 —
# call value_counts() on the Series instead.  The original also called
# .value_counts() on the counts themselves, a no-op that is dropped here.
count_train = train['Survived'].value_counts(sort=True).sort_index()
count_train.plot(kind='bar')
plt.show()

# 3. Per-column preprocessing
# (1) Drop identifier-like columns that are not used as features here.
unused_columns = ['PassengerId', 'Name', 'Ticket']
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)
# (2) Encode Sex numerically: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

# (3) Age: inspect, impute missing values, and standardise.
train['Age'].describe()
train['Age'].isnull().sum()
# Histogram of the observed (non-missing) ages.
train['Age'].dropna().hist(bins=16, range=(0, 80), alpha=.5)
plt.show()
# BUG FIX (data leakage): the original filled test Age with the *test*
# mean and fitted a second StandardScaler on the test set.  Imputation
# and scaling statistics must come from the training data only, then be
# applied unchanged to the test set.
from sklearn.preprocessing import StandardScaler
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)
age_scaler = StandardScaler().fit(train['Age'].values.reshape(-1, 1))
train['Age'] = age_scaler.transform(train['Age'].values.reshape(-1, 1))
test['Age'] = age_scaler.transform(test['Age'].values.reshape(-1, 1))

# (4) Cabin: mostly missing, so collapse it into a presence indicator
# (1 = a cabin number was recorded, 0 = missing).
# FIX (idiom): the original replaced NaN with an 'unknown' sentinel and
# looped over every row appending 0/1 to a list; notna() expresses the
# same thing directly and vectorised.
train['Cabin'] = train['Cabin'].notna().astype(int)
test['Cabin'] = test['Cabin'].notna().astype(int)
       
# (5) Embarked: fill the few missing values with the mode, then one-hot encode.
# FIX: pd.value_counts() was removed in pandas 2.0 — use the Series method;
# fillna('S') states the imputation intent more directly than
# replace(np.nan, 'S').
train['Embarked'].value_counts()  # inspect: 'S' is the most frequent port
train['Embarked'] = train['Embarked'].fillna('S')
# Expand into Embarked_C / Embarked_Q / Embarked_S dummy columns.
# NOTE(review): get_dummies is applied per-frame; if one port were absent
# from either frame the column sets would diverge — confirm both frames
# contain all of C/Q/S.
train = train.join(pd.get_dummies(train['Embarked'], prefix='Embarked'))
test = test.join(pd.get_dummies(test['Embarked'], prefix='Embarked'))
train = train.drop(['Embarked'], axis=1)
test = test.drop(['Embarked'], axis=1)

# (6) Fare: one value is missing in the test set; impute and standardise.
# BUG FIX (data leakage): the original imputed with the *test* mean and
# fitted a second StandardScaler on the test set — both statistics must
# come from the training data.  The duplicate StandardScaler import is
# dropped (it is already imported above).
fare_mean = train['Fare'].mean()
test['Fare'] = test['Fare'].fillna(fare_mean)
fare_scaler = StandardScaler().fit(train['Fare'].values.reshape(-1, 1))
train['Fare'] = fare_scaler.transform(train['Fare'].values.reshape(-1, 1))
test['Fare'] = fare_scaler.transform(test['Fare'].values.reshape(-1, 1))

# Visual check for outliers in the two continuous columns of each frame.
for frame in (train, test):
    for column in ('Age', 'Fare'):
        plt.boxplot(frame[column])
        plt.show()

# Winsorise ("cap") outliers: values below the 5th percentile are raised
# to it; values above the 95th percentile are lowered to it.
# FIX (decomposition): the original duplicated the same capping loop
# verbatim for train and test; a single helper removes the duplication.
# NOTE(review): as in the original, the caps are computed per-frame, so
# the test set is clipped by its *own* percentiles — deriving both caps
# from the training data would avoid using test-set statistics.
def _cap_outliers(values):
    """Clip a 1-D sequence to its own [5th, 95th] percentile range."""
    low = np.percentile(values, 5)
    high = np.percentile(values, 95)
    return [min(max(v, low), high) for v in values]

for frame in (train, test):
    for column in ('Age', 'Fare'):
        frame[column] = _cap_outliers(frame[column])

# Association between each feature and the target.
from scipy import stats
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Continuous feature vs binary target: Welch two-sample t-test.
# BUG FIX: the original ran ttest_ind(train['Age'], train['Survived']),
# i.e. it compared the feature column against the 0/1 label values
# themselves, which is meaningless.  The two-sample t-test must compare
# the feature's distribution *between* the two outcome groups.
survived = train['Survived'] == 1
stats.ttest_ind(train.loc[survived, 'Age'],
                train.loc[~survived, 'Age'], equal_var=False)
stats.ttest_ind(train.loc[survived, 'Fare'],
                train.loc[~survived, 'Fare'], equal_var=False)
# Categorical features vs categorical target: chi-squared scores.
# (All remaining features are non-negative integers/dummies, as chi2 requires.)
features = [x for x in train.columns
            if x not in ['Age', 'Fare', 'Survived']]
model = SelectKBest(chi2, k=2)
model.fit_transform(train[features], train['Survived'])
print(model.pvalues_)

# Assemble design matrices and label vectors.
y_train = train['Survived']
x_train = train.drop(['Survived'], axis=1)
x_test = test
# NOTE(review): gender_submission.csv is Kaggle's sample baseline
# submission ("every female survives"), not the true test labels — the
# scores computed below measure agreement with that baseline, which
# presumably inflates the reported accuracy/AUC.
y_test = pd.read_csv("all/gender_submission.csv")
y_test = y_test['Survived']

# Drop the dummy that showed a weak association with the target above.
x_train = x_train.drop(['Embarked_Q'], axis=1)
x_test = x_test.drop(['Embarked_Q'], axis=1)

# Model 1: logistic regression (L2-regularised)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

lrmodel = LogisticRegression(penalty='l2')
lrmodel.fit(x_train, y_train)
ypred_lr = lrmodel.predict(x_test)

# Confusion matrix against the reference labels.
print('confusion_matrix')
print(metrics.confusion_matrix(y_test, ypred_lr))
# Per-class precision / recall / F1.
print('classification_report')
print(metrics.classification_report(y_test, ypred_lr))
# Overall accuracy and area under the ROC curve.
accuracy_lr = metrics.accuracy_score(y_test, ypred_lr)
auc_lr = metrics.roc_auc_score(y_test, ypred_lr)
print('Accuracy:%f' % accuracy_lr)
print('Area under the curve:%f' % auc_lr)

# Model 2: random forest with 100 trees
from sklearn.ensemble import RandomForestClassifier

rfmodel = RandomForestClassifier(n_estimators=100)
rfmodel.fit(x_train, y_train)
ypred_rf = rfmodel.predict(x_test)

# Evaluate: confusion matrix, class report, accuracy, AUC.
cm_rf = metrics.confusion_matrix(y_test, ypred_rf)
report_rf = metrics.classification_report(y_test, ypred_rf)
print('confusion_matrix')
print(cm_rf)
print('classification_report')
print(report_rf)
print('Accuracy:%f' % metrics.accuracy_score(y_test, ypred_rf))
print('Area under the curve:%f' % metrics.roc_auc_score(y_test, ypred_rf))

# Model 3: k-nearest neighbours (scikit-learn default k)
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
ypred_knn = knn.predict(x_test)

# Evaluate: confusion matrix, class report, accuracy, AUC.
cm_knn = metrics.confusion_matrix(y_test, ypred_knn)
report_knn = metrics.classification_report(y_test, ypred_knn)
print('confusion_matrix')
print(cm_knn)
print('classification_report')
print(report_knn)
print('Accuracy:%f' % metrics.accuracy_score(y_test, ypred_knn))
print('Area under the curve:%f' % metrics.roc_auc_score(y_test, ypred_knn))

# Model 4: linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
lda.fit(x_train, y_train)
ypred_lda = lda.predict(x_test)

# Evaluate: confusion matrix, class report, accuracy, AUC.
print('confusion_matrix')
print(metrics.confusion_matrix(y_test, ypred_lda))
print('classification_report')
print(metrics.classification_report(y_test, ypred_lda))
accuracy_lda = metrics.accuracy_score(y_test, ypred_lda)
auc_lda = metrics.roc_auc_score(y_test, ypred_lda)
print('Accuracy:%f' % accuracy_lda)
print('Area under the curve:%f' % auc_lda)

# Build a submission frame from the LDA predictions.
y = pd.read_csv("all/gender_submission.csv")
y['Survived'] = ypred_lda
#y.to_csv('all/predict.csv')  # uncomment to write the submission file

# Model 5: quadratic discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# BUG FIX: the keyword was renamed from store_covariances to
# store_covariance in scikit-learn 0.19 and the old spelling was removed
# in 0.21, so the original raises TypeError on any modern scikit-learn.
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda.fit(x_train, y_train)
ypred_qda = qda.predict(x_test)

# Evaluate: confusion matrix, class report, accuracy, AUC.
print('confusion_matrix')
print(metrics.confusion_matrix(y_test, ypred_qda))
print('classification_report')
print(metrics.classification_report(y_test, ypred_qda))
print('Accuracy:%f' % (metrics.accuracy_score(y_test, ypred_qda)))
print('Area under the curve:%f' % (metrics.roc_auc_score(y_test, ypred_qda)))

改进建议:年龄变量可以尝试不同的填充方法(例如按头衔分组的中位数填充);已删除的 Name 等字段也可以先提取特征(如从姓名中解析头衔)再加入模型。

你可能感兴趣的:(建模实例)