Kaggle比赛赛题《Titanic---Logistic Regression》

过程都在代码里:

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Kaggle Titanic training and test sets.
# NOTE: '...' is a placeholder for the local data directory.
train = pd.read_csv(r'...\train.csv')
test = pd.read_csv(r'...\test.csv')
train.info()
#----------------------------know about the data----------------------------------------------#
sns.set_style('whitegrid')
# Every plot below is drawn on its own figure. Without an explicit
# plt.figure(), successive seaborn calls reuse the current axes and the
# plots end up stacked on top of each other when this runs as a script.

# The following heatmap reveals the missing values.
# isnull() is True (1) for missing cells, which the "Blues" colormap draws
# as dark lines; present values stay light.
plt.figure()
sns.heatmap(train.isnull(), yticklabels = False, cbar = False, cmap = "Blues")
# Checking how many survived vs. how many did not with respect to gender.
plt.figure()
sns.countplot(x = 'Survived', hue = 'Sex', data = train, palette = 'RdBu_r')
# Checking how many survived vs. how many did not with respect to class.
plt.figure()
sns.countplot(x = 'Survived', hue = 'Pclass', data = train, palette = 'rainbow')
# Checking the distribution of age.
# histplot replaces the deprecated distplot; stat='density' keeps the
# density-normalised histogram that distplot drew together with the KDE.
plt.figure()
sns.histplot(train['Age'].dropna(), kde = True, color = 'blue', bins = 30, stat = 'density')
# Checking the age groups of the people within each class.
# Grouped into classes
plt.figure(figsize = (12, 7))
sns.boxplot(x = "Pclass", y = "Age", data = train, palette = 'winter')
# Plotting people who came in groups or alone
plt.figure()
sns.countplot(x = 'SibSp', data = train)
# Plotting the Fare column (one bar per distinct fare value)
plt.figure()
sns.countplot(x = 'Fare', data = train)
# A better representation for the above distribution using pandas:
# a 30-bin histogram instead of one bar per distinct value.
plt.figure(figsize = (10, 4))
train['Fare'].hist(bins = 30)
# And lastly, distribution for Parch
plt.figure()
sns.countplot(x = 'Parch', data = train)

#-------------------------deal with na value-----------------------------------------------#
# The age distribution differs between passenger classes, so show the
# mean age of each class before imputing.
print(train['Age'].groupby(train['Pclass']).mean())
# Imputing the Age Column
def AgeImputation(column):
    """Return the age for one passenger, filling a missing age by class.

    Parameters
    ----------
    column : pandas.Series or sequence
        Two values in positional order: (Age, Pclass). This is the row
        produced by ``df[['Age', 'Pclass']].apply(AgeImputation, axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise a fixed per-class
    fallback age: 38 for class 1, 29 for class 2, 25 for any other class.
    """
    # A row Series is indexed by column labels ('Age', 'Pclass'), so plain
    # column[0] depends on pandas' deprecated positional fallback. Use .iloc
    # for explicit positional access, and keep supporting plain
    # lists/tuples, which have no .iloc.
    values = column.iloc if hasattr(column, 'iloc') else column
    Age = values[0]
    Pclass = values[1]

    if not pd.isnull(Age):
        return Age
    if Pclass == 1:
        return 38
    if Pclass == 2:
        return 29
    return 25
# Replace the missing ages with the per-class fallback age from AgeImputation.
train['Age'] = train[['Age','Pclass']].apply(AgeImputation,axis=1)
test['Age'] = test[['Age','Pclass']].apply(AgeImputation,axis=1)

# Each heatmap gets its own figure so they do not overwrite one another.
plt.figure()
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap="Greens")
plt.figure()
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap="Greens")
# Dropping the Cabin column because it has too many missing values.
# Imputing won't give an accurate representation of the data.
train.drop('Cabin',axis=1,inplace=True)
test.drop('Cabin',axis=1,inplace=True)
# Count the remaining missing values per column.
# (The labels previously sat on the wrong frames: "train data" printed the
# test counts and vice versa.)
print("train data:", train.isnull().sum(axis = 0))
print("test data:", test.isnull().sum(axis = 0))
plt.figure()
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap="Greens")
plt.figure()
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap="Greens")
# Lastly, dealing with the Embarked column.
# Drop every training row that still contains a null value in any column.
train.dropna(inplace=True)

# Fill (instead of drop) the missing Fare values in the test set with the
# mean fare, so that no test row is lost (the submission needs all of them).
mean = test['Fare'].mean()
# Assign the result back: an in-place fillna on the selected column is a
# chained update and does not reliably modify `test` itself.
test['Fare'] = test['Fare'].fillna(mean)
# All missing values in the training set have been taken care of.
plt.figure()
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap="Blues")
# All missing values in the test set have been taken care of.
plt.figure()
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap="Blues")

# Turn the categorical columns into dummy/indicator columns.
# drop_first=True omits the first level of each category so that the
# indicator columns are not perfectly collinear with one another.
train.columns
train.info()
test.info()

# --- training set ---
# Indicator columns for the two object-typed features.
sex = pd.get_dummies(train['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)
# Remove the columns that are not fed to the model and append the indicators:
#   Sex / Embarked -> replaced by their indicator columns
#   Name / Ticket  -> not used for training or prediction
#   PassengerId    -> carries no information about survival
train = pd.concat(
    [train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket', 'PassengerId']), sex, embark],
    axis=1,
)

# --- test set: same steps ---
P_ID = test['PassengerId']  # kept aside; needed again for the submission file
sex = pd.get_dummies(test['Sex'], drop_first=True)
embark = pd.get_dummies(test['Embarked'], drop_first=True)
test = pd.concat(
    [test.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket', 'PassengerId']), sex, embark],
    axis=1,
)

#-------------------------------------preparing the dataset for ML------------------------------------#
from sklearn.model_selection import train_test_split

# Features are every remaining column except the target; the target is Survived.
X = train.drop('Survived', axis = 1)
y = train['Survived']
# Hold out 30% of the labelled data for evaluation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=101)

from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split.
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
survived_test = logmodel.predict(X_test)
survived_train = logmodel.predict(X_train)
# Accuracy = share of predictions equal to the true label.
# The first line reports the held-out split, the second the training split
# (the second was previously mislabelled as "test data").
print("the accuracy of test data:", np.mean(survived_test == y_test))
print("the accuracy of train data:", np.mean(survived_train == y_train))

# Predict on the Kaggle test set. This must happen before the Survived and
# PassengerId columns are added, so the feature columns match the fit.
survived = logmodel.predict(test)
test['Survived'] = survived
test['PassengerId'] = P_ID
# Join the predictions onto the rows of gender_submission.csv by PassengerId.
# `vali` is not used again in the code shown here.
gender_submission  = pd.read_csv(r'gender_submission.csv')
vali = pd.merge(gender_submission, test[['PassengerId', 'Survived']], how = 'left', on = 'PassengerId')

# Write the submission file: one row per passenger, PassengerId and Survived.
test[['PassengerId', 'Survived']].to_csv('First_Logistic_Regression.csv', index=False)









你可能感兴趣的:(Kaggle比赛赛题《Titanic---Logistic Regression》)