# The whole process is documented in the code below:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Titanic train/test sets.
# NOTE(review): the r'...\' paths are placeholders — point them at the real
# CSV locations before running.
train = pd.read_csv(r'...\train.csv')
test = pd.read_csv(r'...\test.csv')
# Quick overview of the training set: dtypes and non-null counts per column.
train.info()
#---------------------------- Know about the data ----------------------------------------------#
sns.set_style('whitegrid')
# Heatmap of missing values: light cells mark NaNs (Age and Cabin stand out).
# FIX: each plot now gets its own figure — in a plain script, successive
# seaborn calls otherwise draw onto the same axes and overlap.
plt.figure()
sns.heatmap(train.isnull(), yticklabels = False, cbar = False, cmap = "Blues")
# Survival counts split by gender.
plt.figure()
sns.countplot(x = 'Survived', hue = 'Sex', data = train, palette = 'RdBu_r')
# Survival counts split by passenger class.
plt.figure()
sns.countplot(x = 'Survived', hue = 'Pclass', data = train, palette = 'rainbow')
# Age distribution.
# FIX: sns.distplot is deprecated (removed in recent seaborn releases);
# histplot(..., kde=True) is the supported equivalent.
plt.figure()
sns.histplot(train['Age'].dropna(), kde = True, color = 'blue', bins = 30)
# Age spread within each passenger class.
plt.figure(figsize = (12, 7))
sns.boxplot(x = "Pclass", y = "Age", data = train, palette = 'winter')
# Passengers travelling with siblings/spouses vs. travelling alone.
plt.figure()
sns.countplot(x = 'SibSp', data = train)
# Fare as a countplot (one bar per distinct fare — hard to read)...
plt.figure()
sns.countplot(x = 'Fare', data = train)
# ...and a clearer histogram of the same column using pandas.
train['Fare'].hist(bins=30,figsize=(10,4))
# Distribution of parents/children aboard (Parch).
plt.figure()
sns.countplot(x = 'Parch', data = train)
#------------------------- Handle missing values -----------------------------------------------#
# Each passenger class has a different age distribution, so the per-class
# mean age (printed here) is a better imputation target than a global mean.
print(train.groupby(['Pclass'])['Age'].mean())
# Imputing the Age Column
def AgeImputation(column):
    """Return an imputed age for one (Age, Pclass) row.

    A missing Age is replaced with the rounded mean age of the passenger's
    class (38 / 29 / 25 for classes 1 / 2 / 3, taken from the training-set
    groupby above); a non-missing Age is returned unchanged.

    FIX: the pasted original had its body indentation stripped (SyntaxError)
    and used positional Series access (column[0]), which is deprecated on
    labelled Series; label-based access is used instead.
    """
    age = column['Age']
    pclass = column['Pclass']
    if pd.isnull(age):
        # Per-class mean ages computed from the training data.
        return {1: 38, 2: 29}.get(pclass, 25)
    return age
# Replace each missing Age with the mean age of that passenger's class.
train['Age'] = train[['Age','Pclass']].apply(AgeImputation,axis=1)
test['Age'] = test[['Age','Pclass']].apply(AgeImputation,axis=1)
# The heatmaps should now show no gaps in the Age column.
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap="Greens")
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap="Greens")
# Drop the Cabin column: it is mostly missing, so imputing it would not give
# an accurate representation of the data.
train.drop('Cabin',axis=1,inplace=True)
test.drop('Cabin',axis=1,inplace=True)
# Count remaining missing values per column.
# BUG FIX: the labels were swapped — "train data" printed the TEST counts
# and vice versa. Also uses the idiomatic .isnull().sum() (same result as
# np.sum(..., axis=0)).
print("train data:", train.isnull().sum())
print("test data:", test.isnull().sum())
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap="Greens")
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap="Greens")
# Lastly, deal with the Embarked column: drop the few training rows that
# still contain a null in any column.
train.dropna(inplace=True)
# For the test set, FILL (instead of drop) the missing Fare with the mean
# fare so the submission keeps exactly 418 rows, as required.
# FIX: Series.fillna(..., inplace=True) on a column selection is the
# deprecated chained-assignment pattern; assign the result back instead.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
# All missing values in the training set have been taken care of.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap="Blues")
# All missing values in the test set have been taken care of.
sns.heatmap(test.isnull(),yticklabels=False,cbar=False,cmap="Blues")
# Convert categorical variables into dummy/indicator variables.
# drop_first=True drops one level per variable to remove multicollinearity
# (one column being perfectly predictable from the others).
# FIX: removed the dead `train.columns` expression statement — in a script
# it evaluates to nothing visible and has no effect.
train.info()
test.info()
# Encode the object-typed columns of the training set.
sex = pd.get_dummies(train['Sex'],drop_first=True)
embark = pd.get_dummies(train['Embarked'],drop_first=True)
# Drop Name and Ticket (no role in this model) plus the raw Sex/Embarked
# columns, then append their dummy-encoded replacements.
train.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)
train = pd.concat([train,sex,embark],axis=1)
# PassengerId carries no information about survival.
train.drop(['PassengerId'],axis=1,inplace=True)
# Repeat the same encoding for the test set.
sex = pd.get_dummies(test['Sex'],drop_first=True)
embark = pd.get_dummies(test['Embarked'],drop_first=True)
test.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)
test = pd.concat([test,sex,embark],axis=1)
P_ID = test['PassengerId'] # Saved for the submission file later
test.drop(['PassengerId'],axis=1,inplace=True)
#------------------------------------- Preparing the dataset for ML ------------------------------------#
from sklearn.model_selection import train_test_split
# Hold out 30% of the training data to estimate out-of-sample accuracy.
X = train.drop('Survived', axis = 1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=101)
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
survived_test = logmodel.predict(X_test)
survived_train = logmodel.predict(X_train)
print("the accuracy of test data:", np.mean(survived_test == y_test))
# BUG FIX: this line previously also said "test data" although it reports
# the TRAIN accuracy.
print("the accuracy of train data:", np.mean(survived_train == y_train))
# Predict on the real test set and assemble the submission.
survived = logmodel.predict(test)
test['Survived'] = survived
test['PassengerId'] = P_ID
# Optional sanity check against the sample submission (left-join on id);
# `vali` is kept for interactive inspection only.
gender_submission = pd.read_csv(r'gender_submission.csv')
vali = pd.merge(gender_submission, test[['PassengerId', 'Survived']], how = 'left', on = 'PassengerId')
test[['PassengerId', 'Survived']].to_csv('First_Logistic_Regression.csv', index=False)