Titanic — Passenger Survival Prediction 1

Dataset used in the code: https://github.com/jsusu/Titanic_passenger-survival-prediction/tree/master/titanic_data

# Titanic Passenger Survival Prediction 1

# pandas: data analysis library
import pandas as pd
# numpy: scientific computing library
import numpy as np
from pandas import Series, DataFrame

# 1. Load the data samples
data_train = pd.read_csv("./titanic_data/titanic_train.csv")
data_test = pd.read_csv("./titanic_data/titanic_test.csv")
# 2. Data processing
data_train.head(10)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
data_test.head(10)
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
5 897 3 Svensson, Mr. Johan Cervin male 14.0 0 0 7538 9.2250 NaN S
6 898 3 Connolly, Miss. Kate female 30.0 0 0 330972 7.6292 NaN Q
7 899 2 Caldwell, Mr. Albert Francis male 26.0 1 1 248738 29.0000 NaN S
8 900 3 Abrahim, Mrs. Joseph (Sophie Halaut Easu) female 18.0 0 0 2657 7.2292 NaN C
9 901 3 Davies, Mr. John Samuel male 21.0 2 0 A/4 48871 24.1500 NaN S
data_train.info()
# From the output above we can see that the Age, Cabin and Embarked columns have missing data:
# Age has 714 entries (177 missing); Cabin has 204 entries (687 missing);
# Embarked has 889 entries (2 missing); all other columns have the full 891 entries.

RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
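# (Optional, a minimal sketch) isnull().sum() from standard pandas gives the same
# missing-value counts per column directly:
data_train.isnull().sum()
# Expected: Age 177, Cabin 687, Embarked 2, everything else 0.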
data_train.describe()
# The mean row tells us that about 0.383838 of the passengers survived, that 2nd/3rd class
# passengers outnumber 1st class, that the average passenger age is about 29.7 (rows with
# no Age record are skipped when computing this), and so on.
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
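# (Optional, a minimal sketch) The 0.383838 mean of Survived is the overall survival rate;
# a quick groupby (standard pandas) breaks it down by sex and class before any encoding:
print(data_train.groupby("Sex")["Survived"].mean())
print(data_train.groupby("Pclass")["Survived"].mean())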
# 3. Feature selection

# 3.1 Handling missing values
# The Cabin column has a large number of missing values; filling them all in would introduce
# considerable noise, so Cabin is not used as a feature for now.
# The Age column matters a lot for predicting survival, so its missing values are filled
# with the Age median (robust to outliers).
# PassengerId is just a running index, unrelated to survival, so it is not used as a feature.

# Fill missing values in Age with the Age median
data_train["Age"] = data_train["Age"].fillna(data_train["Age"].median())
data_train.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.361582 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 13.019697 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
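# (Optional, a minimal sketch) The same fill step is repeated for the test set further down;
# a small hypothetical helper would keep the two code paths identical. As in the notebook,
# the fill value comes from whichever frame is passed in:
def fill_age_median(df):
    # Return a copy with missing Age values replaced by the frame's own Age median
    df = df.copy()
    df["Age"] = df["Age"].fillna(df["Age"].median())
    return df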
# 4. Linear regression

# Linear regression model
from sklearn.linear_model import LinearRegression
# K-fold cross-validation on the training set; the fold scores are averaged
#from sklearn.cross_validation import KFold   # old import path, removed in newer sklearn
from sklearn.model_selection import KFold

# Pick a simple set of numeric input features
predictors = ["Pclass","Age","SibSp","Parch","Fare"]

# Initialize the linear regression algorithm
alg = LinearRegression()
# Split the samples into 3 folds for 3-fold cross-validation
#kf = KFold(data_train.shape[0],n_folds=3,random_state=1)   # old API
# random_state only takes effect when shuffle=True, so it is omitted here
kf = KFold(n_splits=3, shuffle=False)

predictions = []
for train,test in kf.split(data_train):
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = (data_train[predictors].iloc[train,:])
    #The target we're using to train the algorithm.
    train_target = data_train["Survived"].iloc[train]
    #Training the algorithm using the predictors and target.
    alg.fit(train_predictors,train_target)
    #We can now make predictions on the test fold
    test_predictions = alg.predict(data_train[predictors].iloc[test,:])
    predictions.append(test_predictions)

import numpy as np

# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = sum(predictions == data_train["Survived"]) / len(predictions)
print("Accuracy: ", accuracy)
Accuracy:  0.7037037037037037
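# (Optional, a minimal sketch) The manual fold loop above can be replaced by
# cross_val_predict from sklearn.model_selection, which returns the out-of-fold
# prediction for every training row in one call; np.where makes the 0.5 cut-off explicit:
from sklearn.model_selection import cross_val_predict

oof = cross_val_predict(LinearRegression(), data_train[predictors], data_train["Survived"], cv=kf)
labels = np.where(oof > .5, 1, 0)   # same 0.5 threshold as above
print("Accuracy: ", (labels == data_train["Survived"]).mean())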
# 5. Logistic regression
from sklearn import model_selection
# Logistic regression model
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression algorithm
LogRegAlg = LogisticRegression(random_state=1)
re = LogRegAlg.fit(data_train[predictors], data_train["Survived"])

# Use sklearn's cross-validation helper to get the accuracy scores
scores = model_selection.cross_val_score(LogRegAlg, data_train[predictors], data_train["Survived"], cv=3)

# Use the mean of the cross-validation scores as the final accuracy
print("Accuracy: ", scores.mean())
Accuracy:  0.7003367003367004
# 5.1 Add the Sex and Embarked columns as features and check the effect on the predictions
# Convert the Sex and Embarked string columns to numeric codes
data_train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Encode the Sex column: male as 0, female as 1
data_train.loc[data_train["Sex"] == "male","Sex"] = 0
data_train.loc[data_train["Sex"] == "female","Sex"] = 1
# Fill missing Embarked values with the most frequent port, S
data_train["Embarked"] = data_train["Embarked"].fillna('S')
# Encode the ports S, C, Q as 0, 1, 2
data_train.loc[data_train["Embarked"] == "S","Embarked"] = 0
data_train.loc[data_train["Embarked"] == "C","Embarked"] = 1
data_train.loc[data_train["Embarked"] == "Q","Embarked"] = 2
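# (Optional, a minimal sketch) The repeated .loc assignments can be written more compactly
# with Series.map from standard pandas; shown here on a throwaway copy so it does not
# re-encode the columns that were just converted above:
_demo = pd.read_csv("./titanic_data/titanic_train.csv")
_demo["Sex"] = _demo["Sex"].map({"male": 0, "female": 1})
_demo["Embarked"] = _demo["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})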
# Add the two new features Sex and Embarked and run logistic regression again
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]

# max_iter is raised so the lbfgs solver can converge on these unscaled features
LogRegAlg = LogisticRegression(random_state=1, max_iter=1000)
# Compute the accuracy score for all the cross-validation folds (much simpler than what we did before!)
re = LogRegAlg.fit(data_train[predictors], data_train["Survived"])
scores = model_selection.cross_val_score(LogRegAlg, data_train[predictors], data_train["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print("Accuracy: ", scores.mean())
Accuracy:  0.7957351290684623


# (The original run, before max_iter was raised, emitted repeated lbfgs ConvergenceWarning
# messages suggesting to increase max_iter or scale the data; see the scikit-learn
# preprocessing docs: https://scikit-learn.org/stable/modules/preprocessing.html)
# Adding these two features raised the model's accuracy to about 79.6%, which shows that good features improve the model's predictive power.
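# (Optional, a minimal sketch) A cleaner fix for the convergence warnings than raising
# max_iter is to standardize the features; a Pipeline (standard sklearn API) keeps the
# scaling inside each cross-validation fold so no information leaks between folds:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=1))
scores = model_selection.cross_val_score(pipe, data_train[predictors], data_train["Survived"], cv=3)
print("Accuracy: ", scores.mean())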
data_test.describe()
PassengerId Pclass Age SibSp Parch Fare
count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000
mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188
std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576
min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800
50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200
75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000
max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200
data_test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
# New: preprocess the test set the same way and run predictions on it
# Fill missing values in Age with the Age median
data_test["Age"] = data_test["Age"].fillna(data_test["Age"].median())
# Fill the missing Fare value with the Fare maximum (as in the original notebook;
# the median would be a more typical choice)
data_test["Fare"] = data_test["Fare"].fillna(data_test["Fare"].max())

# Encode the Sex column: male as 0, female as 1
data_test.loc[data_test["Sex"] == "male","Sex"] = 0
data_test.loc[data_test["Sex"] == "female","Sex"] = 1
# Fill missing Embarked values with the most frequent port, S
data_test["Embarked"] = data_test["Embarked"].fillna('S')
# Encode the ports S, C, Q as 0, 1, 2
data_test.loc[data_test["Embarked"] == "S","Embarked"] = 0
data_test.loc[data_test["Embarked"] == "C","Embarked"] = 1
data_test.loc[data_test["Embarked"] == "Q","Embarked"] = 2

test_features = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
# Create a Survived column on the test set (placeholder, overwritten by the predictions below)
data_test["Survived"] = -1

test_predictors = data_test[test_features]
data_test["Survived"] = LogRegAlg.predict(test_predictors)
data_test.head(10)
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Survived
0 892 3 Kelly, Mr. James 0 34.5 0 0 330911 7.8292 NaN 2 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 47.0 1 0 363272 7.0000 NaN 0 0
2 894 2 Myles, Mr. Thomas Francis 0 62.0 0 0 240276 9.6875 NaN 2 0
3 895 3 Wirz, Mr. Albert 0 27.0 0 0 315154 8.6625 NaN 0 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 22.0 1 1 3101298 12.2875 NaN 0 1
5 897 3 Svensson, Mr. Johan Cervin 0 14.0 0 0 7538 9.2250 NaN 0 0
6 898 3 Connolly, Miss. Kate 1 30.0 0 0 330972 7.6292 NaN 2 1
7 899 2 Caldwell, Mr. Albert Francis 0 26.0 1 1 248738 29.0000 NaN 0 0
8 900 3 Abrahim, Mrs. Joseph (Sophie Halaut Easu) 1 18.0 0 0 2657 7.2292 NaN 1 1
9 901 3 Davies, Mr. John Samuel 0 21.0 2 0 A/4 48871 24.1500 NaN 0 0
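# (Optional, a minimal sketch) For a Kaggle-style submission only PassengerId and the
# predicted Survived column are needed; the output file name here is an assumption:
submission = data_test[["PassengerId", "Survived"]]
submission.to_csv("titanic_submission.csv", index=False)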
# 6. Random forest
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]

# 10 trees; stopping conditions: at least 2 samples to split a node, at least 1 sample per leaf
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)

# Compute the accuracy score for all the cross-validation folds (much simpler than what we did before!)
#kf = cross_validation.KFold(data_train.shape[0],n_folds=3,random_state=1)   # old API
# random_state only takes effect when shuffle=True, so it is omitted here
kf = model_selection.KFold(n_splits=3, shuffle=False)

scores = model_selection.cross_val_score(alg, data_train[predictors], data_train["Survived"], cv=kf)
print(scores)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
[0.75420875 0.8013468  0.8013468 ]
0.7856341189674523


# Increase the number of trees to 30 and switch to 10-fold cross-validation
# 30 trees; stopping conditions: at least 2 samples to split a node, at least 1 sample per leaf
alg = RandomForestClassifier(random_state=1, n_estimators=30, min_samples_split=2, min_samples_leaf=1)

# Compute the accuracy score for all the cross-validation folds (much simpler than what we did before!)
#kf = cross_validation.KFold(data_train.shape[0],n_folds=10,random_state=1)   # old API
kf = model_selection.KFold(n_splits=10, shuffle=False)

scores = model_selection.cross_val_score(alg, data_train[predictors], data_train["Survived"], cv=kf)

print(scores)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


[0.74444444 0.80898876 0.78651685 0.82022472 0.85393258 0.85393258
 0.7752809  0.7752809  0.84269663 0.85393258]
0.8115230961298376
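# (Optional, a minimal sketch) After fitting the forest on the full training set, the standard
# feature_importances_ attribute shows which columns drive the predictions:
alg.fit(data_train[predictors], data_train["Survived"])
for name, imp in sorted(zip(predictors, alg.feature_importances_), key=lambda t: -t[1]):
    print(f"{name}: {imp:.3f}")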
