代码中使用的数据集:https://github.com/jsusu/Titanic_passenger-survival-prediction/tree/master/titanic_data
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
# Load the Titanic training and test CSVs from the local titanic_data folder.
data_train = pd.read_csv("./titanic_data/titanic_train.csv")
data_test = pd.read_csv("./titanic_data/titanic_test.csv")
# Preview the first 10 training rows.
data_train.head(10)
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
5 |
6 |
0 |
3 |
Moran, Mr. James |
male |
NaN |
0 |
0 |
330877 |
8.4583 |
NaN |
Q |
6 |
7 |
0 |
1 |
McCarthy, Mr. Timothy J |
male |
54.0 |
0 |
0 |
17463 |
51.8625 |
E46 |
S |
7 |
8 |
0 |
3 |
Palsson, Master. Gosta Leonard |
male |
2.0 |
3 |
1 |
349909 |
21.0750 |
NaN |
S |
8 |
9 |
1 |
3 |
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) |
female |
27.0 |
0 |
2 |
347742 |
11.1333 |
NaN |
S |
9 |
10 |
1 |
2 |
Nasser, Mrs. Nicholas (Adele Achem) |
female |
14.0 |
1 |
0 |
237736 |
30.0708 |
NaN |
C |
# Preview the first 10 test rows.
data_test.head(10)
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
892 |
3 |
Kelly, Mr. James |
male |
34.5 |
0 |
0 |
330911 |
7.8292 |
NaN |
Q |
1 |
893 |
3 |
Wilkes, Mrs. James (Ellen Needs) |
female |
47.0 |
1 |
0 |
363272 |
7.0000 |
NaN |
S |
2 |
894 |
2 |
Myles, Mr. Thomas Francis |
male |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
Q |
3 |
895 |
3 |
Wirz, Mr. Albert |
male |
27.0 |
0 |
0 |
315154 |
8.6625 |
NaN |
S |
4 |
896 |
3 |
Hirvonen, Mrs. Alexander (Helga E Lindqvist) |
female |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
S |
5 |
897 |
3 |
Svensson, Mr. Johan Cervin |
male |
14.0 |
0 |
0 |
7538 |
9.2250 |
NaN |
S |
6 |
898 |
3 |
Connolly, Miss. Kate |
female |
30.0 |
0 |
0 |
330972 |
7.6292 |
NaN |
Q |
7 |
899 |
2 |
Caldwell, Mr. Albert Francis |
male |
26.0 |
1 |
1 |
248738 |
29.0000 |
NaN |
S |
8 |
900 |
3 |
Abrahim, Mrs. Joseph (Sophie Halaut Easu) |
female |
18.0 |
0 |
0 |
2657 |
7.2292 |
NaN |
C |
9 |
901 |
3 |
Davies, Mr. John Samuel |
male |
21.0 |
2 |
0 |
A/4 48871 |
24.1500 |
NaN |
S |
# Summarise column dtypes and non-null counts to spot missing values.
data_train.info()
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Descriptive statistics for the numeric training columns (before imputation).
data_train.describe()
|
PassengerId |
Survived |
Pclass |
Age |
SibSp |
Parch |
Fare |
count |
891.000000 |
891.000000 |
891.000000 |
714.000000 |
891.000000 |
891.000000 |
891.000000 |
mean |
446.000000 |
0.383838 |
2.308642 |
29.699118 |
0.523008 |
0.381594 |
32.204208 |
std |
257.353842 |
0.486592 |
0.836071 |
14.526497 |
1.102743 |
0.806057 |
49.693429 |
min |
1.000000 |
0.000000 |
1.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
25% |
223.500000 |
0.000000 |
2.000000 |
20.125000 |
0.000000 |
0.000000 |
7.910400 |
50% |
446.000000 |
0.000000 |
3.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
75% |
668.500000 |
1.000000 |
3.000000 |
38.000000 |
1.000000 |
0.000000 |
31.000000 |
max |
891.000000 |
1.000000 |
3.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
# Replace missing ages with the median of the observed ages, then re-check
# the summary statistics.
age_median = data_train['Age'].median()
data_train["Age"] = data_train['Age'].fillna(age_median)
data_train.describe()
|
PassengerId |
Survived |
Pclass |
Age |
SibSp |
Parch |
Fare |
count |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
mean |
446.000000 |
0.383838 |
2.308642 |
29.361582 |
0.523008 |
0.381594 |
32.204208 |
std |
257.353842 |
0.486592 |
0.836071 |
13.019697 |
1.102743 |
0.806057 |
49.693429 |
min |
1.000000 |
0.000000 |
1.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
25% |
223.500000 |
0.000000 |
2.000000 |
22.000000 |
0.000000 |
0.000000 |
7.910400 |
50% |
446.000000 |
0.000000 |
3.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
75% |
668.500000 |
1.000000 |
3.000000 |
35.000000 |
1.000000 |
0.000000 |
31.000000 |
max |
891.000000 |
1.000000 |
3.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
# Baseline: fit a linear regression on the numeric columns and collect the
# held-out predictions of every fold.
predictors = ["Pclass","Age","SibSp","Parch","Fare"]
alg = LinearRegression()
# random_state is deliberately not passed: it has no effect when
# shuffle=False, and scikit-learn >= 0.24 raises ValueError for that
# combination.
kf = KFold(n_splits=3,shuffle=False)
predictions = []
for train,test in kf.split(data_train):
    # train / test are positional row indices for this fold.
    train_predictors = (data_train[predictors].iloc[train,:])
    train_target = data_train["Survived"].iloc[train]
    alg.fit(train_predictors,train_target)
    # Predict only the rows this fold held out.
    test_predictions = alg.predict(data_train[predictors].iloc[test,:])
    predictions.append(test_predictions)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
import numpy as np
# Join the per-fold prediction arrays into one flat array.
predictions = np.concatenate(predictions,axis=0)
# Threshold the regression outputs at 0.5 to obtain hard 0/1 labels; both
# masks are taken before any value is overwritten.
survived_mask = predictions > .5
perished_mask = predictions <= .5
predictions[survived_mask] = 1
predictions[perished_mask] = 0
# Accuracy = matching labels / total predictions.
hits = (predictions == data_train["Survived"]).sum()
accuracy = hits / len(predictions)
print ("准确率为: ", accuracy)
准确率为: 0.7037037037037037
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# Logistic regression on the current `predictors` columns.
LogRegAlg=LogisticRegression(random_state=1)
train_features = data_train[predictors]
train_labels = data_train["Survived"]
# Fit once on the full training set, then report the mean 3-fold CV score.
re = LogRegAlg.fit(train_features,train_labels)
scores = model_selection.cross_val_score(LogRegAlg,train_features,train_labels,cv=3)
print("准确率为: ",scores.mean())
准确率为: 0.7003367003367004
# Look at the raw rows again before encoding the categorical columns.
data_train.head()
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
# Encode Sex as integers: male -> 0, female -> 1.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    data_train.loc[data_train["Sex"] == sex_label,"Sex"] = sex_code
# Missing embarkation ports are filled with 'S' before encoding
# S -> 0, C -> 1, Q -> 2.
data_train["Embarked"] = data_train["Embarked"].fillna('S')
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    data_train.loc[data_train["Embarked"] == port_label,"Embarked"] = port_code
# Logistic regression on the numeric columns plus the encoded Sex and
# Embarked columns.
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
# max_iter is raised above the default because lbfgs stopped at its iteration
# limit on these unscaled features (ConvergenceWarning); the extra iterations
# give the solver room to converge.
LogRegAlg=LogisticRegression(random_state=1,max_iter=1000)
# This fitted model is the one later used to predict on the test set.
re = LogRegAlg.fit(data_train[predictors],data_train["Survived"])
# Mean score over 3 cross-validation folds.
scores = model_selection.cross_val_score(LogRegAlg,data_train[predictors],data_train["Survived"],cv=3)
print("准确率为: ",scores.mean())
准确率为: 0.7957351290684623
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
# Descriptive statistics for the numeric test columns; the counts reveal
# which columns have missing values.
data_test.describe()
|
PassengerId |
Pclass |
Age |
SibSp |
Parch |
Fare |
count |
418.000000 |
418.000000 |
332.000000 |
418.000000 |
418.000000 |
417.000000 |
mean |
1100.500000 |
2.265550 |
30.272590 |
0.447368 |
0.392344 |
35.627188 |
std |
120.810458 |
0.841838 |
14.181209 |
0.896760 |
0.981429 |
55.907576 |
min |
892.000000 |
1.000000 |
0.170000 |
0.000000 |
0.000000 |
0.000000 |
25% |
996.250000 |
1.000000 |
21.000000 |
0.000000 |
0.000000 |
7.895800 |
50% |
1100.500000 |
3.000000 |
27.000000 |
0.000000 |
0.000000 |
14.454200 |
75% |
1204.750000 |
3.000000 |
39.000000 |
1.000000 |
0.000000 |
31.500000 |
max |
1309.000000 |
3.000000 |
76.000000 |
8.000000 |
9.000000 |
512.329200 |
# Look at the raw test rows before preprocessing them.
data_test.head()
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
892 |
3 |
Kelly, Mr. James |
male |
34.5 |
0 |
0 |
330911 |
7.8292 |
NaN |
Q |
1 |
893 |
3 |
Wilkes, Mrs. James (Ellen Needs) |
female |
47.0 |
1 |
0 |
363272 |
7.0000 |
NaN |
S |
2 |
894 |
2 |
Myles, Mr. Thomas Francis |
male |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
Q |
3 |
895 |
3 |
Wirz, Mr. Albert |
male |
27.0 |
0 |
0 |
315154 |
8.6625 |
NaN |
S |
4 |
896 |
3 |
Hirvonen, Mrs. Alexander (Helga E Lindqvist) |
female |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
S |
# Preprocess the test set the same way as the training set, then predict
# survival with the fitted logistic-regression model.
data_test["Age"] = data_test["Age"].fillna(data_test["Age"].median())
# Fill the missing fare with the median rather than the maximum: the maximum
# fare is an extreme outlier and would give the imputed passenger a wildly
# unrepresentative value.
data_test["Fare"] = data_test["Fare"].fillna(data_test["Fare"].median())
# Same integer encoding as used for the training data.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    data_test.loc[data_test["Sex"] == sex_label,"Sex"] = sex_code
data_test["Embarked"] = data_test["Embarked"].fillna('S')
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    data_test.loc[data_test["Embarked"] == port_label,"Embarked"] = port_code
# Feature columns in the same order the model was trained on.
test_features = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
test_predictors = data_test[test_features]
# Store the predicted labels as a new Survived column.
data_test["Survived"] = LogRegAlg.predict(test_predictors)
data_test.head(10)
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
Survived |
0 |
892 |
3 |
Kelly, Mr. James |
0 |
34.5 |
0 |
0 |
330911 |
7.8292 |
NaN |
2 |
0 |
1 |
893 |
3 |
Wilkes, Mrs. James (Ellen Needs) |
1 |
47.0 |
1 |
0 |
363272 |
7.0000 |
NaN |
0 |
0 |
2 |
894 |
2 |
Myles, Mr. Thomas Francis |
0 |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
2 |
0 |
3 |
895 |
3 |
Wirz, Mr. Albert |
0 |
27.0 |
0 |
0 |
315154 |
8.6625 |
NaN |
0 |
0 |
4 |
896 |
3 |
Hirvonen, Mrs. Alexander (Helga E Lindqvist) |
1 |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
0 |
1 |
5 |
897 |
3 |
Svensson, Mr. Johan Cervin |
0 |
14.0 |
0 |
0 |
7538 |
9.2250 |
NaN |
0 |
0 |
6 |
898 |
3 |
Connolly, Miss. Kate |
1 |
30.0 |
0 |
0 |
330972 |
7.6292 |
NaN |
2 |
1 |
7 |
899 |
2 |
Caldwell, Mr. Albert Francis |
0 |
26.0 |
1 |
1 |
248738 |
29.0000 |
NaN |
0 |
0 |
8 |
900 |
3 |
Abrahim, Mrs. Joseph (Sophie Halaut Easu) |
1 |
18.0 |
0 |
0 |
2657 |
7.2292 |
NaN |
1 |
1 |
9 |
901 |
3 |
Davies, Mr. John Samuel |
0 |
21.0 |
2 |
0 |
A/4 48871 |
24.1500 |
NaN |
0 |
0 |
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
# Random forest with 10 trees, scored with 3-fold cross-validation.
predictors=["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
alg=RandomForestClassifier(random_state=1,n_estimators=10,min_samples_split=2,min_samples_leaf=1)
# random_state is not passed to KFold: it has no effect when shuffle=False,
# and scikit-learn >= 0.24 raises ValueError for that combination.
kf=model_selection.KFold(n_splits=3,shuffle=False)
scores=model_selection.cross_val_score(alg,data_train[predictors],data_train["Survived"],cv=kf)
# Per-fold scores, then their mean.
print(scores)
print(scores.mean())
[0.75420875 0.8013468 0.8013468 ]
0.7856341189674523
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
# Larger forest (30 trees) scored with 10-fold cross-validation.
alg=RandomForestClassifier(random_state=1,n_estimators=30,min_samples_split=2,min_samples_leaf=1)
# random_state is not passed to KFold: it has no effect when shuffle=False,
# and scikit-learn >= 0.24 raises ValueError for that combination.
kf=model_selection.KFold(n_splits=10,shuffle=False)
scores=model_selection.cross_val_score(alg,data_train[predictors],data_train["Survived"],cv=kf)
# Per-fold scores, then their mean.
print(scores)
print(scores.mean())
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
[0.74444444 0.80898876 0.78651685 0.82022472 0.85393258 0.85393258
0.7752809 0.7752809 0.84269663 0.85393258]
0.8115230961298376