import urllib.request
import os
# Remote location of the Titanic workbook and the local path it is cached at.
url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'
filepath = './data/titanic3.xls'
# Download only when no local copy exists yet.
if not os.path.isfile(filepath):
    # urlretrieve does not create missing directories, so make sure the
    # target folder exists before writing into it.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)
downloaded: ('./data/titanic3.xls', )
import numpy as np
import pandas as pd
# Load the Titanic workbook into a DataFrame.
all_df = pd.read_excel('./data/titanic3.xls')
# Columns kept for modelling: 'survived' first, then the raw passenger fields.
cols = [
'survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked'
]
all_df = all_df[cols]
# Notebook display: first two rows.
all_df[:2]
|
survived |
name |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked |
0 |
1 |
Allen, Miss. Elisabeth Walton |
1 |
female |
29.0000 |
0 |
0 |
211.3375 |
S |
1 |
1 |
Allison, Master. Hudson Trevor |
1 |
male |
0.9167 |
1 |
2 |
151.5500 |
S |
# Work on a copy without the free-text 'name' column; all_df itself is unchanged.
df = all_df.drop(['name'], axis=1)
# Notebook display: number of missing values per column of the full frame.
all_df.isnull().sum()
survived 0
name 0
pclass 0
sex 0
age 263
sibsp 0
parch 0
fare 1
embarked 2
dtype: int64
# Replace missing ages and fares with the mean of their own column.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
# Encode 'sex' as an integer: female -> 0, male -> 1.
df['sex'] = df['sex'].map({'female':0, 'male': 1}).astype(int)
# One-hot encode 'embarked' into separate embarked_* indicator columns.
x_Onehot_df = pd.get_dummies(data=df, columns=['embarked'])
# Notebook display: first two encoded rows.
x_Onehot_df[:2]
|
survived |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked_C |
embarked_Q |
embarked_S |
0 |
1 |
1 |
0 |
29.0000 |
0 |
0 |
211.3375 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
0.9167 |
1 |
2 |
151.5500 |
0 |
0 |
1 |
# Convert the encoded frame to a plain NumPy array.
ndarray = x_Onehot_df.values
# Notebook display: array dimensions.
ndarray.shape
(1309, 10)
ndarray[:2]
array([[ 1. , 1. , 0. , 29. , 0. , 0. ,
211.3375, 0. , 0. , 1. ],
[ 1. , 1. , 1. , 0.9167, 1. , 2. ,
151.55 , 0. , 0. , 1. ]])
# Column 0 is 'survived' (the label); every remaining column is a feature.
Label = ndarray[:,0]
Features = ndarray[:, 1:]
# Notebook display: first two labels.
Label[:2]
array([1., 1.])
Features[:2]
array([[ 1. , 0. , 29. , 0. , 0. , 211.3375,
0. , 0. , 1. ],
[ 1. , 1. , 0.9167, 1. , 2. , 151.55 ,
0. , 0. , 1. ]])
from sklearn import preprocessing
# Rescale every feature column to the [0, 1] range.
minmax_Scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaledFeatures = minmax_Scale.fit_transform(Features)
# Notebook display: first two scaled rows.
scaledFeatures[:2]
array([[0. , 0. , 0.36116884, 0. , 0. ,
0.41250333, 0. , 0. , 1. ],
[0. , 1. , 0.00939458, 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ]])
# Random boolean mask: each row is True (training) when its uniform draw is below 0.8.
# NOTE(review): no seed is set in this cell, so the split presumably differs between runs.
msk = np.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
# The complement of the mask forms the test set.
test_df = all_df[~msk]
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))
total: 1309 train: 1071 test: 238
def PreprocessData(raw_df):
    """Turn a raw Titanic frame into scaled features and a label vector.

    Args:
        raw_df: DataFrame containing at least the columns 'name', 'sex',
            'age', 'fare' and 'embarked'. Once 'name' is dropped, the first
            remaining column is taken as the label ('survived' in this
            notebook). The frame itself is not modified.

    Returns:
        A ``(scaledFeatures, Label)`` tuple: the feature matrix with every
        column rescaled to [0, 1], and the values of the label column.

    Note:
        The scaler is fit on whatever frame is passed in, so each call
        scales relative to its own data.
    """
    df = raw_df.drop(['name'], axis=1)

    # Impute missing values with the mean of the matching column.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    # Bug fix: missing fares used to be filled with age_mean.
    df['fare'] = df['fare'].fillna(fare_mean)

    # Encode 'sex' as an integer: female -> 0, male -> 1.
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    # One-hot encode 'embarked' into embarked_* indicator columns.
    x_Onehot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Column 0 is the label, the rest are features.
    ndarray = x_Onehot_df.values
    Features = ndarray[:, 1:]
    Label = ndarray[:, 0]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    return scaledFeatures, Label
# Preprocess the two splits independently; PreprocessData fits its own scaler on each call.
train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)
# Notebook display: first two scaled training rows.
train_Features[:2]
array([[0. , 0. , 0.0229641 , 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ],
[0. , 1. , 0.37369494, 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ]])
test_Label[:2]
array([1., 1.])
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
# Fully connected binary classifier: 9 inputs -> 40 (relu) -> 30 (relu) -> 1 (sigmoid).
model = Sequential([
    Dense(units=40,
          input_dim=9,
          kernel_initializer='uniform',
          activation='relu'),
    Dense(units=30, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])
# Binary cross-entropy loss with Adam; accuracy is tracked as the metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
W0819 11:23:43.940761 6100 deprecation_wrapper.py:119] From E:\Anaconda3\envs\ml\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.
W0819 11:23:43.970712 6100 deprecation_wrapper.py:119] From E:\Anaconda3\envs\ml\lib\site-packages\keras\backend\tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.
W0819 11:23:43.976665 6100 deprecation.py:323] From E:\Anaconda3\envs\ml\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
# Train for 30 epochs in batches of 30 rows; validation_split=0.1 sets aside
# a tenth of the training data for validation. verbose=2 gives the
# one-line-per-epoch log.
train_history = model.fit(x=train_Features,
y=train_Label,
validation_split=0.1,
batch_size=30,
epochs=30,
verbose=2)
Train on 963 samples, validate on 108 samples
Epoch 1/30
- 0s - loss: 0.6645 - acc: 0.6023 - val_loss: 0.5840 - val_acc: 0.7685
Epoch 2/30
- 0s - loss: 0.6062 - acc: 0.6594 - val_loss: 0.4936 - val_acc: 0.7870
Epoch 3/30
- 0s - loss: 0.5513 - acc: 0.7487 - val_loss: 0.4564 - val_acc: 0.7870
Epoch 4/30
- 0s - loss: 0.5151 - acc: 0.7747 - val_loss: 0.4487 - val_acc: 0.8056
Epoch 5/30
- 0s - loss: 0.4968 - acc: 0.7757 - val_loss: 0.4538 - val_acc: 0.8056
Epoch 6/30
- 0s - loss: 0.4882 - acc: 0.7736 - val_loss: 0.4354 - val_acc: 0.8056
Epoch 7/30
- 0s - loss: 0.4839 - acc: 0.7695 - val_loss: 0.4277 - val_acc: 0.8148
Epoch 8/30
- 0s - loss: 0.4818 - acc: 0.7788 - val_loss: 0.4254 - val_acc: 0.8148
Epoch 9/30
- 0s - loss: 0.4796 - acc: 0.7840 - val_loss: 0.4231 - val_acc: 0.8333
Epoch 10/30
- 0s - loss: 0.4766 - acc: 0.7819 - val_loss: 0.4247 - val_acc: 0.8148
Epoch 11/30
- 0s - loss: 0.4733 - acc: 0.7830 - val_loss: 0.4240 - val_acc: 0.8148
Epoch 12/30
- 0s - loss: 0.4714 - acc: 0.7840 - val_loss: 0.4174 - val_acc: 0.8333
Epoch 13/30
- 0s - loss: 0.4684 - acc: 0.7871 - val_loss: 0.4181 - val_acc: 0.8426
Epoch 14/30
- 0s - loss: 0.4666 - acc: 0.7871 - val_loss: 0.4169 - val_acc: 0.8426
Epoch 15/30
- 0s - loss: 0.4643 - acc: 0.7892 - val_loss: 0.4151 - val_acc: 0.8519
Epoch 16/30
- 0s - loss: 0.4632 - acc: 0.7892 - val_loss: 0.4134 - val_acc: 0.8426
Epoch 17/30
- 0s - loss: 0.4618 - acc: 0.7902 - val_loss: 0.4133 - val_acc: 0.8426
Epoch 18/30
- 0s - loss: 0.4618 - acc: 0.7913 - val_loss: 0.4145 - val_acc: 0.8056
Epoch 19/30
- 0s - loss: 0.4606 - acc: 0.7944 - val_loss: 0.4160 - val_acc: 0.8426
Epoch 20/30
- 0s - loss: 0.4606 - acc: 0.7934 - val_loss: 0.4155 - val_acc: 0.8148
Epoch 21/30
- 0s - loss: 0.4588 - acc: 0.7944 - val_loss: 0.4124 - val_acc: 0.8426
Epoch 22/30
- 0s - loss: 0.4568 - acc: 0.7954 - val_loss: 0.4136 - val_acc: 0.8426
Epoch 23/30
- 0s - loss: 0.4571 - acc: 0.7985 - val_loss: 0.4152 - val_acc: 0.8333
Epoch 24/30
- 0s - loss: 0.4585 - acc: 0.7923 - val_loss: 0.4190 - val_acc: 0.8056
Epoch 25/30
- 0s - loss: 0.4577 - acc: 0.7923 - val_loss: 0.4162 - val_acc: 0.8426
Epoch 26/30
- 0s - loss: 0.4610 - acc: 0.7882 - val_loss: 0.4192 - val_acc: 0.8426
Epoch 27/30
- 0s - loss: 0.4553 - acc: 0.8006 - val_loss: 0.4156 - val_acc: 0.8333
Epoch 28/30
- 0s - loss: 0.4580 - acc: 0.7902 - val_loss: 0.4186 - val_acc: 0.7963
Epoch 29/30
- 0s - loss: 0.4590 - acc: 0.7975 - val_loss: 0.4145 - val_acc: 0.8426
Epoch 30/30
- 0s - loss: 0.4550 - acc: 0.7934 - val_loss: 0.4165 - val_acc: 0.8241
# Evaluate on the test split; the result holds the loss followed by the compiled metric (accuracy).
scores = model.evaluate(x=test_Features, y= test_Label)
238/238 [==============================] - 0s 21us/step
scores[1]
0.8025210089042407
# Two hand-made passengers, with values in the same column order as all_df.
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.000, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.000, 'S'])
JR_df = pd.DataFrame(
    [Jack.tolist(), Rose.tolist()],
    columns=[
        'survived', 'name', 'pclass', 'sex', 'age', 'sibsp',
        'parch', 'fare', 'embarked',
    ],
)
# Append both rows to the full data set; their index labels (0 and 1) are kept as-is.
all_df = pd.concat([all_df, JR_df])
# Notebook display: last three rows (~2 == -3).
all_df[-3:]
|
survived |
name |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked |
1308 |
0 |
Zimmerman, Mr. Leo |
3 |
male |
29.0 |
0 |
0 |
7.875 |
S |
0 |
0 |
Jack |
3 |
male |
23.0 |
1 |
0 |
5.000 |
S |
1 |
1 |
Rose |
1 |
female |
20.0 |
1 |
0 |
100.000 |
S |
# Re-run preprocessing on the extended frame (the scaler is re-fit inside PreprocessData).
all_Features, Label = PreprocessData(all_df)
# Model output (sigmoid activation) for every row.
all_probability = model.predict(all_Features)
# Notebook display: first ten predictions.
all_probability[:10]
array([[0.97387624],
[0.36760893],
[0.9653297 ],
[0.29578814],
[0.96136355],
[0.26288155],
[0.93404984],
[0.27685004],
[0.92254674],
[0.30783302]], dtype=float32)
# NOTE(review): this rebinds the name `pd` from the pandas module to the
# DataFrame, so pandas functions are no longer reachable through `pd` after
# this cell. Later cells index `pd` as a frame, so any rename has to be
# applied to all of them together.
pd = all_df
# Append the predictions as a new last column; `pd` and `all_df` are the same
# object, so all_df gains the column as well.
pd.insert(len(all_df.columns),
'probability', all_probability)
# Notebook display: last three rows (~2 == -3).
pd[~2:]
|
survived |
name |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked |
probability |
1308 |
0 |
Zimmerman, Mr. Leo |
3 |
male |
29.0 |
0 |
0 |
7.875 |
S |
0.132631 |
0 |
0 |
Jack |
3 |
male |
23.0 |
1 |
0 |
5.000 |
S |
0.130663 |
1 |
1 |
Rose |
1 |
female |
20.0 |
1 |
0 |
100.000 |
S |
0.963028 |
pd[(pd['survived'] == 0) ]
|
survived |
name |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked |
probability |
2 |
0 |
Allison, Miss. Helen Loraine |
1 |
female |
2.0 |
1 |
2 |
151.5500 |
S |
0.965330 |
3 |
0 |
Allison, Mr. Hudson Joshua Creighton |
1 |
male |
30.0 |
1 |
2 |
151.5500 |
S |
0.295788 |
4 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
1 |
female |
25.0 |
1 |
2 |
151.5500 |
S |
0.961364 |
7 |
0 |
Andrews, Mr. Thomas Jr |
1 |
male |
39.0 |
0 |
0 |
0.0000 |
S |
0.276850 |
9 |
0 |
Artagaveytia, Mr. Ramon |
1 |
male |
71.0 |
0 |
0 |
49.5042 |
C |
0.307833 |
10 |
0 |
Astor, Col. John Jacob |
1 |
male |
47.0 |
1 |
0 |
227.5250 |
C |
0.382211 |
15 |
0 |
Baumann, Mr. John D |
1 |
male |
NaN |
0 |
0 |
25.9250 |
S |
0.303370 |
16 |
0 |
Baxter, Mr. Quigg Edmond |
1 |
male |
24.0 |
0 |
1 |
247.5208 |
C |
0.568902 |
19 |
0 |
Beattie, Mr. Thomson |
1 |
male |
36.0 |
0 |
0 |
75.2417 |
C |
0.418435 |
25 |
0 |
Birnbaum, Mr. Jakob |
1 |
male |
25.0 |
0 |
0 |
26.0000 |
C |
0.446399 |
30 |
0 |
Blackwell, Mr. Stephen Weart |
1 |
male |
45.0 |
0 |
0 |
35.5000 |
S |
0.271255 |
34 |
0 |
Borebank, Mr. John James |
1 |
male |
42.0 |
0 |
0 |
26.5500 |
S |
0.275931 |
38 |
0 |
Brady, Mr. John Bertram |
1 |
male |
41.0 |
0 |
0 |
30.5000 |
S |
0.278998 |
39 |
0 |
Brandeis, Mr. Emil |
1 |
male |
48.0 |
0 |
0 |
50.4958 |
C |
0.364540 |
40 |
0 |
Brewe, Dr. Arthur Jackson |
1 |
male |
NaN |
0 |
0 |
39.6000 |
C |
0.429713 |
45 |
0 |
Butt, Major. Archibald Willingham |
1 |
male |
45.0 |
0 |
0 |
26.5500 |
S |
0.269356 |
46 |
0 |
Cairns, Mr. Alexander |
1 |
male |
NaN |
0 |
0 |
31.0000 |
S |
0.304525 |
51 |
0 |
Carlsson, Mr. Frans Olof |
1 |
male |
33.0 |
0 |
0 |
5.0000 |
S |
0.291429 |
52 |
0 |
Carrau, Mr. Francisco M |
1 |
male |
28.0 |
0 |
0 |
47.1000 |
S |
0.312617 |
53 |
0 |
Carrau, Mr. Jose Pedro |
1 |
male |
17.0 |
0 |
0 |
47.1000 |
S |
0.339315 |
58 |
0 |
Case, Mr. Howard Brown |
1 |
male |
49.0 |
0 |
0 |
26.0000 |
S |
0.260632 |
60 |
0 |
Cavendish, Mr. Tyrell William |
1 |
male |
36.0 |
1 |
0 |
78.8500 |
S |
0.271148 |
62 |
0 |
Chaffee, Mr. Herbert Fuller |
1 |
male |
46.0 |
1 |
0 |
61.1750 |
S |
0.246321 |
70 |
0 |
Chisholm, Mr. Roderick Robert Crispin |
1 |
male |
NaN |
0 |
0 |
0.0000 |
S |
0.297509 |
71 |
0 |
Clark, Mr. Walter Miller |
1 |
male |
27.0 |
1 |
0 |
136.7792 |
C |
0.426140 |
74 |
0 |
Clifford, Mr. George Quincy |
1 |
male |
NaN |
0 |
0 |
52.0000 |
S |
0.309330 |
75 |
0 |
Colley, Mr. Edward Pomeroy |
1 |
male |
47.0 |
0 |
0 |
25.5875 |
S |
0.264827 |
77 |
0 |
Compton, Mr. Alexander Taylor Jr |
1 |
male |
37.0 |
1 |
1 |
83.1583 |
C |
0.365659 |
80 |
0 |
Crafton, Mr. John Bertram |
1 |
male |
NaN |
0 |
0 |
26.5500 |
S |
0.303512 |
81 |
0 |
Crosby, Capt. Edward Gifford |
1 |
male |
70.0 |
1 |
1 |
71.0000 |
S |
0.200277 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
1276 |
0 |
Vander Planke, Mrs. Julius (Emelia Maria Vande... |
3 |
female |
31.0 |
1 |
0 |
18.0000 |
S |
0.390119 |
1278 |
0 |
Vendel, Mr. Olof Edvin |
3 |
male |
20.0 |
0 |
0 |
7.8542 |
S |
0.144151 |
1279 |
0 |
Vestrom, Miss. Hulda Amanda Adolfina |
3 |
female |
14.0 |
0 |
0 |
7.8542 |
S |
0.541694 |
1280 |
0 |
Vovk, Mr. Janko |
3 |
male |
22.0 |
0 |
0 |
7.8958 |
S |
0.141521 |
1281 |
0 |
Waelens, Mr. Achille |
3 |
male |
22.0 |
0 |
0 |
9.0000 |
S |
0.141519 |
1282 |
0 |
Ware, Mr. Frederick |
3 |
male |
NaN |
0 |
0 |
8.0500 |
S |
0.131565 |
1283 |
0 |
Warren, Mr. Charles William |
3 |
male |
NaN |
0 |
0 |
7.5500 |
S |
0.131566 |
1284 |
0 |
Webber, Mr. James |
3 |
male |
NaN |
0 |
0 |
8.0500 |
S |
0.131565 |
1285 |
0 |
Wenzel, Mr. Linhart |
3 |
male |
32.5 |
0 |
0 |
9.5000 |
S |
0.128364 |
1287 |
0 |
Widegren, Mr. Carl/Charles Peter |
3 |
male |
51.0 |
0 |
0 |
7.7500 |
S |
0.107727 |
1288 |
0 |
Wiklund, Mr. Jakob Alfred |
3 |
male |
18.0 |
1 |
0 |
6.4958 |
S |
0.136882 |
1289 |
0 |
Wiklund, Mr. Karl Johan |
3 |
male |
21.0 |
1 |
0 |
6.4958 |
S |
0.133120 |
1291 |
0 |
Willer, Mr. Aaron ("Abi Weller") |
3 |
male |
NaN |
0 |
0 |
8.7125 |
S |
0.131564 |
1292 |
0 |
Willey, Mr. Edward |
3 |
male |
NaN |
0 |
0 |
7.5500 |
S |
0.131566 |
1293 |
0 |
Williams, Mr. Howard Hugh "Harry" |
3 |
male |
NaN |
0 |
0 |
8.0500 |
S |
0.131565 |
1294 |
0 |
Williams, Mr. Leslie |
3 |
male |
28.5 |
0 |
0 |
16.1000 |
S |
0.133237 |
1295 |
0 |
Windelov, Mr. Einar |
3 |
male |
21.0 |
0 |
0 |
7.2500 |
S |
0.142832 |
1296 |
0 |
Wirz, Mr. Albert |
3 |
male |
27.0 |
0 |
0 |
8.6625 |
S |
0.135120 |
1297 |
0 |
Wiseman, Mr. Phillippe |
3 |
male |
NaN |
0 |
0 |
7.2500 |
S |
0.131567 |
1298 |
0 |
Wittevrongel, Mr. Camille |
3 |
male |
36.0 |
0 |
0 |
9.5000 |
S |
0.124216 |
1299 |
0 |
Yasbeck, Mr. Antoni |
3 |
male |
27.0 |
1 |
0 |
14.4542 |
C |
0.161984 |
1301 |
0 |
Youseff, Mr. Gerious |
3 |
male |
45.5 |
0 |
0 |
7.2250 |
C |
0.147109 |
1302 |
0 |
Yousif, Mr. Wazli |
3 |
male |
NaN |
0 |
0 |
7.2250 |
C |
0.169266 |
1303 |
0 |
Yousseff, Mr. Gerious |
3 |
male |
NaN |
0 |
0 |
14.4583 |
C |
0.169295 |
1304 |
0 |
Zabour, Miss. Hileni |
3 |
female |
14.5 |
1 |
0 |
14.4542 |
C |
0.674486 |
1305 |
0 |
Zabour, Miss. Thamine |
3 |
female |
NaN |
1 |
0 |
14.4542 |
C |
0.603369 |
1306 |
0 |
Zakarian, Mr. Mapriededer |
3 |
male |
26.5 |
0 |
0 |
7.2250 |
C |
0.174369 |
1307 |
0 |
Zakarian, Mr. Ortin |
3 |
male |
27.0 |
0 |
0 |
7.2250 |
C |
0.173603 |
1308 |
0 |
Zimmerman, Mr. Leo |
3 |
male |
29.0 |
0 |
0 |
7.8750 |
S |
0.132631 |
0 |
0 |
Jack |
3 |
male |
23.0 |
1 |
0 |
5.0000 |
S |
0.130663 |
810 rows × 10 columns
pd[(pd['survived'] == 0) & (pd['probability'] > 0.9)]
|
survived |
name |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked |
probability |
2 |
0 |
Allison, Miss. Helen Loraine |
1 |
female |
2.0 |
1 |
2 |
151.5500 |
S |
0.965330 |
4 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
1 |
female |
25.0 |
1 |
2 |
151.5500 |
S |
0.961364 |
105 |
0 |
Evans, Miss. Edith Corse |
1 |
female |
36.0 |
0 |
0 |
31.6792 |
C |
0.973539 |
169 |
0 |
Isham, Miss. Ann Elizabeth |
1 |
female |
50.0 |
0 |
0 |
28.7125 |
C |
0.971705 |
286 |
0 |
Straus, Mrs. Isidor (Rosalie Ida Blun) |
1 |
female |
63.0 |
1 |
0 |
221.7792 |
S |
0.954021 |
pd[:5]
|
survived |
name |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked |
probability |
0 |
1 |
Allen, Miss. Elisabeth Walton |
1 |
female |
29.0000 |
0 |
0 |
211.3375 |
S |
0.973876 |
1 |
1 |
Allison, Master. Hudson Trevor |
1 |
male |
0.9167 |
1 |
2 |
151.5500 |
S |
0.367609 |
2 |
0 |
Allison, Miss. Helen Loraine |
1 |
female |
2.0000 |
1 |
2 |
151.5500 |
S |
0.965330 |
3 |
0 |
Allison, Mr. Hudson Joshua Creighton |
1 |
male |
30.0000 |
1 |
2 |
151.5500 |
S |
0.295788 |
4 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
1 |
female |
25.0000 |
1 |
2 |
151.5500 |
S |
0.961364 |