import urllib.request
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense, Dropout
# Download the Titanic dataset if it is not already cached locally.
# NOTE(review): the original biostat.mc.vanderbilt.edu wiki URL is dead;
# the same titanic3.xls is now hosted on hbiostat.org (Frank Harrell's site).
url = "https://hbiostat.org/data/repo/titanic3.xls"
filepath = "data/titanic3.xls"
if not os.path.isfile(filepath):
    # Ensure the target directory exists — urlretrieve does not create it.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)
# Load the spreadsheet into a DataFrame.
all_df = pd.read_excel(filepath)
# Keep only the columns used by the model.
cols = ['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
all_df = all_df[cols]
# Random ~80/20 train/test split.
# NOTE(review): no RNG seed is set, so the split differs between runs.
msk = np.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]
print('total:', len(all_df),
      'train:', len(train_df),
      'test:', len(test_df))
def PreprocessData(raw_df):
    """Convert a raw Titanic DataFrame into (scaled features, labels).

    Steps: drop the 'name' column, impute missing age/fare with each
    column's mean, encode sex as 1 (male) / 0 (female), one-hot encode
    the 'embarked' port, then min-max scale all features to [0, 1].

    Args:
        raw_df: DataFrame whose first column is 'survived', plus the
            name/pclass/sex/age/sibsp/parch/fare/embarked columns.

    Returns:
        (scaledFeatures, label): feature ndarray scaled to [0, 1] and
        the 'survived' column as a 1-D ndarray.
    """
    # 'name' is a free-text identifier; it carries no usable signal here.
    df = raw_df.drop(['name'], axis=1)
    # Impute missing ages with the mean age.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    # BUG FIX: the original filled missing fares with age_mean.
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    # Encode sex: male -> 1, female -> 0.
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    # One-hot encode the embarkation port (C/Q/S -> one column each).
    x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])
    # Convert to ndarray; column 0 is 'survived' (the label),
    # the remaining columns are the features.
    ndarray = x_OneHot_df.values
    label = ndarray[:, 0]
    feature = ndarray[:, 1:]
    # Scale every feature into the [0, 1] range.
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(feature)
    return scaledFeatures, label
# Preprocess both splits into (feature, label) arrays.
train_feature, train_label = PreprocessData(train_df)
test_feature, test_label = PreprocessData(test_df)

# Multilayer perceptron: 9 inputs -> 10 relu -> 30 relu -> 1 sigmoid.
layer_specs = [
    dict(units=10, input_dim=9, activation='relu'),
    dict(units=30, activation='relu'),
    dict(units=1, activation='sigmoid'),
]
model = Sequential()
for spec in layer_specs:
    model.add(Dense(kernel_initializer='uniform', **spec))

# Binary cross-entropy loss, Adam optimizer, track accuracy.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train for 30 epochs, holding out 10% of the training data as validation.
train_history = model.fit(x=train_feature,
                          y=train_label,
                          validation_split=0.1,
                          epochs=30,
                          batch_size=30,
                          verbose=2)
# Optional: plot train/validation loss and accuracy with a helper module.
#import show_train_history as sth
#sth.show_train_history(train_history,'acc','val_acc')
#sth.show_train_history(train_history,'loss','val_loss')
# Score the model on the held-out test set.
scores = model.evaluate(x=test_feature,
                        y=test_label)
# BUG FIX: the original computed scores but never reported them.
print('test loss:', scores[0], 'test accuracy:', scores[1])
# Predict survival probability for two synthetic passengers (Jack and Rose).
Jack = pd.Series([0, 'jack', 3, 'male', 23, 1, 0, 5.000, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.000, 'S'])
JR_df = pd.DataFrame([list(Jack), list(Rose)],
                     columns=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked'])
all_df = pd.concat([all_df, JR_df])               # append the two passengers
all_feature, all_label = PreprocessData(all_df)   # preprocess the combined data
# BUG FIX: all_probability was referenced below but never computed.
all_probability = model.predict(all_feature)
# Append a 'probability' column with the predicted survival probability.
# BUG FIX: the original rebound the name `pd` to the DataFrame, shadowing
# the pandas module; use a distinct variable instead.
result_df = all_df.copy()
# predict() returns shape (n, 1); ravel to a 1-D column for insert().
result_df.insert(len(result_df.columns), 'probability', all_probability.ravel())
# Jack and Rose occupy the last two rows.
print(result_df[-2:])
# Passengers the model rates as very likely to survive but who did not.
print(result_df[(result_df['survived'] == 0) & (result_df['probability'] > 0.9)])