# Keras多层感知机预测泰坦尼克号旅客生存概率
# (Keras multilayer perceptron predicting Titanic passenger survival probability)

import urllib.request
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Fetch the Titanic spreadsheet once and cache it locally; later runs reuse the file.
# NOTE(review): this vanderbilt.edu URL has historically gone offline — confirm it
# still resolves, and that the local "data/" directory exists before running.
url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath = "data/titanic3.xls"
already_cached = os.path.isfile(filepath)
if not already_cached:
    download_result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', download_result)

# Load the spreadsheet and keep only the columns used for modelling.
all_df = pd.read_excel(filepath)
selected_columns = ['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
all_df = all_df[selected_columns]

# Random ~80/20 train/test split. The RNG is not seeded, so the split
# (and the printed counts) change from run to run.
mask = np.random.rand(len(all_df)) < 0.8
train_df, test_df = all_df[mask], all_df[~mask]

print('total:', len(all_df),
      'train:', len(train_df),
      'test:', len(test_df))

#函数:数据处理
def PreprocessData(raw_df):
    """Turn the raw Titanic DataFrame into model-ready arrays.

    Expects columns: survived, name, pclass, sex, age, sibsp, parch,
    fare, embarked (survived first).

    Returns:
        (scaledFeatures, label): features min-max scaled to [0, 1] as a
        float64 ndarray, and the 'survived' column as a 1-D array.
    """
    # Drop the free-text name field — no predictive structure used here.
    df = raw_df.drop(['name'], axis=1)

    # Fill missing ages and fares with their own column means.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    # BUG FIX: fare NaNs were previously filled with age_mean.
    df['fare'] = df['fare'].fillna(fare_mean)

    # Encode sex numerically: male -> 1, female -> 0.
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

    # One-hot encode the embarkation port (dummy columns appended at the end).
    x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Split into label (first column, 'survived') and feature matrix.
    ndarray = x_OneHot_df.values
    label = ndarray[:, 0]
    feature = ndarray[:, 1:].astype('float64')

    # Min-max scale each feature column to [0, 1]. This reproduces
    # sklearn's MinMaxScaler(feature_range=(0, 1)), including its
    # handling of constant columns (zero range -> divide by 1 -> all 0),
    # without needing sklearn at all.
    col_min = feature.min(axis=0)
    col_range = feature.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0
    scaledFeatures = (feature - col_min) / col_range

    return scaledFeatures, label

# Preprocess the train/test splits into scaled feature matrices and label vectors.
train_feature, train_label = PreprocessData(train_df)
test_feature, test_label = PreprocessData(test_df)

# Build the MLP: 9 inputs -> 10 relu -> 30 relu -> 1 sigmoid (survival probability).
# input_dim=9 matches the feature count when all three embarkation ports
# (C, Q, S) appear in the data.
model = Sequential()
model.add(Dense(10, input_dim=9, activation='relu', kernel_initializer='uniform'))
model.add(Dense(30, activation='relu', kernel_initializer='uniform'))
model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

# Binary cross-entropy + adam: standard setup for a two-class problem.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train, holding out 10% of the training split for validation.
train_history = model.fit(x=train_feature, y=train_label,
                          validation_split=0.1,
                          epochs=30, batch_size=30, verbose=2)

# Optional: plot loss/accuracy curves for train vs. validation with a helper module.
# import show_train_history as sth
# sth.show_train_history(train_history, 'acc', 'val_acc')
# sth.show_train_history(train_history, 'loss', 'val_loss')

# Score the model on the held-out test split.
scores = model.evaluate(x=test_feature, y=test_label)

# Predict survival probability for two hypothetical passengers (Jack and Rose).
Jack = pd.Series([0,'jack',3,'male',23,1,0,5.000,'S'])
Rose = pd.Series([1,'Rose',1,'female',20,1,0,100.000,'S'])
JR_df = pd.DataFrame([list(Jack), list(Rose)],
                     columns=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked'])
all_df = pd.concat([all_df, JR_df])  # append the two new rows to the full dataset
all_feature, all_label = PreprocessData(all_df)  # preprocess everything together

# BUG FIX: all_probability was used below but never computed.
# predict() returns an (n, 1) array; flatten to 1-D so it can be
# inserted as a DataFrame column.
all_probability = model.predict(all_feature).flatten()

# Append a 'probability' column with each passenger's predicted survival
# probability. BUG FIX: the original rebound the name `pd`, shadowing the
# pandas module; use a distinct name instead.
result_df = all_df
result_df.insert(len(result_df.columns), 'probability', all_probability)

# Jack and Rose are the last two rows. Bare expressions print nothing in a
# script, so print explicitly.
print(result_df[-2:])

# Passengers predicted very likely to survive (>0.9) who actually did not.
print(result_df[(result_df['survived'] == 0) & (result_df['probability'] > 0.9)])

# 你可能感兴趣的:(Keras)  (blog footer: "you may also be interested in: Keras")