引入必要库
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import random
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from pandas import DataFrame
读取源文件并打印
在这部分,我们接触了基本的csv操作,并显示结果。
我们读入kaggle上下载的train.csv文件,并展示内容
# Path to the Kaggle Titanic training data, relative to this script.
trainFilePath = './train.csv'
# Number of training rows (excluding the header); filled in by testCSV().
trainSize = 0
def testCSV(filePath):
with open(filePath, 'rb') as trainFile:
global trainSize
csvReader = csv.reader(trainFile)
dataList = [data for data in csvReader]
df = DataFrame(dataList[1:], columns=dataList[0])
trainSize = len(df)
print(df)
print("trainSize", trainSize)
testCSV(trainFilePath)
读取源文件并提取数据,建立神经网络
在这部分,我们读取源文件中的性别,阶级,船费以及SibSp,用于拟合最终的生存概率
然后我们建立一个总共5层,中间3层的神经网络,神经元的个数分别是4-10-20-10-2。
然后运行读取函数。
def readTrainDataCSV(filePath):
    """Load training features/labels from train.csv and build the classifier.

    Side effects: sets the module-level ``trainData`` (float32, shape (n, 4):
    Pclass, Sex, SibSp, Fare), ``targetData`` (int32, shape (n, 1): Survived)
    and creates the global ``classifier`` DNN (hidden layers 10-20-10,
    2 output classes).
    """
    global trainData, targetData, classifier
    # Python 3: csv.reader needs a text-mode file opened with newline=''
    # (the original 'rb' mode only worked on Python 2).
    with open(filePath, 'r', newline='') as trainFile:
        csvReader = csv.reader(trainFile)
        dataList = [data for data in csvReader]
    dataSize = len(dataList) - 1  # first row is the header
    trainData = np.ndarray((dataSize, 4), dtype=np.float32)
    targetData = np.ndarray((dataSize, 1), dtype=np.int32)
    trainDataFrame = DataFrame(dataList[1:], columns=dataList[0])
    trainDataFrame_filter = trainDataFrame.loc[:, ['Pclass', 'Sex', 'SibSp', 'Fare', 'Survived']]
    for i in range(dataSize):
        thisData = np.array(trainDataFrame_filter.iloc[i])
        Pclass, Sex, SibSp, Fare, Survived = thisData
        Pclass = float(Pclass)
        # Encode sex as 0 = female, 1 = anything else (i.e. male).
        Sex = 0 if Sex == 'female' else 1
        SibSp = float(SibSp)
        # Guard against missing fares, consistent with readTestDataCSV.
        Fare = 0 if Fare == '' else float(Fare)
        Survived = int(Survived)
        print(Pclass, Sex, SibSp, Fare, Survived)
        trainData[i, :] = [Pclass, Sex, SibSp, Fare]
        targetData[i, :] = [Survived]
        print(thisData)
    print(trainData)
    print(targetData)
    # 4 input features -> hidden layers 10-20-10 -> 2 classes.
    feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
    classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                                hidden_units=[10, 20, 10],
                                                n_classes=2)
    # model_dir="/tmp/titanic_model")
readTrainDataCSV(trainFilePath)
创建输入数据
我们将训练数据和标签包装成一个二元组,并返回
def get_train_inputs():
    """Wrap the global training arrays as TensorFlow constant tensors.

    Returns a (features, labels) pair, as expected by classifier.fit.
    """
    features = tf.constant(trainData)
    labels = tf.constant(targetData)
    print(features)
    print(labels)
    return features, labels

# Sanity-run once at import time.
get_train_inputs()
训练数据
我们开始训练神经网络
def train():
    """Fit the global classifier on the training inputs for 2000 steps."""
    training_steps = 2000
    classifier.fit(input_fn=get_train_inputs, steps=training_steps)

train()
检查准确度
我们使用整个数据集来查看准确度。注意,我们应该使用验证集来完成这件事。但是由于我们只是用来演示,所以就算了
# Evaluate on the training data itself (no held-out validation set here).
eval_results = classifier.evaluate(input_fn=get_train_inputs, steps=1)
accuracy_score = eval_results["accuracy"]
print("accuracy:", accuracy_score)
读入测试集,并输出结果
在这一部分,我们将读入kaggle中的数据,并输出到文件中,最终提交官网
testFilePath = './test.csv'
def readTestDataCSV(filePath):
    """Load test-set features from test.csv.

    Side effects: sets the module-level ``testData`` (float32, shape (n, 4):
    Pclass, Sex, SibSp, Fare) and ``PassengerIdStart`` (first PassengerId in
    the file, used to number the predictions).

    Returns ``testData`` so callers can inspect it directly.
    """
    global testData, PassengerIdStart
    # Python 3: csv.reader needs a text-mode file opened with newline=''
    # (the original 'rb' mode only worked on Python 2).
    with open(filePath, 'r', newline='') as testFile:
        csvReader = csv.reader(testFile)
        dataList = [data for data in csvReader]
    dataSize = len(dataList) - 1  # first row is the header
    trainDataFrame = DataFrame(dataList[1:], columns=dataList[0])
    trainDataFrame_filter = trainDataFrame.loc[:, ['Pclass', 'Sex', 'SibSp', 'Fare']]
    testData = np.ndarray((dataSize, 4), dtype=np.float32)
    PassengerIdStart = int(trainDataFrame['PassengerId'][0])
    print('PassengerId', PassengerIdStart)
    for i in range(dataSize):
        thisData = np.array(trainDataFrame_filter.iloc[i])
        Pclass, Sex, SibSp, Fare = thisData
        Pclass = float(Pclass)
        # Encode sex as 0 = female, 1 = anything else (i.e. male).
        Sex = 0 if Sex == 'female' else 1
        SibSp = float(SibSp)
        # test.csv contains missing fares; treat them as 0.
        Fare = 0 if Fare == '' else float(Fare)
        print(Pclass, Sex, SibSp, Fare)
        testData[i, :] = [Pclass, Sex, SibSp, Fare]
        print(thisData)
    print(testData)
    return testData
def testData_samples():
    # Input fn for classifier.predict: hands back the prepared feature matrix.
    # NOTE(review): tf.contrib.learn input_fns normally return tensors; this
    # relies on the library accepting a raw numpy array — confirm.
    return testData
readTestDataCSV(testFilePath)
# Run the trained model over the test features and collect the class labels.
predictions = list(classifier.predict(input_fn=testData_samples))
print(predictions)

# Python 3: the csv module wants a text-mode file opened with newline=''
# (the original 'wb' mode only worked on Python 2).
with open('predictions.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, dialect='excel')
    writer.writerow(['PassengerId', 'Survived'])
    # PassengerIds in test.csv are consecutive, starting at PassengerIdStart.
    for offset, survived in enumerate(predictions):
        writer.writerow([PassengerIdStart + offset, survived])
最终在只使用了4个特征值的情况下,准确率有75%。接下来的目标是将其他数据进行利用。