Competition: https://tianchi.aliyun.com/competition/entrance/231633/information
Original Jupyter notebook (Baidu netdisk): https://pan.baidu.com/s/1guIwAlk9zvI3ULcZ8kkBnw  extraction code: g0xs
The focus here is on the data-processing methods and on how the neural network is built; the business logic is secondary.
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns  # installed via conda after installing Anaconda; see the Kaggle post in my bookmarks
import pylab
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from pandas import Series,DataFrame
from tensorflow.python.framework import ops
# Read the data from the CSV file; train_data is a DataFrame.
train_data = pd.read_csv('C:/Users/FrankFang/Desktop/TianChiDATA/Train.csv')
train_data.info()  # print basic information about the loaded CSV data
print('train_data({0[0]},{0[1]})'.format(train_data.shape))  # the header row is not counted in the row total
# head() shows the first 5 rows by default. It must be the last statement in a
# Jupyter cell to render as a table; print(train_data.head()) also works, but
# without the table formatting.
train_data.head()
Output: train_data(500,8029)

|   | ID    | TOOL_ID | 210X1  | 210X2 | 210X3 | 210X4 | 210X5 | 210X6 | 210X7 | 210X8 | ... | 750X1444 | 750X1445 | 750X1446 | 750X1447 | 750X1448 | 750X1449 | 750X1450 | 750X1451 | 750X1452     | Y        |
|---|-------|---------|--------|-------|-------|-------|-------|-------|-------|-------|-----|----------|----------|----------|----------|----------|----------|----------|----------|--------------|----------|
| 0 | ID001 | N       | 102.05 | 0.465 | 0.27  | 1.430 | 67.45 | 4.620 | -0.54 | -1.05 | ... | 0.00072  | 0.00072  | 25.7     | 0.00072  | 0.00072  | 25.7     | 0.00072  | 0.00072  | 2.400000e+12 | 2.945079 |
| 1 | ID002 | M       | 100.95 | 0.805 | 0.22  | 3.477 | 62.08 | 3.412 | -2.12 | 1.02  | ... | 0.00072  | 0.00072  | 25.5     | 0.00072  | 0.00072  | 25.5     | 0.00072  | 0.00072  | 2.400000e+12 | 2.955092 |
| 2 | ID003 | L       | 98.56  | 0.555 | 0.24  | 1.172 | 56.70 | 3.080 | -2.25 | 0.88  | ... | 0.00064  | 0.00064  | 25.2     | 0.00064  | 0.00064  | 25.2     | 0.00064  | 0.00064  | 2.400000e+12 | 2.741264 |
| 3 | ID004 | M       | 100.35 | 0.901 | 0.22  | 3.631 | 62.25 | 3.949 | -1.98 | 0.82  | ... | 0.00072  | 0.00072  | 26.4     | 0.00072  | 0.00072  | 26.4     | 0.00072  | 0.00072  | 2.400000e+12 | 2.799336 |
| 4 | ID005 | M       | 100.25 | 0.854 | 0.23  | 3.429 | 61.42 | 3.630 | -1.89 | 1.02  | ... | 0.00072  | 0.00072  | 26.4     | 0.00072  | 0.00072  | 26.4     | 0.00072  | 0.00072  | 2.400000e+12 | 2.692093 |

5 rows × 8029 columns
# Shuffle the rows of the training set.
# Note: sample() here is pandas' DataFrame.sample, not random.sample, and it
# returns a new, shuffled DataFrame instead of shuffling in place, so the
# result must be assigned back or the shuffle is silently discarded.
train_data = train_data.sample(frac=1).reset_index(drop=True)
Handle columns with incomplete data: if a column contains missing values, fill them with that column's median (see the fillna loop below). The non-numeric columns are: Tool, Tool (#2), tool (#1), TOOL, TOOL (#1), TOOL (#2), TOOL_ID, TOOL_ID (#1), TOOL_ID (#2), TOOL_ID (#3). They are mapped to numeric codes first:
train_data["Tool"].loc[train_data["Tool"] == "A"] = 1.0
train_data["Tool"].loc[train_data["Tool"] == "B"] = -1.0
train_data["Tool (#2)"].loc[train_data["Tool (#2)"] == "A"] = 1.0
train_data["Tool (#2)"].loc[train_data["Tool (#2)"] == "B"] = 0.0
train_data["Tool (#2)"].loc[train_data["Tool (#2)"] == "C"] = -1.0
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "P"] = 1.2
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "Q"] = 1.0
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "R"] = 0.8
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "S"] = 0.6
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "T"] = 0.4
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "U"] = 0.2
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "V"] = 0.0
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "W"] = -0.2
train_data["tool (#1)"].loc[train_data["tool (#1)"] == "X"] = -0.4
train_data["TOOL"].loc[train_data["TOOL"] == "B"] = 1.0
train_data["TOOL"].loc[train_data["TOOL"] == "C"] = 0.0
train_data["TOOL"].loc[train_data["TOOL"] == "D"] = -1.0
train_data["TOOL (#1)"].loc[train_data["TOOL (#1)"] == "XY1"] = 1.0
train_data["TOOL (#1)"].loc[train_data["TOOL (#1)"] == "YX1"] = -1.0
train_data["TOOL (#2)"].loc[train_data["TOOL (#2)"] == "A"] = 1.0
train_data["TOOL (#2)"].loc[train_data["TOOL (#2)"] == "B"] = -1.0
#print((train_data["TOOL (#2)"][7]))
train_data["TOOL_ID"].loc[train_data["TOOL_ID"] == "J"] = 1.0
train_data["TOOL_ID"].loc[train_data["TOOL_ID"] == "K"] = 0.7
train_data["TOOL_ID"].loc[train_data["TOOL_ID"] == "L"] = 0.4
train_data["TOOL_ID"].loc[train_data["TOOL_ID"] == "M"] = 0.1
train_data["TOOL_ID"].loc[train_data["TOOL_ID"] == "N"] = -0.2
train_data["TOOL_ID"].loc[train_data["TOOL_ID"] == "O"] = -0.5
train_data["TOOL_ID (#1)"].loc[train_data["TOOL_ID (#1)"] == "E"] = 1.0
train_data["TOOL_ID (#1)"].loc[train_data["TOOL_ID (#1)"] == "N"] = -1.0
train_data["TOOL_ID (#2)"].loc[train_data["TOOL_ID (#2)"] == "C"] = 1.0
train_data["TOOL_ID (#2)"].loc[train_data["TOOL_ID (#2)"] == "D"] = 0.0
train_data["TOOL_ID (#2)"].loc[train_data["TOOL_ID (#2)"] == "E"] = -1.0
train_data["TOOL_ID (#3)"].loc[train_data["TOOL_ID (#3)"] == "E0"] = 1.0
train_data["TOOL_ID (#3)"].loc[train_data["TOOL_ID (#3)"] == "N0"] = -1.0
for i in range(1, 8028):  # i = 1, 2, ..., 8027 (every column except ID and Y)
    train_data[train_data.columns[i]].fillna(train_data[train_data.columns[i]].median(), inplace=True)
# If every value in a column is identical, drop the column. Do not drop the
# tool columns: the "tool" substring is used later to detect them by name.
Dropped_columnNames = []  # record the dropped column names; the same columns must be dropped from the test data
a = 0
train_data_temp = train_data
for i in range(1, 8028):
    CloumnName = train_data.columns[i]
    if (0 == train_data[CloumnName].max() - train_data[CloumnName].min()) and (not ("tool" in CloumnName.lower())):
        if a != 0:  # keep the first constant column found
            train_data_temp = train_data_temp.drop([CloumnName], axis=1)
            Dropped_columnNames.append(CloumnName)
        a = a + 1
        # print(CloumnName)
        # print(train_data[CloumnName].max() - train_data[CloumnName].min())
train_data = train_data_temp
print(a)
train_data_rows = train_data.shape[0]    # shape[0] is the number of rows
train_data_colums = train_data.shape[1]  # shape[1] is the number of columns, still including the ID and Y columns
print(train_data_rows, train_data_colums)
# Drop the ID column from train_data.
train_data = train_data.drop(["ID"], axis=1)
# Compute the correlation coefficient of each remaining column with the Y column.
train_data.head()
train_data_coorY = train_data.corrwith(train_data["Y"]).abs()
train_data_coorY = train_data_coorY.sort_values(ascending=False)  # sort in descending order
# print(train_data_coorY)
# Keep the columns whose correlation with Y ranks in the top x
# (note: the first entry of train_data_coorY is Y itself):
Column_names_high_corr=[]
Num_selectd_colums=50
for i in range(1, Num_selectd_colums+1):
    if not ("tool" in train_data_coorY.index[i].lower()):
        Column_names_high_corr.append(train_data_coorY.index[i])
# Add back all of the tool columns.
for i in range(1, train_data_colums-1):
    CloumnName = train_data.columns[i]
    if "tool" in CloumnName.lower():
        Column_names_high_corr.append(CloumnName)
Num_selectd_colums = len(Column_names_high_corr)
# Put "Y" back at the end.
Column_names_high_corr.append("Y")
# Keep only the columns listed in Column_names_high_corr:
print(train_data.shape)
train_data = train_data[Column_names_high_corr]
print(train_data.shape)
train_data.head()
PCA, based on the sklearn package (imported above). Note that array and NewArray below are placeholders for the actual data matrices:
m = PCA(n_components=0.9)  # keep 90% of the explained variance
array_PCA = m.fit_transform(array)  # array is the original matrix (NumPy array or pandas DataFrame); this step also stores the fitted PCA model in m, and array_PCA is the compressed data
# Apply the fitted model to new data:
NewArray_PCA = m.transform(NewArray)
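For concreteness, here is a minimal runnable sketch of the same pattern applied to the feature columns selected above. The names features and m_pca are introduced only for illustration, and the notebook itself does not actually apply PCA to train_data in this section:

# A minimal sketch: fit PCA on the selected training features and reuse the
# fitted model for any later data (e.g. the test set).
features = train_data.drop(["Y"], axis=1).values  # numeric feature matrix, Y removed
m_pca = PCA(n_components=0.9)                     # keep 90% of the explained variance
features_pca = m_pca.fit_transform(features)      # fit on, and compress, the training features
print(features.shape, "->", features_pca.shape)
# test_features_pca = m_pca.transform(test_features)  # test_features is hypothetical here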
# Split the training set into two parts, dropping the last column (Y).
# Use the current shape here: the train_data_rows/train_data_colums computed
# earlier are stale after the column selection above, and a stale column count
# would let the iloc slice clip and accidentally keep the Y column.
train_data_rows, train_data_colums = train_data.shape
train_data_part1 = train_data.iloc[0:int(train_data_rows*0.8), 0:train_data_colums-1]
train_data_part2 = train_data.iloc[int(train_data_rows*0.8):train_data_rows, 0:train_data_colums-1]
# Convert train_data_part1 and train_data_part2 to NumPy arrays so that NumPy broadcasting can be used below.
train_data_part1 = np.array(train_data_part1)
train_data_part2 = np.array(train_data_part2)
#print(train_data_part1[0:2,0:3])
train_data_part1_rows = train_data_part1.shape[0]
train_data_part1_colums = train_data_part1.shape[1]
train_data_part2_rows = train_data_part2.shape[0]
train_data_part2_colums = train_data_part2.shape[1]
print(type(train_data_part1))
print(train_data_part1_rows,train_data_part1_colums)
print(train_data_part2_rows,train_data_part2_colums)
# Compute and keep the per-column mean, max, and min of train_data_part1
# (train_data_part1 no longer contains the ID and Y columns).
train_data_part1_mean = train_data_part1.mean(axis=0).reshape(1, train_data_part1_colums)  # column-wise mean
train_data_part1_max = train_data_part1.max(axis=0).reshape(1, train_data_part1_colums)    # column-wise max
train_data_part1_min = train_data_part1.min(axis=0).reshape(1, train_data_part1_colums)    # column-wise min
train_data_part1_maxMinumMin = train_data_part1_max - train_data_part1_min  # column-wise max-min, kept for later
train_data_part1_maxMinumMin = train_data_part1_maxMinumMin + 0.0001  # avoid division by zero
train_data_part1_std = train_data_part1.std(axis=0).reshape(1, train_data_part1_colums) + 0.0001  # column-wise std
print(train_data_part1_mean.shape)
# Rescale each column. The max-min variant x = (x - mean)/(max - min) is kept
# commented out; the code actually standardizes: x = (x - mean)/std.
print(train_data_part1_mean.shape)
print(type(train_data_part1))
train_data_part1 = train_data_part1 - train_data_part1_mean  # broadcasting only works on np arrays!
# train_data_part1 = train_data_part1 / train_data_part1_maxMinumMin  # max-min variant
train_data_part1 = train_data_part1 / train_data_part1_std
print(train_data_part1.shape)
# Apply the *same* part1 statistics to part2, so both parts share one scale.
train_data_part2 = train_data_part2 - train_data_part1_mean
# train_data_part2 = train_data_part2 / train_data_part1_maxMinumMin
train_data_part2 = train_data_part2 / train_data_part1_std
print(train_data_part2.shape)
The overall idea: first build the forward-propagation graph, in which the operands are TensorFlow placeholders; then, during actual training, feed real data in place of the placeholders. A tiny sketch of this pattern follows.
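As a minimal illustration of the placeholder/feed_dict mechanism (TensorFlow 1.x graph mode, independent of the project code; the names a and b are made up here):

# Build a graph from a placeholder, then feed real data at run time.
a = tf.placeholder(tf.float32, shape=[None, 2])  # shape filled in at run time
b = tf.reduce_sum(a * 2.0)                       # graph node built from the placeholder
with tf.Session() as sess:
    # the placeholder is replaced by actual data via feed_dict
    print(sess.run(b, feed_dict={a: [[1.0, 2.0], [3.0, 4.0]]}))  # -> 20.0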
def create_placeholders(Feature_Num):
    # Define the placeholders.
    x = tf.placeholder('float', shape=[None, Feature_Num])
    # labels
    y = tf.placeholder('float', shape=[None, 1])
    # keep_prob is the dropout keep probability of the neurons
    keep_prob = tf.placeholder(tf.float32)
    keep_learnRate = tf.placeholder(tf.float32)
    return x, y, keep_prob, keep_learnRate
def initialize_parameters(Feature_Num,Layer1_NodeNum,Layer2_NodeNum,Layer3_NodeNum,Layer4_NodeNum,Layer5_NodeNum,Layer6_NodeNum):
    """
    Initializes the weight parameters of the network with TensorFlow.
    Returns:
    parameters -- a dictionary of tensors containing W1..W6 and Bias1..Bias6
    """
    # Alternative: Xavier initialization via tf.get_variable, e.g.
    # W1 = tf.get_variable("W1", [Feature_Num, Layer1_NodeNum],
    #                      initializer=tf.contrib.layers.xavier_initializer(seed=0))
    # (and likewise for the other weights and biases).
    tf.set_random_seed(1)
    W1 = tf.Variable(tf.truncated_normal([Feature_Num, Layer1_NodeNum], stddev=0.1))
    W2 = tf.Variable(tf.truncated_normal([Layer1_NodeNum, Layer2_NodeNum], stddev=0.1))
    W3 = tf.Variable(tf.truncated_normal([Layer2_NodeNum, Layer3_NodeNum], stddev=0.1))
    W4 = tf.Variable(tf.truncated_normal([Layer3_NodeNum, Layer4_NodeNum], stddev=0.1))
    W5 = tf.Variable(tf.truncated_normal([Layer4_NodeNum, Layer5_NodeNum], stddev=0.1))
    W6 = tf.Variable(tf.truncated_normal([Layer5_NodeNum, Layer6_NodeNum], stddev=0.1))
    Bias1 = tf.Variable(tf.truncated_normal([Layer1_NodeNum], stddev=0.1))
    Bias2 = tf.Variable(tf.truncated_normal([Layer2_NodeNum], stddev=0.1))
    Bias3 = tf.Variable(tf.truncated_normal([Layer3_NodeNum], stddev=0.1))
    Bias4 = tf.Variable(tf.truncated_normal([Layer4_NodeNum], stddev=0.1))
    Bias5 = tf.Variable(tf.truncated_normal([Layer5_NodeNum], stddev=0.1))
    Bias6 = tf.Variable(tf.truncated_normal([Layer6_NodeNum], stddev=0.1))
    parameters = {"W1": W1, "W2": W2, "W3": W3, "W4": W4, "W5": W5, "W6": W6,
                  "Bias1": Bias1, "Bias2": Bias2, "Bias3": Bias3,
                  "Bias4": Bias4, "Bias5": Bias5, "Bias6": Bias6}
    return parameters
Arguments: the two placeholders plus parameters; returns the predicted values.
def forward_propagation(x, parameters, keep_prob):
    """
    Arguments:
    x -- input dataset placeholder, of shape (batch size, Feature_Num)
    parameters -- python dictionary containing the parameters "W1".."W6" and
                  "Bias1".."Bias6"; the shapes are given in initialize_parameters
    keep_prob -- dropout keep probability
    Returns:
    A6 -- the output of the last LINEAR unit
    """
    # Retrieve the parameters from the dictionary "parameters".
    W1 = parameters['W1']
    W2 = parameters['W2']
    W3 = parameters['W3']
    W4 = parameters['W4']
    W5 = parameters['W5']
    W6 = parameters['W6']
    Bias1 = parameters["Bias1"]
    Bias2 = parameters["Bias2"]
    Bias3 = parameters["Bias3"]
    Bias4 = parameters["Bias4"]
    Bias5 = parameters["Bias5"]
    Bias6 = parameters["Bias6"]
    # Each layer is linear followed by dropout; the nonlinear activations
    # (tanh/relu) are kept commented out as alternatives.
    Z1 = tf.matmul(x, W1) + Bias1  # x (?, Feature_Num), W1 (Feature_Num, Layer1_NodeNum)
    # A1 = tf.nn.tanh(Z1)
    # A1 = tf.nn.relu(Z1)
    A1 = Z1
    A1_drop = tf.nn.dropout(A1, keep_prob)
    Z2 = tf.matmul(A1_drop, W2) + Bias2
    # A2 = tf.nn.tanh(Z2)
    # A2 = tf.nn.relu(Z2)
    A2 = Z2
    A2_drop = tf.nn.dropout(A2, keep_prob)
    Z3 = tf.matmul(A2_drop, W3) + Bias3
    A3 = Z3
    A3_drop = tf.nn.dropout(A3, keep_prob)
    Z4 = tf.matmul(A3_drop, W4) + Bias4
    A4 = Z4
    A4_drop = tf.nn.dropout(A4, keep_prob)
    Z5 = tf.matmul(A4_drop, W5) + Bias5
    A5 = Z5
    A5_drop = tf.nn.dropout(A5, keep_prob)
    Z6 = tf.matmul(A5_drop, W6) + Bias6
    A6 = Z6
    # A6 = tf.nn.relu(Z6)
    return A6
Arguments: the predicted values and the actual y values, both (batch size, 1) placeholders.
Returns: a scalar cost (mean squared error).
def compute_cost(predict, y):
    cost = tf.reduce_mean(tf.square(y - predict))  # mean squared error
    return cost
Running this model() function produces a model. The arguments learning_rate, num_epochs, and minibatch_size are tunable hyperparameters; they can be tuned by hand, or by a separate function that calls model() with different arguments and keeps the best ones (a sketch of such a search is given after the function).
The function can also plot the cost during training, so the training process can be observed.
def model(X_train, Y_train, X_test, Y_test, X_NeedPredi, learning_rate=0.8,
          num_epochs=10, minibatch_size=10, print_cost=True):
    # Expects X to be 2-D with one sample per row, and Y to be a single column.
    # X_test, Y_test are used to evaluate the trained model.
    # ops.reset_default_graph() # to be able to rerun the model without overwriting tf variables
    tf.reset_default_graph()
    (X_train_rows, X_train_columns) = X_train.shape
    (Y_train_rows, Y_train_columns) = Y_train.shape
    assert(X_train_rows == Y_train_rows)
    (X_test_rows, X_test_columns) = X_test.shape
    (Y_test_rows, Y_test_columns) = Y_test.shape
    assert(X_test_rows == Y_test_rows)
    assert(X_train_columns == X_test_columns)
    (X_NeedPredi_rows, X_NeedPredi_columns) = X_NeedPredi.shape
    assert(X_train_columns == X_NeedPredi_columns)
    #########################################################
    mini_batch_num = int(X_train_rows / minibatch_size)
    #########################################################
    Feature_Num = X_train_columns
    x, y, keep_prob, keep_learnRate = create_placeholders(Feature_Num)
    # parameters = initialize_parameters(Feature_Num,200,100,80,50,25,1)
    parameters = initialize_parameters(Feature_Num, 50, 50, 20, 10, 5, 1)
    A6 = forward_propagation(x, parameters, keep_prob)
    predict = A6
    cost = compute_cost(predict, y)
    # Build the optimizer on the keep_learnRate placeholder so that the decayed
    # learning rate fed in below actually takes effect.
    optimizer = tf.train.AdamOptimizer(keep_learnRate).minimize(cost)
    #########################################################
    cost_validations = []
    cost_stepValues = []
    x_range = []
    ########################################################
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)  # initialize the variables created by the functions above
        # print("W1 = " + str(parameters["W1"].eval()))
        for epoch in range(num_epochs):
            New_learnRate = learning_rate * (0.99 ** epoch)  # exponential decay per epoch
            for i in range(mini_batch_num):
                batch_xs = X_train[i*minibatch_size : (i+1)*minibatch_size, 0:Feature_Num]
                batch_ys = Y_train[i*minibatch_size : (i+1)*minibatch_size]
                if print_cost and (i % 200) == 0:
                    validation_cost = cost.eval(feed_dict={x: X_test, y: Y_test, keep_prob: 1.0})
                    cost_current = cost.eval(feed_dict={x: batch_xs, y: batch_ys, keep_prob: 1.0})
                    cost_validations.append(validation_cost)
                    cost_stepValues.append(cost_current)
                    x_range.append(i + epoch*mini_batch_num)
                    # print("Iter= " + str(i+epoch*mini_batch_num) + ", current training batch's cost= " + str(cost_current) + ", cost_validation= " + str(validation_cost))
                sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys, keep_prob: 0.2, keep_learnRate: New_learnRate})
        # print("W1 = " + str(parameters["W1"].eval()))
        # print("Bias1 = " + str(parameters["Bias1"].eval()))
        if print_cost:
            plt.plot(x_range, cost_validations, '-b', label='validation set')
            plt.plot(x_range, cost_stepValues, '-r', label='step cost')
            plt.legend(loc='lower right', frameon=False)
            plt.ylim(ymax=0.6, ymin=0)
            plt.ylabel('cost')
            plt.xlabel('step')
            plt.show()
        ##### Use the trained model to predict on X_test, then compare with Y_test and compute the cost for evaluation. #####
        predict_Y_test = np.zeros(X_test_rows).reshape(X_test_rows, 1)
        minibatch_size = 10
        for i in range(int(X_test_rows / minibatch_size)):  # predict batch by batch; rows beyond the last full batch stay zero
            predict_Y_test[i*minibatch_size : (i+1)*minibatch_size] = predict.eval(
                feed_dict={x: X_test[i*minibatch_size : (i+1)*minibatch_size, 0:Feature_Num], keep_prob: 1.0})
        cost_predict_Y_test = cost.eval(feed_dict={x: X_test, y: Y_test, keep_prob: 1.0})
        # print("learning_rate=" + str(learning_rate) + " minibatch_size=" + str(minibatch_size) + " cost_predict_Y_test=")
        print(cost_predict_Y_test)
        # print(predict_Y_test)
        ##### Use the model to predict on the unlabeled data. #####
        # predict_Y = np.zeros(X_NeedPredi_rows).reshape(X_NeedPredi_rows, 1)
        # for i in range(int(X_NeedPredi_rows / minibatch_size)):
        #     predict_Y[i*minibatch_size : (i+1)*minibatch_size] = predict.eval(
        #         feed_dict={x: X_NeedPredi[i*minibatch_size : (i+1)*minibatch_size, 0:Feature_Num], keep_prob: 1.0})
        # print('predict_Y({0})'.format(len(predict_Y)))
        # print("predicted labels for the unlabeled data: predict_Y=")
        # print(predict_Y)
        return cost_predict_Y_test
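As mentioned before the function, a separate routine can call model() with different arguments to search for better hyperparameters. A minimal grid-search sketch, assuming the standardized train_data_part1/train_data_part2 arrays from above and matching label arrays Y_part1/Y_part2 of shape (rows, 1); the label names are introduced here for illustration, since this section does not show how the Y column is extracted:

# Try a few (learning_rate, minibatch_size) combinations and keep the best
# validation cost returned by model().
best_cost, best_params = float('inf'), None
for lr in [0.8, 0.1, 0.01]:
    for bs in [10, 20]:
        c = model(train_data_part1, Y_part1, train_data_part2, Y_part2,
                  train_data_part2,  # any array with the right column count works for X_NeedPredi here
                  learning_rate=lr, num_epochs=10, minibatch_size=bs, print_cost=False)
        if c < best_cost:
            best_cost, best_params = c, (lr, bs)
print("best validation cost:", best_cost, "with (learning_rate, minibatch_size) =", best_params)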