#!/usr/bin/env python
import pandas as pd, numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
# Load the training set and the labeled test set.
# The CSVs use Chinese column headers; the training file is GBK-encoded.
df_train = pd.read_csv('train_data_regression.csv', encoding='gbk')
# df_test = pd.read_csv('test_data.csv')
df_test = pd.read_csv('test_result.csv')
def clean_data(df):
    """Normalize missing-value markers, cast features to float, and impute.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; may contain the placeholder strings 'NIL', '/0' and
        ' n u l l null null' standing in for missing values.

    Returns
    -------
    pd.DataFrame
        Frame with all non-key columns cast to float and NaNs filled
        with each column's mean.
    """
    # Map every known "missing" placeholder to NaN first so the casts below succeed.
    df = df.replace('NIL', np.nan)
    df = df.replace('/0', np.nan)
    df = df.replace(' n u l l null null', np.nan)
    # Cast everything except the date / cell-name / label key columns.
    for c in [c for c in df.columns if c not in ['SDATE', '小区名称', 'WeakCoverage']]:
        # np.float was removed in NumPy 1.24 -- use the builtin float.
        df[c] = df[c].astype(float)
    # display() only exists inside IPython/Jupyter; print works everywhere.
    print(df.isnull().sum().to_frame().T)
    # Impute with the per-column mean; numeric_only avoids a TypeError on
    # the remaining string columns in modern pandas.
    df = df.fillna(df.mean(numeric_only=True))
    return df
df_train = clean_data(df_train)
df_test = clean_data(df_test)

# Feature columns: everything except the key columns and the regression target.
X_columns = [c for c in df_train.columns
             if c not in ['SDATE', '小区名称', 'WeakCoverage', 'RRC连接态最大用户数(个数)']]
y_column = 'RRC连接态最大用户数(个数)'

# NOTE(review): x and y are never used again -- leftovers from notebook
# exploration cells; kept for reference.
x = df_train[['小区名称']].values   # (n, 1) column vector
y = df_train['小区下行业务量(GB)'].values   # (n,) vector

from sklearn.preprocessing import StandardScaler

# Standardize features and target. Both scalers are fit on the training
# set only and reused on the test set, so there is no leakage.
ss_X = StandardScaler()
ss_y = StandardScaler()
train_X_ss = ss_X.fit_transform(df_train[X_columns])
test_X_ss = ss_X.transform(df_test[X_columns])
# StandardScaler needs 2-D input; flatten back to 1-D targets for fit().
train_y_ss = ss_y.fit_transform(df_train[[y_column]]).flatten()
test_y_ss = ss_y.transform(df_test[[y_column]]).flatten()
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
import matplotlib.pyplot as plt

# tf.version / keras.version are modules, not strings; the version
# string lives in __version__.
print("TensorFlow:{}\tKeras:{}".format(tf.__version__, keras.__version__))
def show_train_history(train_history, train, validation):
    """Plot a training metric against its validation counterpart per epoch.

    Parameters
    ----------
    train_history : keras History object returned by model.fit
    train : str
        Key of the training metric in train_history.history (also y-label).
    validation : str
        Key of the validation metric in train_history.history.
    """
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
def get_sequental(layer_units, input_dim, output_dim, drop_out=0, drop_out_layer=1):
    """Build a 3-hidden-layer MLP for regression.

    Parameters
    ----------
    layer_units : int -- width of each hidden layer
    input_dim : int -- number of input features
    output_dim : int -- number of outputs (1 for scalar regression)
    drop_out : float -- dropout rate; 0 disables dropout entirely
    drop_out_layer : int -- which hidden layer (1-3) is followed by dropout

    Returns
    -------
    Uncompiled keras Sequential model. The output layer is linear
    (no activation), as appropriate for regression.
    """
    model = Sequential()
    model.add(Dense(units=layer_units, activation='relu', input_dim=input_dim))  # hidden 1
    # BUG fix: the original added Dropout after layer 1 whenever drop_out > 0
    # regardless of drop_out_layer, and appended no-op Dropout(0) layers
    # after layers 2/3 even with dropout disabled. Honor drop_out_layer and
    # only insert dropout when a positive rate is given.
    if drop_out > 0 and drop_out_layer == 1:
        model.add(Dropout(drop_out))
    model.add(Dense(units=layer_units, activation='relu'))  # hidden 2
    if drop_out > 0 and drop_out_layer == 2:
        model.add(Dropout(drop_out))
    model.add(Dense(units=layer_units, activation='relu'))  # hidden 3
    if drop_out > 0 and drop_out_layer == 3:
        model.add(Dropout(drop_out))
    model.add(Dense(units=output_dim))  # linear output layer
    return model
# Two hidden units per feature; dropout 0.5 after the second hidden layer.
model = get_sequental(len(X_columns) * 2, len(X_columns), 1,
                      drop_out=0.5, drop_out_layer=2)
model.summary()

# Regression task: MSE loss, track MAE ("accuracy" is meaningless here).
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
train_history = model.fit(train_X_ss, train_y_ss,
                          validation_split=0.2, epochs=250, batch_size=100)
show_train_history(train_history, 'loss', 'val_loss')
show_train_history(train_history, 'mae', 'val_mae')

score = model.evaluate(test_X_ss, test_y_ss, verbose=1)
print("\nTest score:", score[0])
# BUG fix: score[1] is the MAE metric (see metrics above), not accuracy.
print('Test MAE:', score[1])

# Predictions are in standardized space; invert the target scaling before
# computing RMSE in the original units.
y_pred_ss = model.predict(test_X_ss)
y_pred = ss_y.inverse_transform(y_pred_ss)
rmse = sqrt(mean_squared_error(df_test[y_column], y_pred))
print('Test RMSE: %.3f' % rmse)