This tutorial shows how to do time series forecasting in Python with a neural network (the model below is built on Keras, with scikit-learn's StandardScaler for preprocessing).
Unlike traditional machine-learning models, no hand-crafted features are needed: given only the target over a continuous time range, you can predict the target for future time steps.
This is known as the time series forecasting problem. The classic tools are ARIMA and SPSS, but in my experience ARIMA demands a lot of the developer and often forecasts poorly,
and SPSS is not suited to batch prediction. The approach here asks little of the developer, and its accuracy is acceptable.
The task here is to predict the next 6 weeks of sales for 2,000 shops. The training data is the daily flow from 2015-07-01 to 2016-10-30 (Tianchi IJCAI competition).
Data download: https://pan.baidu.com/s/1miz8CrA
GitHub reference: https://github.com/wangtuntun/IJCAI_nnet
The work splits into several parts:
1 Use Spark to count each shop's daily purchases (the data is large, and I don't know how to do the group operation with Python DataFrames)
2 Clean the data so that every shop has a flow value for every day (time series methods require a complete daily series)
3 Build the neural network model (the core windowing idea is sketched right after this list)
4 Call the model to make predictions
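The core idea, implemented in step 3's fit(), is to slide a window of lag past values across the series: each window becomes one input row, and the value right after it becomes the target. A toy sketch with made-up numbers:

# With lag = 3, the series [5, 6, 7, 8, 9] yields two training pairs
series = [5, 6, 7, 8, 9]
lag = 3
X = [series[i:i + lag] for i in range(len(series) - lag)]  # [[5, 6, 7], [6, 7, 8]]
y = series[lag:]                                           # [8, 9]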
Step 1: use Spark to count each shop's daily purchases (the data is large, and I don't know how to do the group operation with Python DataFrames).
/**
 * Created by wangtuntun on 17-3-4.
 * Computes the daily flow for each shop: (shop_id, DS, flow)
 */
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object clean {
  def main(args: Array[String]) {
    // Set up the environment
    val conf = new SparkConf().setAppName("tianchi").setMaster("local")
    val sc = new SparkContext(conf)
    val sqc = new SQLContext(sc)
    val user_pay_raw = sc.textFile("/home/wangtuntun/IJCAI/Data/user_pay.txt")
    val user_pay_split = user_pay_raw.map(_.split(","))
    val user_transform = user_pay_split.map { x => // transform each record
      val userid = x(0)
      val shop_id = x(1)
      val ts = x(2)
      val ts_split = ts.split(" ")
      val year_month_day = ts_split(0).split("-")
      val year = year_month_day(0)
      val month = year_month_day(1)
      val day = year_month_day(2)
      // (shop_id, userid, year, month, day)
      (shop_id, userid, ts_split(0))
    }
    val df = sqc.createDataFrame(user_transform)            // build a DataFrame
    val df_name_colums = df.toDF("shop_id", "userid", "DS") // name each column
    df_name_colums.registerTempTable("user_pay_table")      // register a temp table
    val sql = "select shop_id, count(userid), DS from user_pay_table group by shop_id, DS order by shop_id desc, DS"
    val rs = sqc.sql(sql)
    rs.foreach(x => println(x))
    // user_transform.saveAsTextFile("/home/wangtuntun/test_file4.txt")
    val rs_rdd = rs.map(x => x(0) + "," + x(2).toString + "," + x(1)) // convert rs back to an RDD of CSV lines
    rs_rdd.coalesce(1, true).saveAsTextFile("/home/wangtuntun/ds_flow_raw_data.txt")
    sc.stop()
  }
}
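For reference, the same per-shop daily count can be done in a Python DataFrame as well. A minimal pandas sketch, assuming user_pay.txt has the columns user_id,shop_id,timestamp used by the Spark job above:

import pandas as pd

user_pay = pd.read_csv("/home/wangtuntun/IJCAI/Data/user_pay.txt",
                       names=["user_id", "shop_id", "ts"])
user_pay["DS"] = user_pay["ts"].str.split(" ").str[0]  # keep only the date part
flow = (user_pay.groupby(["shop_id", "DS"])["user_id"]
        .count()
        .reset_index(name="flow"))
flow.to_csv("ds_flow_raw_data.csv", index=False, header=False)

The Spark route stays, though, since the raw file is too large to handle comfortably on one machine.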
Step 2: clean the data so that every shop has a flow value for every day (time series methods require a complete daily series).
# encoding=utf-8
'''
The raw id_date_flow data was produced by the dataframe group-by in step 1, so there
is no guarantee that every shop has a record for every day. This script cleans and
fills the gaps so that each shop has a flow value for all 488 days.
'''
from datetime import timedelta
import datetime
import random

start_time_str = "2015-07-01"
end_time_str = "2016-10-30"
start_time = datetime.datetime.strptime(start_time_str, '%Y-%m-%d')
end_time = datetime.datetime.strptime(end_time_str, '%Y-%m-%d')

# Initialize every (shop, day) pair to 0; 2015-07-01 through 2016-10-30 is 488 days
# (the original used range(0, 487), which missed the last day)
flow_dict = {}
for shop_id in range(1, 2001):
    for day in range(0, 488):
        next_day = start_time + timedelta(days=day)
        flow_dict[(shop_id, next_day)] = 0

# Load the raw per-day counts produced by the Spark job
f = open("/home/wangtuntun/IJCAI/Data/id_date_flow", "r+")
raw_data = f.readlines()
f.close()
for ele in raw_data:
    ele = ele.split(",")
    shop_id = int(ele[0])
    date_str = ele[1]
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    flow = int(ele[2])
    flow_dict[(shop_id, date)] = flow

# First cleaning pass: fill a zero day with the same shop's flow from
# 30/21/14/7 days ahead (or, near the end of the series, from the past)
for ele in flow_dict:
    shop_id = ele[0]
    date = ele[1]
    date1 = date + timedelta(days=30)
    date2 = date + timedelta(days=21)
    date3 = date + timedelta(days=14)
    date4 = date + timedelta(days=7)
    date5 = date - timedelta(days=30)
    date6 = date - timedelta(days=21)
    date7 = date - timedelta(days=14)
    date8 = date - timedelta(days=7)
    if flow_dict[ele] == 0:
        if end_time - date > timedelta(days=30):  # more than 30 days before the end date
            if flow_dict[(shop_id, date1)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date1)]
                continue
        elif end_time - date > timedelta(days=21):
            if flow_dict[(shop_id, date2)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date2)]
                continue
        elif end_time - date > timedelta(days=14):
            if flow_dict[(shop_id, date3)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date3)]
                continue
        elif end_time - date > timedelta(days=7):
            if flow_dict[(shop_id, date4)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date4)]
                continue
        else:  # within the last week of the series: fall back to past values
            if flow_dict[(shop_id, date5)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date5)]
            elif flow_dict[(shop_id, date6)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date6)]
            elif flow_dict[(shop_id, date7)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date7)]
            elif flow_dict[(shop_id, date8)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date8)]

# Second cleaning pass: fill the remaining zeros from adjacent days; if the
# surrounding three days are also zero, borrow from a neighboring shop
for ele in flow_dict:
    shop_id = ele[0]
    date = ele[1]
    date1 = date + timedelta(days=1)
    date2 = date + timedelta(days=2)
    date3 = date + timedelta(days=3)
    date4 = date - timedelta(days=1)
    date5 = date - timedelta(days=2)
    date6 = date - timedelta(days=3)
    if flow_dict[ele] == 0:
        if end_time - date > timedelta(days=3):  # not within the last three days
            if flow_dict[(shop_id, date1)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date1)]
                continue
            elif flow_dict[(shop_id, date2)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date2)]
                continue
            elif flow_dict[(shop_id, date3)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date3)]
                continue
        else:  # within the last three days: look backwards instead
            if flow_dict[(shop_id, date4)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date4)]
                continue
            elif flow_dict[(shop_id, date5)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date5)]
                continue
            elif flow_dict[(shop_id, date6)] != 0:
                flow_dict[ele] = flow_dict[(shop_id, date6)]
                continue
        # Still zero: borrow from the neighboring shop. .get() is used because
        # date1..date3 can fall past end_time for the last few days of the series.
        if shop_id != 2000:  # shop_id + 1 would be out of range for shop 2000
            if flow_dict.get((shop_id + 1, date1), 0) != 0:
                flow_dict[ele] = flow_dict[(shop_id + 1, date1)]
                continue
            elif flow_dict.get((shop_id + 1, date2), 0) != 0:
                flow_dict[ele] = flow_dict[(shop_id + 1, date2)]
                continue
            elif flow_dict.get((shop_id + 1, date3), 0) != 0:
                flow_dict[ele] = flow_dict[(shop_id + 1, date3)]
                continue
        else:
            if flow_dict.get((shop_id - 1, date1), 0) != 0:
                flow_dict[ele] = flow_dict[(shop_id - 1, date1)]
                continue
            elif flow_dict.get((shop_id - 1, date2), 0) != 0:
                flow_dict[ele] = flow_dict[(shop_id - 1, date2)]
                continue
            elif flow_dict.get((shop_id - 1, date3), 0) != 0:
                flow_dict[ele] = flow_dict[(shop_id - 1, date3)]
                continue

# Third cleaning pass: anything still zero gets a small random flow
for ele in flow_dict:
    if flow_dict[ele] == 0:
        flow_dict[ele] = random.randint(5, 50)

# Write the cleaned results to a file as shop_id,date,flow
f_write = open("/home/wangtuntun/IJCAI/Data/id_date_flow_cleaned", "w+")
for ele in flow_dict:
    shop_id = str(ele[0])
    date_str = str(ele[1].date())
    flow = str(flow_dict[ele])
    f_write.write(shop_id + "," + date_str + "," + flow + "\n")
f_write.close()
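A quick sanity check, not part of the original script, confirms the cleaning left no gaps:

# Every (shop, day) pair should now hold a positive flow
assert len(flow_dict) == 2000 * 488
assert all(v > 0 for v in flow_dict.values())

Step 3: build the neural network model. The TimeSeriesNnet class below is the nnet_ts module imported by the prediction script in step 4; it is built on Keras.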
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

class TimeSeriesNnet(object):
    def __init__(self, hidden_layers = [20, 15, 5], activation_functions = ['relu', 'relu', 'relu'],
                 optimizer = SGD(), loss = 'mean_absolute_error'):
        self.hidden_layers = hidden_layers
        self.activation_functions = activation_functions
        self.optimizer = optimizer
        self.loss = loss
        if len(self.hidden_layers) != len(self.activation_functions):
            raise Exception("hidden_layers size must match activation_functions size")

    def fit(self, timeseries, lag = 7, epochs = 10000, verbose = 0):
        self.timeseries = np.array(timeseries, dtype = "float64")
        self.lag = lag
        self.n = len(timeseries)
        if self.lag >= self.n:
            raise ValueError("Lag is higher than length of the timeseries")
        self.X = np.zeros((self.n - self.lag, self.lag), dtype = "float64")
        # Log-transform the target for variance stationarity
        self.y = np.log(self.timeseries[self.lag:])
        self.epochs = epochs
        self.scaler = StandardScaler()
        self.verbose = verbose
        logging.info("Building regressor matrix")
        # Build the X matrix: each row is a sliding window of `lag` past values
        for i in range(0, self.n - lag):
            self.X[i, :] = self.timeseries[range(i, i + lag)]
        logging.info("Scaling data")
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)
        logging.info("Checking network consistency")
        # Neural net architecture
        self.nn = Sequential()
        self.nn.add(Dense(self.hidden_layers[0], input_shape = (self.X.shape[1],)))
        self.nn.add(Activation(self.activation_functions[0]))
        for layer_size, activation_function in zip(self.hidden_layers[1:], self.activation_functions[1:]):
            self.nn.add(Dense(layer_size))
            self.nn.add(Activation(activation_function))
        # Add the final output node
        self.nn.add(Dense(1))
        self.nn.add(Activation('linear'))
        self.nn.compile(loss = self.loss, optimizer = self.optimizer)
        logging.info("Training neural net")
        # Train the neural net (nb_epoch is the Keras 1 argument; on Keras 2 use epochs=)
        self.nn.fit(self.X, self.y, nb_epoch = self.epochs, verbose = self.verbose)

    def predict_ahead(self, n_ahead = 1):
        # Predict iteratively: each prediction is appended to the series and
        # becomes part of the input window for the next step
        self.predictions = np.zeros(n_ahead)
        for i in range(n_ahead):
            self.current_x = self.timeseries[-self.lag:]
            self.current_x = self.current_x.reshape((1, self.lag))
            self.current_x = self.scaler.transform(self.current_x)
            self.next_pred = self.nn.predict(self.current_x)
            # Undo the log transform applied to y during fit
            self.predictions[i] = np.exp(self.next_pred[0, 0])
            self.timeseries = np.concatenate((self.timeseries, np.exp(self.next_pred[0, :])), axis = 0)
        return self.predictions
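Before wiring this into the shop data, a quick smoke test on a synthetic series helps confirm the class works end to end. This snippet is my own check, not part of the original code; the 120-point sine series and the small epoch count are arbitrary:

import numpy as np
ts = 100 + 10 * np.sin(np.arange(120) / 7.0)  # strictly positive, so np.log() in fit() is safe
net = TimeSeriesNnet(hidden_layers=[20, 15, 5],
                     activation_functions=['sigmoid', 'sigmoid', 'sigmoid'])
net.fit(ts, lag=40, epochs=100)
print(net.predict_ahead(n_ahead=7))

Step 4: call the model to make predictions.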
# encoding=utf-8
from nnet_ts import *
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta

start_time_str = "2015-07-01"
end_time_str = "2016-10-30"
start_time = datetime.datetime.strptime(start_time_str, '%Y-%m-%d')
end_time = datetime.datetime.strptime(end_time_str, '%Y-%m-%d')

# Clean the data
def clean_data(data_path):
    # Already done in clean_data.py; results are stored in id_date_flow_cleaned
    pass

# Given a shop_id, return the flows for all days (the series is continuous and complete)
def get_shop_flows(shop_id, flow_dict):
    return_list = []
    for day in range(0, 488):  # the training set spans 488 days
        next_day = start_time + timedelta(days=day)
        return_list.append(flow_dict[(shop_id, next_day)])
    return return_list

# Convert a list of floats to a comma-separated string of ints
def float_list2str(float_list):
    list_int = []
    for ele in float_list:
        ele = str(int(ele))
        list_int.append(ele)
    return ",".join(list_int)

# Build the model and predict the flow for the next 42 days
def predict_flow(time_series):
    # Number of neurons per hidden layer, and the activation function of each
    neural_net = TimeSeriesNnet(hidden_layers=[20, 15, 5], activation_functions=['sigmoid', 'sigmoid', 'sigmoid'])
    neural_net.fit(time_series, lag=40, epochs=10000)  # epochs is the number of training iterations
    predictions = neural_net.predict_ahead(n_ahead=42)  # predict the next 42 days; returns an ndarray
    # Note: this is the loss function's name, not an error value; a custom error
    # metric has to be defined separately (see the sketch at the end)
    pre_loss = neural_net.loss
    return neural_net, predictions, pre_loss

# Plot the results
def show_predicted_flow(neural_net, time_series):
    # neural_net.timeseries is the original series plus the values appended by predict_ahead
    plt.plot(range(len(neural_net.timeseries)), neural_net.timeseries, '-r', label='Predictions', linewidth=1)
    plt.plot(range(len(time_series)), time_series, '-g', label='Original series')
    plt.title("Shop flow: original series and predictions")
    plt.xlabel("Observation ordered index")
    plt.ylabel("Daily flow")
    plt.legend()
    plt.show()

if __name__ == "__main__":
    # time_series = list(np.array(pd.read_csv("AirPassengers.csv")["x"]))  # array or list both work
    # neural_net, predictions, pre_loss = predict_flow(time_series)
    # show_predicted_flow(neural_net, time_series)
    cleaned_data_path = "/home/wangtuntun/IJCAI/Data/id_date_flow_cleaned"
    f_open = open(cleaned_data_path, "r+")
    raw_data = f_open.readlines()
    f_open.close()
    flow_dict = {}
    for ele in raw_data:
        ele = ele.split(",")
        shop_id = int(ele[0])
        time_str = ele[1]
        date = datetime.datetime.strptime(time_str, '%Y-%m-%d')
        flow = int(ele[2])
        flow_dict[(shop_id, date)] = flow
    result_path = "/home/wangtuntun/IJCAI/Data/nnets_predicted_flow"
    f_write = open(result_path, "w+")
    for i in range(1, 2001):
        shop_flow_list = get_shop_flows(i, flow_dict)
        time_series = shop_flow_list
        neural_net, predictions, pre_loss = predict_flow(time_series)
        predictions_str = float_list2str(predictions)
        f_write.write(str(i))
        f_write.write(",")
        f_write.write(predictions_str)
        f_write.write("\n")
    f_write.close()
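As noted in predict_flow, neural_net.loss only reports the loss function's name. To actually measure forecast error, hold out the last 42 days and score the forecast against them. A minimal sketch, using MAPE as an assumed metric (the competition defines its own score):

import numpy as np

def mape(actual, predicted):
    # Mean absolute percentage error over the holdout window
    actual = np.asarray(actual, dtype="float64")
    predicted = np.asarray(predicted, dtype="float64")
    return np.mean(np.abs((actual - predicted) / actual)) * 100

# Example for one shop: train on all but the last 42 days, score the tail
# history, holdout = time_series[:-42], time_series[-42:]
# net, preds, _ = predict_flow(history)
# print(mape(holdout, preds))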