# 基于随机森林挖掘趵突泉景区气象对客流量影响


# coding: utf-8

# # 趵突泉景区客流量预测,模型使用随机森林
# 影响景区客流量的因素有很多,本例主要研究天气及节假日对景区客流量的影响
# 数据下载地址:https://download.csdn.net/download/chaochaopang0/10576437


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#get_ipython().magic('matplotlib inline')

### 导入数据集

# Load the CSV, discard the "景区名称" column and index the rows by the "日期" column.
df = (
    pd.read_csv("baotuquan.csv", encoding="utf-8")
    .drop("景区名称", axis=1)
    .set_index("日期")
)


# ### 处理风速和降水量
# 由于风速字段是类别变量,我们需要将其转为float
# 
# 降水量数据较局限于某个时间段,因此我们将当日阴晴雨雪情况作为当日平均降水量情况


# Numeric value assigned to each known wind-force label.
_WIND_SPEED_BY_LABEL = {
    "微": 0.1,
    "1-2级": 1.8,
    "2-3级": 3.5,
    "3-4级": 5.6,
    "4-5级": 8.1,
    "5-6级": 10.9,
    "6-7级": 13.6,
    "7-8级": 16.8,
}


def map_wind_power(power):
    """Convert a wind-force label into a number.

    Args:
        power: Wind-force category label, e.g. ``"3-4级"``.

    Returns:
        The number associated with the label, or ``0`` when the label is not
        one of the known categories (this includes missing values).
    """
    return _WIND_SPEED_BY_LABEL.get(power, 0)

# Derive the numeric wind_speed column from the categorical WIND_POWER labels.
wind_labels = df["WIND_POWER"]
df["wind_speed"] = wind_labels.map(map_wind_power)


# Precipitation stand-in assigned to each known weather label.
_RAINFALL_BY_WEATHER = {
    "晴": 0.0,
    "多云": 0.0,
    "阴": 2,
    "小雨": 5,
    "中雨": 18,
    "大雨": 75,
    "小雪": 5,
    "中雪": 20,
    "大雪": 50,
}


def map_weather(weather):
    """Convert a weather label into a number used as the day's rainfall.

    Args:
        weather: Weather category label, e.g. ``"小雨"``.

    Returns:
        The number associated with the label, or ``0`` when the label is not
        one of the known categories (this includes missing values).
    """
    return _RAINFALL_BY_WEATHER.get(weather, 0)
# Derive the numeric rainfall column from the categorical WEATHER labels.
weather_labels = df["WEATHER"]
df["rainfall"] = weather_labels.map(map_weather)


# ### 数据集描述
# 展现数据集字段及分布情况


# Summary statistics of the data set (bare expression: only displayed in an
# interactive session, the result is discarded when run as a script).
df.describe()


# ### Feature columns used by the model

feature_cols = [
    "HIGHEST_TEMPERATURE",
    "LOWEST_TEMPERATURE",
    "AQI指数",
    "rainfall",
    "wind_speed",
]


# ### 针对节假日情况分别对客流量字段进行标准化
# 表中节假日字段为2表示当日为长假中期或景区举办大型活动,1代表当日为节假日首末日或周末或举办小型活动,0代表当日为普通工作日
# 
# 需对这三种情况分别统计并加以分析,由于节假日会对客流量造成非常明显的影响,我们需要尽量消除这一影响



# Split the rows by holiday level (2 / 1 / 0), keeping only the feature columns
# plus the visitor count. ``.copy()`` makes every subset an independent frame,
# so the assignments below never write into a slice of ``df``.
_group_cols = feature_cols + ["客流量"]
df_2 = df.loc[df["节假日"] == 2, _group_cols].copy()
df_1 = df.loc[df["节假日"] == 1, _group_cols].copy()
df_0 = df.loc[df["节假日"] == 0, _group_cols].copy()

# Z-score the visitor count inside each holiday group, using that group's own
# mean and standard deviation, to take the holiday effect out of the target.
df_2["客流量"] = (df_2["客流量"] - df_2["客流量"].mean()) / df_2["客流量"].std()
df_1["客流量"] = (df_1["客流量"] - df_1["客流量"].mean()) / df_1["客流量"].std()
df_0["客流量"] = (df_0["客流量"] - df_0["客流量"].mean()) / df_0["客流量"].std()

print(df_2.describe())

# Bare expressions: only displayed in an interactive session.
df_1.describe()

df_0.describe()

# Recombine the three groups and reorder the rows to follow ``df.index``.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df_eli = pd.concat([df_0, df_1, df_2]).reindex(df.index)

df_eli.describe()


# ### 特征列标准化


# Work on a copy so df_eli keeps the unscaled feature values.
df_train = df_eli.copy()

# Z-score every feature column with its own mean and standard deviation.
raw_features = df_train[feature_cols]
feature_mean = raw_features.mean()
feature_std = raw_features.std()
df_train[feature_cols] = (raw_features - feature_mean) / feature_std


# ### 切分训练集测试集


# Feature matrix and target vector as NumPy arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
x = df_train[feature_cols].to_numpy()
y = df_train["客流量"].to_numpy()
from sklearn.model_selection import train_test_split
# Split with the library defaults. NOTE: no random_state is passed, so the
# partition (and every metric derived from it) changes between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y)


# ### 训练模型

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Random forest regressor: 300 trees, depth capped at 12, 'sqrt' feature
# sub-sampling, n_jobs=-1 for parallel fitting.
rfr = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    max_features='sqrt',
    n_jobs=-1,
)
# Fit on the training split; the value returned by fit() is kept as ``model``.
model = rfr.fit(x_train, y_train)
print(model)


# ### 模型评估
# Per-feature importances of the fitted forest (bare expression: only
# displayed in an interactive session).
model.feature_importances_

# Score on the held-out split only. Scoring rows the forest was fitted on
# would report optimistic numbers that say little about generalization.
y_test_pred = model.predict(x_test)
r2 = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae = mean_absolute_error(y_test, y_test_pred)

print("r2: %s" % r2)
print("rmse: %s" % rmse)
print("mae: %s" % mae)

# Predictions for every row (training and test alike), kept under the name
# ``y_pred`` for the code that follows the evaluation.
y_pred = model.predict(x)

# ### 预测
# 在预测时需要根据节假日情况适当scale


# Holiday level of every row, as a NumPy array.
holiday = df["节假日"].values

# Mean (mu*) and standard deviation (lambda*) of the raw visitor count for
# each holiday level.
mu0 = df.loc[df["节假日"] == 0, "客流量"].mean()
mu1 = df.loc[df["节假日"] == 1, "客流量"].mean()
mu2 = df.loc[df["节假日"] == 2, "客流量"].mean()
lambda0 = df.loc[df["节假日"] == 0, "客流量"].std()
lambda1 = df.loc[df["节假日"] == 1, "客流量"].std()
lambda2 = df.loc[df["节假日"] == 2, "客流量"].std()

# Undo the per-group z-scoring in place: prediction * std + mean, using the
# statistics of the row's own holiday level. Rows whose level is not 0, 1 or 2
# are left untouched.
_group_stats = (
    (0, mu0, lambda0),
    (1, mu1, lambda1),
    (2, mu2, lambda2),
)
for level, center, spread in _group_stats:
    in_group = holiday == level
    y_pred[in_group] = y_pred[in_group] * spread + center
df['predict'] = y_pred


# Draw the observed visitor counts and the rescaled predictions against the
# row position, one line each, on a wide figure.
row_positions = range(len(df))
observed = df["客流量"].values

plt.figure(figsize=(20, 4))
plt.plot(row_positions, observed)
plt.plot(row_positions, y_pred, color="red")
plt.legend(["True Ground", "Prediction"])


# 随着数据的丰富,模型准确率可以进一步提升





# 你可能感兴趣的:(机器学习)