本文使用线性回归和随机森林处理共享单车需求预测问题。数据集中的风速(windspeed)存在大量 0 值,可视为缺失值,我们先用随机森林对这些值进行预测并填充。该数据集与时间序列相关,但我们将时间特征拆分为多个特征,从而把问题转换为回归预测问题。
kaggle数据集介绍:https://www.kaggle.com/competitions/bike-sharing-demand/code?competitionId=3948&sortBy=voteCount
特征:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
import calendar
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)
import matplotlib.pyplot as plt
import seaborn as sn
data = pd.read_csv("train.csv")

# Correlation analysis of the numeric weather columns and the rider counts.
corrMatt = data[["temp", "atemp", "casual", "registered", "humidity", "windspeed", "count"]].corr()

# seaborn hides every cell whose mask entry is True. Use an explicit boolean
# mask of the strict upper triangle (k=1 keeps the diagonal visible) instead
# of reusing the correlation values as truthy flags, which would fail to hide
# a cell whose correlation happens to be exactly 0.
mask = np.triu(np.ones_like(corrMatt, dtype=bool), k=1)

fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sn.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, ax=ax)
plt.show()
# Derive human-readable calendar columns from the raw "datetime" string
# ("YYYY-MM-DD HH:MM:SS") for plotting. "hour" stays a zero-padded string here.
data["date"] = data.datetime.apply(lambda x: x.split()[0])
data["hour"] = data.datetime.apply(lambda x: x.split()[1].split(":")[0])
data["weekday"] = data.date.apply(
    lambda dateString: calendar.day_name[datetime.strptime(dateString, "%Y-%m-%d").weekday()]
)
data["month"] = data.date.apply(
    lambda dateString: calendar.month_name[datetime.strptime(dateString, "%Y-%m-%d").month]
)
data["season"] = data.season.map({1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"})

# Four stacked panels: by month, by hour/season, by hour/weekday, by hour/user type.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4)
fig.set_size_inches(12, 20)
sortOrder = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October",
             "November", "December"]
hueOrder = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

# Panel 1: mean count per month; `order` fixes the bars to calendar order.
monthAggregated = pd.DataFrame(data.groupby("month")["count"].mean()).reset_index()
monthSorted = monthAggregated.sort_values(by="count", ascending=False)
sn.barplot(data=monthSorted, x="month", y="count", ax=ax1, order=sortOrder)
ax1.set(xlabel='Month', ylabel='Average Count', title="Average Count By Month")

# Panel 2: mean count per hour, one line per season.
# The deprecated `join=True` argument is omitted; connected points are the default.
hourAggregated = pd.DataFrame(data.groupby(["hour", "season"], sort=True)["count"].mean()).reset_index()
sn.pointplot(data=hourAggregated, x="hour", y="count", hue="season", ax=ax2)
ax2.set(xlabel='Hour Of The Day', ylabel='Users Count', title="Average Users Count By Hour Of The Day Across Season",
        label='big')

# Panel 3: mean count per hour, one line per weekday.
hourAggregated = pd.DataFrame(data.groupby(["hour", "weekday"], sort=True)["count"].mean()).reset_index()
sn.pointplot(data=hourAggregated, x="hour", y="count", hue="weekday", hue_order=hueOrder, ax=ax3)
ax3.set(xlabel='Hour Of The Day', ylabel='Users Count', title="Average Users Count By Hour Of The Day Across Weekdays",
        label='big')

# Panel 4: melt casual/registered into long form so user type can be the hue.
hourTransformed = pd.melt(data[["hour", "casual", "registered"]], id_vars=['hour'], value_vars=['casual', 'registered'])
hourAggregated = pd.DataFrame(hourTransformed.groupby(["hour", "variable"], sort=True)["value"].mean()).reset_index()
sn.pointplot(data=hourAggregated, x="hour", y="value", hue="variable", hue_order=["casual", "registered"], ax=ax4)
ax4.set(xlabel='Hour Of The Day', ylabel='Users Count', title="Average Users Count By Hour Of The Day Across User Type",
        label='big')
plt.show()
# Re-read the training CSV; this replaces any earlier `data` frame.
data = pd.read_csv("train.csv")

# Split the date/time string into separate features.
# Each timestamp is split once into [date, time] and both pieces are reused.
stampParts = data["datetime"].map(str.split)
data["date"] = stampParts.map(lambda parts: parts[0])
data["hour"] = stampParts.map(lambda parts: parts[1].split(":")[0]).astype("int")
data["year"] = data["date"].map(lambda day: day.split("-")[0])
data["weekday"] = data["date"].map(lambda day: datetime.strptime(day, "%Y-%m-%d").weekday())
data["month"] = data["date"].map(lambda day: datetime.strptime(day, "%Y-%m-%d").month)

# Report how many rows have a zero / non-zero wind speed reading.
zeroWind = data["windspeed"] == 0
print(int(zeroWind.sum()))
print(int((~zeroWind).sum()))
# Step 2: fill in data. Many rows have windspeed == 0; train a random forest
# on the rows with a non-zero reading and use its predictions to replace the zeros.
from sklearn.ensemble import RandomForestRegressor

# .copy() so the predicted values are written to an independent frame rather
# than to a filtered slice of `data`.
dataWind0 = data[data["windspeed"] == 0].copy()
dataWindNot0 = data[data["windspeed"] != 0]
rfModel_wind = RandomForestRegressor()
windColumns = ["season", "weather", "humidity", "month", "temp", "year", "atemp"]
rfModel_wind.fit(dataWindNot0[windColumns], dataWindNot0["windspeed"])

# predict() rejects an empty input, so only impute when there are zero rows.
if len(dataWind0) > 0:
    wind0Values = rfModel_wind.predict(X=dataWind0[windColumns])
    dataWind0["windspeed"] = wind0Values

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two parts
# in the same order (non-zero rows first, imputed rows after).
data = pd.concat([dataWindNot0, dataWind0])
# drop=True discards the old index instead of adding it back as an "index" column.
data.reset_index(drop=True, inplace=True)

# Re-count zero / non-zero wind speed rows after imputation.
print(len(data[data["windspeed"] == 0]))
print(len(data[data["windspeed"] != 0]))
原来0值的数量:1313
现在0值的数量:0
# Columns holding discrete codes are stored with pandas' "category" dtype.
categoricalFeatureNames = ["season", "holiday", "workingday", "weather", "weekday", "month", "year", "hour"]
data = data.astype({featureName: "category" for featureName in categoricalFeatureNames})

# Keep the target aside, then remove it together with the columns that are
# not used as model inputs (casual + registered, raw timestamp strings).
yLabels = data["count"]
dropFeatures = ['casual', "count", "datetime", "date", "registered"]
data = data.drop(columns=dropFeatures)
def rmsle(y, y_, convertExp=True):
    """Return the root mean squared logarithmic error between two sequences.

    Args:
        y: Actual values (any array-like of numbers).
        y_: Predicted values, same length as ``y``.
        convertExp: When True, both inputs are passed through ``np.exp``
            before the error is computed.

    Returns:
        ``sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2))`` as a NumPy float.
        NaN logarithms are replaced via ``np.nan_to_num`` before differencing.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_, dtype=float)
    if convertExp:
        # Plain assignments: a trailing comma here would wrap the array in a tuple.
        actual = np.exp(actual)
        predicted = np.exp(predicted)
    # log1p(v) == log(v + 1), evaluated for the whole array at once.
    log1 = np.nan_to_num(np.log1p(actual))
    log2 = np.nan_to_num(np.log1p(predicted))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))
# Fit both models on the log1p-transformed target and score them with RMSLE.
# Both scores are computed on the same rows the models were fitted on,
# so they are training errors.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

yLabelsLog = np.log1p(yLabels)

# Baseline: linear regression.
lModel = LinearRegression()
lModel.fit(X=data, y=yLabelsLog)
preds = lModel.predict(X=data)
# np.expm1 is the exact inverse of np.log1p; np.exp would leave every value
# shifted by +1 before rmsle applies its own log(v + 1).
print("RMSLE Value For Linear Regression: ", rmsle(np.expm1(yLabelsLog), np.expm1(preds), False))

# Random forest with 100 trees, scored the same way for a like-for-like comparison.
rfModel = RandomForestRegressor(n_estimators=100)
rfModel.fit(data, yLabelsLog)
preds = rfModel.predict(X=data)
print("RMSLE Value For Random Forest: ", rmsle(np.expm1(yLabelsLog), np.expm1(preds), False))
RMSLE Value For Linear Regression: 0.9779011300976505
RMSLE Value For Random Forest: 0.1025119847901103
通过结果我们看出,随机森林的效果明显好于线性回归。需要注意的是,以上 RMSLE 是在训练集上计算的,随机森林的误差会被明显低估,实际效果应通过验证集或交叉验证来评估。
待完善:超参数调优。
关注公众号:AI学习部 ,获取更多数据分析相关知识。