✨作者主页:IT研究室✨
个人简介:曾从事计算机专业培训教学,擅长Java、Python、微信小程序、Golang、安卓Android等。接项目定制开发、代码讲解、答辩教学、文档编写、降重等。
☑文末获取源码☑
精彩专栏推荐⬇⬇⬇
Java项目
Python项目
安卓项目
微信小程序项目
#数据预处理
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif'] = ["SimHei"]#用来显示正常的中文
matplotlib.rcParams['axes.unicode_minus'] = False#显示负号
from sklearn.linear_model import Lasso,Ridge
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Load two frames: ``df`` is the one transformed and plotted below, ``df1`` is
# read from a second file and left as loaded.
df = pd.read_csv("副本SeoulBikeData.csv")
df1 = pd.read_csv("SeoulBikeData.csv")
print(df)

# Keep a parsed datetime column and split the raw "Date" text on "/" into
# integer calendar parts.
# NOTE(review): the positions used below treat the text as year/month/day,
# while the sample left in the original comment ("30/08/2021") is
# day/month/year - confirm the actual layout of the CSV.
df['date'] = pd.to_datetime(df['Date'])
date_parts = df['Date'].str.split("/", expand=True)
for column_name, position in (("day", 2), ("month", 1), ("year", 0)):
    df[column_name] = date_parts[position].astype('int')
# The raw text column is no longer needed once the parts are extracted.
del df["Date"]
df.info()
print(df.head())
# Side-by-side box plots of the rental count: by season on the left axes and
# by hour of day on the right axes.
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.set_size_inches(12, 6)
for category, target_axes in (("Seasons", ax1), ("Hour", ax2)):
    sns.boxplot(
        data=df,
        y="Rented Bike Count",
        x=category,
        orient="v",
        ax=target_axes,
    )
# Axis labels and titles (shown in Chinese in the rendered figure).
ax1.set(ylabel='骑行人数', xlabel='季节', title="不同季节骑行人数")
ax2.set(xlabel='时间', ylabel='骑行人数', title="一天不同时间骑行人数")
plt.show()
# 不同月份骑行人数
def Data_Analysis_and_Visualization_month(df):
    """Plot the mean rental count for each month as a bar chart.

    Args:
        df: DataFrame with an integer ``month`` column (1-12) and a
            ``Rented Bike Count`` column.

    The aggregated frame is printed before plotting, and the figure is shown
    with ``plt.show()``.
    """
    fig1, ax1 = plt.subplots()
    fig1.set_size_inches(12, 12)
    sortOrder = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October",
                 "November", "December"]
    monthAggregated = pd.DataFrame(df.groupby("month")["Rented Bike Count"].mean()).reset_index()
    print(monthAggregated)
    # Look each bar's label up from its own month number instead of passing
    # the full 12-name list: the list only lines up with the bars when every
    # month is present in the data.
    monthAggregated["month_name"] = [sortOrder[int(m) - 1] for m in monthAggregated["month"]]
    # Draw on the axes created above rather than on whatever axes is current.
    sns.barplot(data=monthAggregated, x="month_name", y="Rented Bike Count", ax=ax1)
    ax1.set(xlabel='月份', ylabel='平均骑行人数', title="不同月份骑行人数")
    plt.show()
# 不同季节不同时间的骑行人数
def season_And_hour(bikedata1):
    """Plot the mean rental count per hour, one line per season.

    Args:
        bikedata1: DataFrame with ``Hour``, ``Seasons`` and
            ``Rented Bike Count`` columns.

    The figure is shown with ``plt.show()``.
    """
    fig2, ax2 = plt.subplots()
    fig2.set_size_inches(12, 8)
    hueOrder = ['Spring', 'Summer', 'Autumn', 'Winter']
    # Aggregate the frame that was passed in; the previous version ignored
    # the argument and always read the module-level ``df``.
    hourAggregated = pd.DataFrame(
        bikedata1.groupby(["Hour", "Seasons"])["Rented Bike Count"].mean()
    ).reset_index()
    # Draw on the axes created above rather than on whatever axes is current.
    sns.pointplot(
        data=hourAggregated,
        x="Hour",
        y="Rented Bike Count",
        hue="Seasons",
        hue_order=hueOrder,
        ax=ax2,
    )
    ax2.set(xlabel='时间', ylabel='骑行人数', title='不同季节不同时间的骑行人数')
    plt.show()
def main():
    """Draw the per-month chart and the season-by-hour chart from the module-level ``df``."""
    for draw_chart in (Data_Analysis_and_Visualization_month, season_And_hour):
        draw_chart(df)


# Script entry point.
if __name__ == "__main__":
    main()
# Density plots of four numeric variables on a 2x2 grid:
# dew point temperature, humidity, wind speed and temperature.
plt.rcParams['font.sans-serif'] = ["SimHei"]  # font able to render Chinese labels
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# Only one figure is created here. The previous version first opened an
# extra, never-used figure that appeared as an empty window at plt.show().
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(10, 10)
sns.set_style('darkgrid')
sns.set_palette('RdBu')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# because the seaborn version this script targets is not visible here.
ax2 = sns.distplot(df['Dew point temperature'], ax=axes[0][0])
ax3 = sns.distplot(df['Humidity(%)'], ax=axes[0][1])
ax4 = sns.distplot(df['Wind speed (m/s)'], ax=axes[1][0])
# Temperature is placed explicitly in the remaining cell instead of relying
# on it being the current axes.
ax1 = sns.distplot(df['Temperature'], ax=axes[1][1])
plt.show()
# Relationship between each variable and the rental count.
# A 3x3 grid of scatter plots with a fitted regression line, one cell per
# numeric column, filled row by row in the order listed.
numeric_columns = [
    "Hour",
    "Temperature",
    "Humidity(%)",
    "Wind speed (m/s)",
    "Visibility (10m)",
    "Dew point temperature",
    "Solar Radiation (MJ/m2)",
    "Rainfall(mm)",
    "Snowfall (cm)",
]
fig = plt.figure()
grid_axes = [fig.add_subplot(3, 3, cell) for cell in range(1, len(numeric_columns) + 1)]
for column, cell_axes in zip(numeric_columns, grid_axes):
    sns.regplot(x=column, y="Rented Bike Count", data=df, ax=cell_axes)
# The categorical columns are plotted from df1 in a separate pairplot figure.
sns.pairplot(
    df1,
    x_vars=['Seasons', 'Holiday', 'Functioning Day'],
    y_vars=['Rented Bike Count'],
    plot_kws={'alpha': 0.1},
)
plt.show()
# Correlation between each variable and the rental count.
# Correlation matrix, computed over the numeric columns only: df also holds
# text columns (e.g. 'Seasons', 'Holiday') and the datetime 'date' column, and
# pandas 2.0+ raises on non-numeric columns instead of silently skipping them.
corr = df.select_dtypes(include="number").corr()
# Printed explicitly: a bare expression shows nothing when run as a script.
print(corr)
plt.rcParams['font.sans-serif'] = ["SimHei"]  # font able to render Chinese labels
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# Correlation of the rental count with every other variable, largest first.
# Observation recorded by the original author: from largest to smallest the
# order is temperature, hour, dew point temperature, solar radiation,
# visibility, wind speed, rainfall, snowfall, humidity; the negatively
# correlated variables are humidity, rainfall and snowfall.
print(corr['Rented Bike Count'].sort_values(ascending=False))
# Heat map of the correlation matrix.
fig = plt.figure(figsize=(12, 12))
ax5 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, ax=ax5, square=False, annot=True, cmap='RdBu', linewidths=0.5)
plt.title('Heatmap on Correlation', fontsize=30)
plt.show()
#
# Variable-by-variable study.
# Temperature.
# Hourly rows are too fine-grained to display, so aggregate per day and take
# the median temperature of each day.
plt.rcParams['font.sans-serif'] = ["SimHei"]  # font able to render Chinese labels
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
Temperaturedf = df.groupby(['date'], as_index=False).agg({'year': 'mean', 'month': 'mean', 'Temperature': 'median'})
print("Temper2\n", Temperaturedf)
# Drop rows with missing values so the line chart has no gaps.
Temperaturedf.dropna(axis=0, how='any', inplace=True)
# Renumber the rows 0..n-1 after dropping, so the positions used for the
# ticks below match the index the line is plotted against.
Temperaturedf.reset_index(drop=True, inplace=True)
# Make sure the per-day key is in datetime format.
Temperaturedf['date'] = pd.to_datetime(Temperaturedf['date'])
fig = plt.figure(figsize=(18, 6))
ax = fig.add_subplot(1, 1, 1)
# Line chart of the per-day temperature over time.
print("TEm\n", Temperaturedf['Temperature'])
print("date\n", Temperaturedf['date'].head(10000))
# One tick every 30 rows, labelled with that row's date (looked up by
# position, not by index label).
x_index = list(range(0, len(Temperaturedf), 30))
x_label = [Temperaturedf['date'].iloc[i] for i in x_index]
plt.plot(Temperaturedf['Temperature'], linewidth=1.3, label='Daily average')
plt.xticks(x_index, x_label, color='blue', rotation=60)
ax.set_title('一年内日平均气温变化趋势')
# Mean rental count for each distinct temperature value.
Temperaturer = df.groupby(['Temperature'], as_index=True).agg({'Rented Bike Count': 'mean'})
Temperaturer.plot(title='租车量随温度的变化')
plt.show()
# Hour of day.
# Point plot of the rental count per hour over the whole data set.
fig = plt.figure()
sns.pointplot(x=df['Hour'], y=df['Rented Bike Count'])

# Mean rental count per hour, computed separately for rows flagged "Holiday"
# and rows flagged "No Holiday".
hourly_mean_by_flag = {}
for holiday_flag in ("Holiday", "No Holiday"):
    flagged_rows = df[df['Holiday'] == holiday_flag]
    hourly_mean_by_flag[holiday_flag] = flagged_rows.groupby(['Hour'], as_index=True).agg(
        {'Rented Bike Count': 'mean'}
    )
Holidaydf = hourly_mean_by_flag["Holiday"]
nHolidaydf = hourly_mean_by_flag["No Holiday"]

# Two panels sharing the y axis: holidays on the left, other days on the right.
fig, axes = plt.subplots(1, 2, sharey=True)
Holidaydf.plot(figsize=(15, 5), title='非工作日内每小时发起的平均租赁次数', ax=axes[0])
nHolidaydf.plot(figsize=(15, 5), title='工作日内每小时发起的平均租赁次数', ax=axes[1])
# No plt.show() here: these figures are displayed by the next plt.show() call.
# Mean rental count for each distinct dew point temperature and for each
# distinct solar radiation value, one line chart per variable, displayed
# together by the plt.show() at the end.
for feature, chart_title in (
    ('Dew point temperature', '租车量随液化的温度的变化'),
    ('Solar Radiation (MJ/m2)', '租车量随太阳辐射的变化'),
):
    mean_by_feature = df.groupby([feature], as_index=True).agg({'Rented Bike Count': 'mean'})
    mean_by_feature.plot(title=chart_title)
plt.show()
# One line chart per weather variable, in this order: visibility, wind speed,
# rainfall, snowfall, humidity. All of them are displayed together by the
# plt.show() at the end.
# Wind speed uses the maximum instead of the mean: per the original author,
# very high wind speeds are rare, so a mean over them would look anomalous.
feature_charts = (
    ('Visibility (10m)', 'mean', '租车量随能见度的变化'),
    ('Wind speed (m/s)', 'max', '不同风速下每小时启动的最大租赁次数'),
    ('Rainfall(mm)', 'mean', '租车量随降雨量的变化'),
    ('Snowfall (cm)', 'mean', '租车量随降雪量的变化'),
    ('Humidity(%)', 'mean', '租车量随湿度的变化'),
)
for feature, statistic, chart_title in feature_charts:
    rentals_by_feature = df.groupby([feature], as_index=True).agg({'Rented Bike Count': statistic})
    rentals_by_feature.plot(title=chart_title)
plt.show()
# Bar chart comparing the mean rental count between the values of the
# 'Holiday' column.
holiday_groups = df.groupby(['Holiday'], as_index=True)
weekdaydf = holiday_groups.agg({'Rented Bike Count': 'mean'})
weekdaydf.plot.bar(stacked=True, title='工作日和节假日平均用车总量对比')
plt.show()
# Effect of year and month on the rental count.
# Hourly rows are too fine-grained to display, so aggregate per day and sum
# the rentals of each day.
countdf = df.groupby(['date'], as_index=False).agg({'year': 'mean', 'month': 'mean', 'Rented Bike Count': 'sum'})
# Drop rows with missing values so the line chart has no gaps.
countdf.dropna(axis=0, how='any', inplace=True)
# Renumber the rows 0..n-1 after dropping, so the positions used for the
# ticks below match the index the line is plotted against.
countdf.reset_index(drop=True, inplace=True)
# Make sure the per-day key is in datetime format.
countdf['date'] = pd.to_datetime(countdf['date'])
# Figure size.
fig = plt.figure(figsize=(18, 6))
ax = fig.add_subplot(1, 1, 1)
# One tick every 30 rows, labelled with that row's date (looked up by
# position, not by index label).
x_index = list(range(0, len(countdf), 30))
x_label = [countdf['date'].iloc[i] for i in x_index]
# Line chart of the per-day rental total over time.
plt.plot(countdf['Rented Bike Count'], linewidth=1.3, label='Daily average')
ax.set_title('一年内每日总用车的变化趋势')
plt.xticks(x_index, x_label, color='blue', rotation=60)
plt.show()
【Python】基于Spark的共享单车数据存储系统
大家可以帮忙点赞、收藏、关注、评论啦~
源码获取:私信我
精彩专栏推荐⬇⬇⬇
Java项目
Python项目
安卓项目
微信小程序项目