import pandas as pd
import numpy as np
from pylab import mpl
from datetime import datetime
import calendar
import seaborn as sn
import matplotlib.pyplot as plt
# Make SimHei the sans-serif font for every figure; the plots below use Chinese
# axis labels and titles (presumably unrenderable with the default font — confirm
# SimHei is installed on the target machine).
mpl.rcParams['font.sans-serif']=['SimHei']
def Data_Collect_and_Prosses(csv_path='train.csv', output_path='processed_data.csv', *, make_plots=True):
    """Load the bike-sharing CSV, derive time features, and drop ``count`` outliers.

    Steps, in order:

    1. Read ``csv_path`` and print its shape, first 50 rows and dtypes.
    2. Split the ``datetime`` column into ``date``, ``hour``, ``weekday`` and
       ``month`` columns and map the numeric ``season`` codes to names.
    3. Cast the discrete columns to ``category`` and drop ``datetime``.
    4. Optionally draw four box plots of ``count`` (saved to
       ``collect_and_process_data.png`` and shown).
    5. Keep only rows whose ``count`` lies within three standard deviations
       of the mean, write them to ``output_path`` and return them.

    Args:
        csv_path: CSV file to read. It must provide ``datetime`` (formatted as
            ``"<YYYY/MM/DD> <H:MM...>"``), ``season``, ``holiday``,
            ``workingday`` and ``count`` columns.
        output_path: Destination CSV for the filtered rows.
        make_plots: When ``False`` the box-plot figure is skipped entirely
            (nothing is saved or shown), which allows headless runs.

    Returns:
        The filtered ``DataFrame``.

    Raises:
        ValueError: If a date does not match ``%Y/%m/%d`` or an hour field is
            not an integer.
    """
    bike_data = pd.read_csv(csv_path)
    print(bike_data.shape)
    print(bike_data.head(50))
    print(bike_data.dtypes)

    _add_time_features(bike_data)
    bike_data['season'] = bike_data.season.map({1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'})
    print(bike_data)

    varlist = ['hour', 'weekday', 'month', 'season', 'holiday', 'workingday']
    bike_data = bike_data.astype({column: 'category' for column in varlist})
    print(bike_data.dtypes)

    bike_data = bike_data.drop(columns='datetime')
    print(bike_data.describe())

    if make_plots:
        _plot_count_boxplots(bike_data)

    processed_data = _drop_count_outliers(bike_data)
    print(processed_data)
    processed_data.to_csv(output_path)
    return processed_data


def _add_time_features(bike_data):
    """Add ``date``, ``hour``, ``weekday`` and ``month`` columns in place."""
    parts = bike_data['datetime'].str.split()
    bike_data['date'] = parts.str[0]
    # Hours are stored as integers so that, once cast to ``category``, they
    # sort numerically (0, 1, 2, ...) instead of lexicographically as strings
    # ('0', '1', '10', ...), which would scramble the x axis of per-hour plots.
    bike_data['hour'] = parts.str[1].str.split(':').str[0].astype(int)
    # Parse every date once, vectorized, instead of twice per row.
    parsed_dates = pd.to_datetime(bike_data['date'], format='%Y/%m/%d')
    bike_data['weekday'] = parsed_dates.dt.day_name()
    bike_data['month'] = parsed_dates.dt.month_name()


def _plot_count_boxplots(bike_data):
    """Draw, save and show a 2x2 grid of ``count`` box plots."""
    fig, axes = plt.subplots(nrows=2, ncols=2)
    fig.set_size_inches(12, 12)
    sn.boxplot(data=bike_data, y='count', orient='v', ax=axes[0][0])
    sn.boxplot(data=bike_data, x='season', y="count", orient='v', ax=axes[0][1])
    sn.boxplot(data=bike_data, x='hour', y="count", orient='v', ax=axes[1][0])
    sn.boxplot(data=bike_data, x='workingday', y="count", orient='v', ax=axes[1][1])
    axes[0][0].set(ylabel="骑行人数", title="骑行人数")
    axes[0][1].set(ylabel="骑行人数", xlabel="季节", title="各季节骑行人数")
    axes[1][0].set(ylabel="骑行人数", xlabel="时间段", title="各时间段骑行人数")
    axes[1][1].set(ylabel="骑行人数", xlabel="是否工作日", title="工作日和非工作日骑行人数")
    plt.savefig('collect_and_process_data.png')
    plt.show()


def _drop_count_outliers(bike_data):
    """Return the rows whose ``count`` is within 3 standard deviations of the mean."""
    counts = bike_data["count"]
    # Compute the deviation, threshold and mask once and reuse them for both
    # the diagnostic prints and the actual filtering.
    deviation = np.abs(counts - counts.mean())
    threshold = 3 * counts.std()
    within_threshold = deviation <= threshold
    print(deviation)
    print(threshold)
    print(within_threshold)
    return bike_data[within_threshold]
def Data_Analysis_and_Visualization_month(bike_data):
    """Plot the mean ``count`` per month as a bar chart, then save and show it.

    Args:
        bike_data: DataFrame with a ``month`` column holding English month
            names and a numeric ``count`` column.

    The figure is written to ``result_month.png`` before being displayed.
    """
    figure, axis = plt.subplots()
    figure.set_size_inches(12, 20)

    # Calendar order for the x axis, passed to seaborn via ``order``.
    calendar_order = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]

    mean_per_month = bike_data.groupby("month")["count"].mean()
    month_table = pd.DataFrame(mean_per_month).reset_index()
    month_table = month_table.sort_values(by="count", ascending=True)

    sn.barplot(data=month_table, x="month", y="count", order=calendar_order)
    axis.set(xlabel="月份", ylabel="平均骑行人数", title="不同月份的骑行人数")

    plt.savefig('result_month.png')
    plt.show()
def Data_Analysis_and_Visualization_hour(bike_data):
    """Plot the mean ``count`` per hour with one line per weekday.

    The aggregated table is printed, and the figure is saved to
    ``result_hour.png`` and then shown.

    Args:
        bike_data: DataFrame with ``hour``, ``weekday`` (English day names)
            and numeric ``count`` columns.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(12, 20)

    # Every weekday must be listed: seaborn only draws the hue levels named in
    # ``hue_order``, so a missing day would silently vanish from the plot.
    hueOrder = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

    hourAggregated = bike_data.groupby(['hour', 'weekday'])['count'].mean().reset_index()
    print(hourAggregated)

    # Refer to columns by name through ``data`` and draw on the axes created
    # above explicitly rather than relying on the implicit current axes.
    sn.pointplot(data=hourAggregated, x='hour', y='count',
                 hue='weekday', hue_order=hueOrder, ax=ax)
    ax.set(xlabel='时间', ylabel='骑行人数', title='一周内不同时间的骑行人数')

    plt.savefig('result_hour.png')
    plt.show()
def main():
    """Clean the data set, then render the per-month and per-hour charts."""
    cleaned = Data_Collect_and_Prosses()
    for render in (Data_Analysis_and_Visualization_month,
                   Data_Analysis_and_Visualization_hour):
        render(cleaned)


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()