代码
import os
import numpy as np
import matplotlib.pyplot as plt
# Directory that holds the quarterly trip-history CSV files.
data_path = '/Users/miraco/PycharmProjects/DataMining/bikeshare'
# One CSV file per quarter, going by the file names.
data_filenames = [
    '2017-q1_trip_history_data.csv',
    '2017-q2_trip_history_data.csv',
    '2017-q3_trip_history_data.csv',
    '2017-q4_trip_history_data.csv',
]

# Directory where results are saved.
output_path = './bikeshare/output'
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

# Histogram settings shared by the numeric summary and the plots:
# n_bins equal-width bins spanning hist_range.
hist_range = (0, 180)
n_bins = 12
def collect_and_process_data():
    """Load every quarterly CSV and return trip durations split by rider type.

    For each file named in ``data_filenames`` (located under ``data_path``)
    the header row is skipped, the first column is taken as the duration and
    the last column as the rider type; double quotes are stripped from both.
    Durations are divided by 1000 and then by 60, which converts milliseconds
    to minutes (assumes the raw column is in milliseconds -- TODO confirm
    against the data set).

    Returns:
        A tuple ``(year_member_duration, year_casual_duration)`` of 1-D float
        arrays holding the converted durations of the rows whose type is
        exactly ``'Member'`` and exactly ``'Casual'`` respectively.
    """
    per_file_columns = []
    for filename in data_filenames:
        csv_path = os.path.join(data_path, filename)
        # ndmin=2 keeps the result two-dimensional even when a file holds a
        # single data row, so the column slices below always work.
        data = np.loadtxt(csv_path, delimiter=',', dtype='str', skiprows=1,
                          ndmin=2)
        # np.char is the public home of the string routines; the np.core
        # namespace is private and deprecated in NumPy 2.x.
        member_data = np.char.replace(data[:, -1], '"', '').reshape(-1, 1)
        duration_data = np.char.replace(data[:, 0], '"', '').reshape(-1, 1)
        # Put the two columns side by side and collect them per file.
        per_file_columns.append(
            np.concatenate([duration_data, member_data], axis=1)
        )

    # Stack the per-file (duration, type) tables into one table for the year.
    duration_member_all_col = np.concatenate(per_file_columns)
    rider_type = duration_member_all_col[:, 1]
    member_rows = duration_member_all_col[rider_type == 'Member']
    casual_rows = duration_member_all_col[rider_type == 'Casual']

    year_member_duration = member_rows[:, 0].astype('float') / 1000 / 60
    year_casual_duration = casual_rows[:, 0].astype('float') / 1000 / 60
    return year_member_duration, year_casual_duration
def analyze_data(year_member_duration, year_casual_duration):
    """Print histogram counts and bin edges for member and casual durations.

    Both histograms use the module-level ``hist_range`` and ``n_bins``
    settings. Nothing is returned; the results are written to stdout.
    """
    histogram_options = {'range': hist_range, 'bins': n_bins}
    member_counts, member_edges = np.histogram(
        year_member_duration, **histogram_options
    )
    casual_counts, casual_edges = np.histogram(
        year_casual_duration, **histogram_options
    )

    member_report = '会员直方图统计信息:\n{},\n 直方图分组边界:\n{}'.format(
        member_counts, member_edges
    )
    casual_report = '非会员直方图统计信息:\n{},\n直方图分组边界:\n{}'.format(
        casual_counts, casual_edges
    )
    print(member_report)
    print(casual_report)
def save_and_show_results(year_member_duration, year_casual_duration):
    """Draw member and casual duration histograms side by side.

    The figure is saved as ``type_hist.png`` inside ``output_path`` and then
    displayed. Both panels use the module-level ``hist_range`` and ``n_bins``.
    """
    figure = plt.figure(figsize=(10, 5))
    member_axes = figure.add_subplot(1, 2, 1)
    # Share the y axis so both panels use the same count scale.
    casual_axes = figure.add_subplot(1, 2, 2, sharey=member_axes)

    panels = (
        (member_axes, year_member_duration, 'Member'),  # member histogram
        (casual_axes, year_casual_duration, 'Casual'),  # casual histogram
    )
    for axes, durations, title in panels:
        axes.hist(durations, range=hist_range, bins=n_bins)
        axes.set_xticks(range(0, 181, 15))
        axes.set_title(title)
        axes.set_ylabel('Count')

    plt.tight_layout()
    target_file = os.path.join(output_path, 'type_hist.png')
    plt.savefig(target_file)
    plt.show()
def main():
    """Run the pipeline: load the durations, print the stats, draw the plots."""
    member_durations, casual_durations = collect_and_process_data()
    analyze_data(member_durations, casual_durations)
    save_and_show_results(member_durations, casual_durations)


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
>>>会员直方图统计信息:
[2063276 628882 56364 13624 4997 2702 1552 997 695
472 368 287],
直方图分组边界:
[ 0. 15. 30. 45. 60. 75. 90. 105. 120. 135. 150. 165. 180.]
非会员直方图统计信息:
[244526 371343 132917 68430 44607 33411 23362 16571 11690 8263
5903 4557],
直方图分组边界:
[ 0. 15. 30. 45. 60. 75. 90. 105. 120. 135. 150. 165. 180.]
总结下知识点
-
合并矩阵的操作
np.concatenate([duration_data,member_data],axis=1) #横向拼接,放进列表
-
直方图的使用
m_hist, m_bin_edges = np.histogram(
year_member_duration, #参量
range= hist_range, #数据范围,诸如[起始数,终止数]
bins = n_bins #分多少个区间
)
其中 np.histogram 函数可以对数据进行直方图分组统计,返回两个值:各区间内的数据个数(数组),以及各区间的边界(数组,长度比区间数多1)。
-
多个子图的绘制:
ax1 = fig.add_subplot(1,2,1) #1行2列排列的第一个位置
ax2 = fig.add_subplot(1,2,2,sharey = ax1) #y范围设置相同
在子图(Axes)对象上画图时需要注意:之前用 plt 设置刻度、标题、坐标轴标签的方法(如 plt.xticks、plt.title、plt.ylabel),在这里都要改用带 set_ 前缀的方法(如 set_xticks、set_title、set_ylabel)。
诸如用hist画图的时候,
ax1.hist(year_member_duration, range = hist_range, bins = n_bins)
ax1.set_xticks(range(0,181,15))
ax1.set_title('Member')
ax1.set_ylabel('Count')
练习:统计不同气温的天数直方图
题目描述:绘制1-3月气温在-10℃~10℃范围内的天数统计直方图
题目要求:
使用NumPy进行直方图统计
使用Matplotlib进行直方图绘制
数据文件:
数据源下载地址:https://video.mugglecode.com/temp2.csv(数据源与第二节练习相同)
temp2.csv,包含了2018年1-3月北京的气温(每日的最低温度)。每行记录为1天的数据。
共2列数据,第1列month为月份,第2列temperature为摄氏温度。
import os
import numpy as np
import matplotlib.pyplot as plt
# Monthly temperature CSV files to read (one file per month, going by the
# file names).
file_list = [
    '/Users/miraco/PycharmProjects/DataMining/bikeshare/data_temp/201802_temp.csv',
    '/Users/miraco/PycharmProjects/DataMining/bikeshare/data_temp/201801_temp.csv',
    '/Users/miraco/PycharmProjects/DataMining/bikeshare/data_temp/201803_temp.csv',
]

# Directory where the figure is saved; exist_ok=True creates it only when it
# is missing and avoids a check-then-create race.
output_path = './bikeshare/output'
os.makedirs(output_path, exist_ok=True)

# Histogram settings, defined once so the printed statistics and the plot
# always use the same bins.
temp_range = [-10, 10]
temp_bins = 20

# Read every file (skipping its header row) as integers and reshape the
# values into a single column.
data_list = []
for file in file_list:
    data = np.loadtxt(file, skiprows=1, delimiter=',', dtype='int')
    data_list.append(data.reshape(-1, 1))
all_data = np.concatenate(data_list)

# np.histogram returns (counts, bin_edges); hist_range holds the bin edges.
hist_all_count, hist_range = np.histogram(
    all_data, range=temp_range, bins=temp_bins
)
print('直方图统计信息:\n{}\n,区间:\n{}'.format(hist_all_count, hist_range))

plt.figure(figsize=(4, 4))
plt.hist(all_data, range=temp_range, bins=temp_bins)
plt.ylabel('Count')
plt.xlabel('Temperature')
plt.title('3 months temperature statistics')
# tight_layout only accounts for artists that already exist, so it must run
# after the title is set.
plt.tight_layout()
plt.savefig(os.path.join(output_path, 'temperature_hist.png'))
plt.show()
运行结果:
直方图统计信息:
[5 9 6 7 8 8 6 8 6 4 4 1 3 0 2 2 2 3 1 2]
,区间:
[-10. -9. -8. -7. -6. -5. -4. -3. -2. -1. 0. 1. 2. 3.
4. 5. 6. 7. 8. 9. 10.]