Python公交站点数据习题

# Read every file in the GPS folder and merge them into a single DataFrame.

import os

import matplotlib.pyplot as plt
import pandas as pd
from pylab import mpl

# SimHei is a CJK font; presumably selected so Chinese labels render in figures.
mpl.rcParams['font.sans-serif'] = ['SimHei']

path = 'D:/Rpython/file/time/'  # folder that holds the files to read
list1 = os.listdir(path)

# Collect the per-file frames and concatenate once: calling pd.concat inside
# the loop re-copies everything accumulated so far on every iteration.
frames = []
for i in list1:
    data = pd.read_csv(os.path.join(path, i), engine='python')
    frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no files.
data1 = pd.concat(frames) if frames else pd.DataFrame()

# Extract the records of bus route 68.
# Take an explicit copy: columns are assigned on `temp` further down, and
# writing to a boolean-mask slice of data1 triggers pandas'
# chained-assignment (SettingWithCopy) warnings.

temp = data1[data1.线路名称 == '68路'].copy()

# Draw a scatter plot of the route-68 records.

plt.figure()
# Columns 0 and 1 are taken as the bus longitude and latitude columns.
plt.scatter(temp.iloc[:, 0], temp.iloc[:, 1])
plt.show()

#聚类找到30个站,将虚拟类标签转换为实际站点

def cengci(temp, num):
    """Hierarchically cluster ``temp`` and cut the tree at height ``num``.

    A Ward linkage with Euclidean distances is built over the observations
    in ``temp``; the resulting tree is cut at height ``num``.

    Returns:
        A flat one-dimensional array holding the cluster label of each
        observation.
    """
    from scipy.cluster.hierarchy import cut_tree, linkage

    tree = linkage(temp, method='ward', metric='euclidean')
    # cut_tree returns one column per requested height; flatten it.
    return cut_tree(tree, height=num).reshape(-1)
from Cengci import cengci  #层次是自己写的一个相当于第三方库
import numpy as np
# Sweep the cut height upward in steps of 0.001, printing the number of
# clusters obtained at each height, and stop at the first height that gives
# 30 clusters or fewer.
for height in np.arange(0, 0.05, 0.001):
    nu = len(np.unique(cengci(temp.iloc[:, :2], height)))
    if nu >= 31:
        print('当前高度为{},聚类数目为{}'.format(height, nu))
        continue
    break

# Cluster the first two columns at the fixed cut height 0.042 and store the
# raw cluster label of every record in the '站点' column.
label = cengci(temp.iloc[:, :2], 0.042)
temp['站点'] = label

# Draw the scatter plot annotated with the real station numbers.

# list2[k] is the raw cluster label that corresponds to station k + 1.
list2 = [9, 1, 21, 16, 19, 0, 17, 4, 5, 26, 24, 23, 10, 20, 3, 28, 29, 11, 12, 27, 6, 18, 15, 7, 2, 22, 8, 14, 13, 25]

# The mask is built from the untouched `label` array, so stations that were
# already renumbered are never matched a second time.
for station, cluster in enumerate(list2, start=1):
    temp.loc[label == cluster, '站点'] = station

plt.figure()
for i in range(1, 31):
    tem = temp[temp.站点 == i]
    plt.scatter(tem.iloc[:, 0], tem.iloc[:, 1])
    # Put the station number at the centroid of its points.
    plt.text(tem.iloc[:, 0].mean(), tem.iloc[:, 1].mean(), str(i))
# Show once, after every station has been drawn, so that all stations end up
# on the same figure instead of one figure per loop iteration.
plt.show()

# Split the day into five periods around the morning and evening peaks, count
# the boardings per station in each period, and merge the counts into one
# DataFrame (rows: stations 1-30, columns: periods).

lis = [5, 8, 10, 17, 19, 23]  # period boundaries, in whole hours

# Keep only the time-of-day part of every record's timestamp.
temp['业务时间'] = [i.time() for i in pd.to_datetime(temp.业务时间, format='%Y%m%d %H:%M')]
bounds = [i.time() for i in pd.to_datetime(lis, format='%H')]

counts = []
for i in range(5):
    # NOTE(review): the filter on this line was garbled in the source (its
    # comparison operators were missing); it is reconstructed here as the
    # half-open interval [start, end) so a record that falls exactly on a
    # boundary is counted once - confirm against the intended definition.
    in_period = (temp.业务时间 >= bounds[i]) & (temp.业务时间 < bounds[i + 1])
    ban = temp[in_period]
    # value_counts() orders rows by frequency and omits stations without
    # boardings; reindex so every period has stations 1-30 in station order.
    ban1 = ban.站点.value_counts().reindex(range(1, 31), fill_value=0)
    counts.append(ban1)

weigth = pd.concat(counts, axis=1)
weigth.columns = [str(lis[i]) + '-' + str(lis[i + 1]) for i in range(5)]

# Plot, for every period, the per-station count divided by the length of the
# period in hours.
plt.figure()
for i in range(5):
    plt.plot(weigth.iloc[:, i] / (lis[i + 1] - lis[i]))
# Build the legend once, after all five lines exist, instead of rebuilding it
# on every iteration.
plt.legend(weigth.columns)
plt.show()

from pyecharts import Line
# Render the same per-hour series as an HTML line chart with pyecharts.
line = Line('折线图')
for i in range(5):
    start, end = lis[i], lis[i + 1]
    hourly = weigth.iloc[:, i] / (end - start)
    line.add('%d-%d个时段' % (start, end), list(range(30)), hourly)
line.render('zhexiantu1.html')

#构建泊松分布的下车概率矩阵

def posong(r, k):
    """Return the Poisson probability of observing exactly ``k`` events.

    Computes ``exp(-r) * r**k / k!``.

    Args:
        r: Rate (mean) of the distribution.
        k: Number of events; a non-negative integer.

    Returns:
        The probability mass at ``k`` as a float.

    Raises:
        ValueError: If ``k`` is negative (raised by ``math.factorial``).
    """
    import math

    # math.factorial(0) == 1, so k == 0 needs no special case.
    return math.exp(-r) * r**k / math.factorial(k)
# Poisson probabilities for rate 5 and k = 0..10.  (The bare `sum(...)` that
# preceded this line was a discarded expression with no effect in a script.)
li = [posong(5, i) for i in range(0, 11)]

# matrix[i, j] is the Poisson(15) probability of the offset j - i for j >= i;
# entries with j < i stay 0.  The value depends only on the offset, so each
# offset is evaluated once and reused for every row.
pmf = np.array([posong(15, d) for d in range(30)])
matrix = np.zeros([30, 30])
for i in range(30):
    matrix[i, i:] = pmf[:30 - i]

# Normalise every row so that it sums to 1.
matrix = matrix / matrix.sum(axis=1, keepdims=True)

# Build the weighted alighting-probability matrix (weights: boardings at each
# station) and, from it, the OD matrix of every period.

# NOTE(review): pandas >= 2.0 no longer ships a writer for the legacy .xls
# format; if this line fails, switch the file name to 'OD_matrix.xlsx'.
# One sheet is written per period: saving to the same file inside the loop
# would leave only the last period on disk.
with pd.ExcelWriter('OD_matrix.xls') as writer:
    for j in range(5):
        # Align the boarding counts by station id (1-30) instead of relabelling
        # the rows positionally, and treat stations without boardings as 0.
        bang = (weigth.iloc[:, j]
                .reindex(range(1, 31))
                .fillna(0)
                .to_numpy(dtype=float))

        # Weight each destination column by its boardings, then renormalise
        # every row.  Rows whose weights sum to 0 are left at 0 rather than
        # being divided by zero.
        weighted = matrix * bang
        row_sum = weighted.sum(axis=1, keepdims=True)
        matrix1 = np.divide(weighted, row_sum,
                            out=np.zeros_like(weighted), where=row_sum > 0)

        # Compute the alighting passengers of every station and build the OD matrix.
        # OD is allocated fresh for each period (it was never created before
        # use): 30 x 30 flows plus one totals row and one totals column.
        OD = np.zeros([31, 31])
        OD[:30, :30] = np.round(matrix1 * bang[:, None], 0)
        OD[30, :30] = OD[:30, :30].sum(axis=0)   # total alighting per station
        OD[:30, 30] = OD[:30, :30].sum(axis=1)   # row totals over the 30 flow columns only
        pd.DataFrame(OD).to_excel(writer, sheet_name=str(weigth.columns[j]),
                                  index=False, header=False)

你可能感兴趣的:(大数据分析,Python习题)