Telecom Shanghai Dataset (kaggle.com)
数据集按时间拆分存放：每15天的数据为一张表（一个 Excel 文件）。
import pandas as pd
# Load one Excel sheet of the dataset and relabel its columns with the
# names used throughout the rest of the script.
column_names = [
    'Data',
    'start time',
    'end time',
    'cell station lon',
    'cell station lat',
    'user id',
]
data = pd.read_excel(
    'Downloads/Telecom Shanghai Dataset/data_10.110.15.xlsx',
    names=column_names,
)
# Discard every row that has at least one missing value.
data1 = data.dropna(axis=0, how='any')
data1
import datetime
# Keep only the records whose start time falls inside the analysis window,
# then order them by user and by time so that consecutive rows of the same
# user are adjacent and chronological.
#
# NOTE(review): the original statement was cut off right after the second
# "data1['start time']" (the upper bound of the window was lost and the
# brackets were left unbalanced). A one-day window starting on 2014-10-15
# is ASSUMED here -- confirm the intended end of the window.
window_start = datetime.datetime(2014, 10, 15)
window_end = window_start + datetime.timedelta(days=1)
in_window = (data1['start time'] >= window_start) & (data1['start time'] < window_end)
data2 = data1[in_window]
data2 = data2.sort_values(by=['user id', 'start time'])
data2
为每条记录生成当前位置标识，并记录前一时刻（上一条记录）的位置，用于判断位置是否发生了变化。
# Build a single string key per record from the two cell-station coordinates
# so that "same place" can be tested with one comparison.
data2['location'] = data2['cell station lon'].astype(str) + '_' + data2['cell station lat'].astype(str)
data2
# Previous location of the SAME user. data2 is sorted by user id and start
# time, so a plain shift(1) over the whole frame would compare the first
# record of each user with the last record of the previous user and silently
# drop it whenever both happen to be at the same cell. Shifting inside each
# user group leaves the first record of every user with a missing previous
# location instead.
data2['prev_location'] = data2.groupby('user id')['location'].shift(1)
data2
# A missing previous location never equals the current one, so the first
# record of every user is always flagged as a change and therefore kept.
data2['location_changed'] = data2['location'] != data2['prev_location']
data2
# Keep only the records at which the user's location changed.
data3 = data2[data2['location_changed']]
data3
# Drop the helper columns again; only the original fields are needed below.
data3 = data3[['start time', 'end time', 'cell station lon', 'cell station lat', 'user id']]
data3
统计每个用户 id 在 data3 中出现的次数（即该用户保留下来的位置变化记录数）。
# Number of rows in data3 per user, as a two-column frame:
# 'user id' and 'count'.
iid = data3.groupby('user id').size().rename('count').reset_index()
iid
# Summary statistics of the per-user counts; the string below is the output
# pasted from one run.
iid['count'].describe()
'''
count 2956.000000
mean 4.675237
std 4.769128
min 1.000000
25% 2.000000
50% 3.000000
75% 6.000000
max 69.000000
Name: count, dtype: float64
'''
# Keep only the users that have more than 10 records ...
has_enough_records = iid['count'] > 10
iid = iid.loc[has_enough_records]
iid
# ... and restrict the records to those users.
belongs_to_kept_user = data3['user id'].isin(iid['user id'])
data4 = data3.loc[belongs_to_kept_user]
data4
import numpy as np
# folium is used below but was never imported in the visible code.
import folium

# Draw the trajectory of one user: a marker per record plus a line that
# connects the records in row order.
tmp = data4[data4['user id'] == '00a05a4f2b937fd38888c03213c4deb2'].reset_index()
# One row per record: [cell station lon, cell station lat].
tra_lst = tmp[['cell station lon', 'cell station lat']].to_numpy()
# NOTE(review): folium interprets every location as (latitude, longitude),
# while the points here are built as [lon column, lat column]. The order is
# kept exactly as in the original; confirm that the column labels assigned
# when the sheet is loaded match the real column order of the file, and swap
# the two columns here if the map ends up in the wrong place.
# Centre the map on the mean of all points.
m = folium.Map(location=tra_lst.mean(axis=0), zoom_start=13)
for i in tra_lst:
    folium.Marker(location=i).add_to(m)
folium.PolyLine(locations=tra_lst).add_to(m)
m