airport-天池

import pandas as pd
from dateutil.parser import parse 
import datetime
import numpy as np
import pylab as pl
from sympy import *

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

导入并预处理数据

分析wifi每天的变化,可知在凌晨4点时安检、航班以及wifi连接数均为0,因此将其作为每天的分界点

def imp_dat(data_dir="E:\\data_learn\\tianchi_stround\\da2"):
    """Load the five raw CSV tables as DataFrames.

    Args:
        data_dir: Directory that contains the CSV files. It defaults to the
            location that used to be hard-coded, so existing ``imp_dat()``
            calls keep reading the same files.

    Returns:
        Tuple ``(departure, flights, gates, security_check, wifi_records)``,
        one DataFrame per file, in that order.
    """
    import os  # local import: only this function needs it

    file_names = (
        "airport_gz_departure_chusai_2ndround.csv",
        "airport_gz_flights_chusai_2ndround.csv",
        "airport_gz_gates.csv",
        "airport_gz_security_check_chusai_2ndround.csv",
        "WIFI_AP_Passenger_Records_chusai_2ndround.csv",
    )
    return tuple(pd.read_csv(os.path.join(data_dir, name)) for name in file_names)
# Average the WiFi connection counts over 10-minute slices
def mean_wifi(wifi_records):
    """Average the passenger count of every WiFi AP over 10-minute slices.

    Every ``timeStamp`` is read as fixed-width text of the form
    ``YYYY-MM-DD-HH-MM-SS``. The slice label is cut straight out of that
    text, so days and hours with a leading zero keep it (routing the label
    through an integer dropped the zero and corrupted the label for days
    01-09).

    Args:
        wifi_records: DataFrame with ``WIFIAPTag``, ``timeStamp`` and
            ``passengerCount`` columns. As before, the helper columns
            ``timeTen`` and ``timeStr`` are added to it in place.

    Returns:
        DataFrame indexed by slice label ``YYYY-MM-DD-HH-m`` (``m`` is the
        tens digit of the minute) with one column per ``WIFIAPTag``; each
        cell is the mean ``passengerCount`` of that AP in that slice, NaN
        when the AP has no record there.
    """
    stamps = wifi_records['timeStamp'].values

    # Helper columns kept on the caller's frame for backward compatibility.
    # ``//`` floors to the 10-minute mark on Python 2 and Python 3 alike.
    wifi_records['timeTen'] = [
        int(ts[8:16].replace('-', '')) // 10 * 10 for ts in stamps
    ]
    wifi_records['timeStr'] = [ts[:10] for ts in stamps]

    per_record = pd.DataFrame({
        'WIFIAPTag': wifi_records['WIFIAPTag'].values,
        # date + '-' + hour + '-' + tens digit of the minute
        'slice10min': [ts[:10] + '-' + ts[11:13] + '-' + ts[14] for ts in stamps],
        'passengerCount': wifi_records['passengerCount'].values,
    })
    slice_mean = per_record.groupby(['WIFIAPTag', 'slice10min'])['passengerCount'].mean()
    # rows: time slices, columns: AP tags
    return slice_mean.unstack().T
# Split the WiFi connection counts by day
def getWIFIday11_24(wifi_all_split):
    """Cut the 10-minute WiFi table into per-day frames for 2016-09-10..25.

    A "day" runs from the 04:00 slice of one date to the 04:00 slice of the
    next date, both ends included, so that boundary slice appears in two
    consecutive frames.

    Args:
        wifi_all_split: DataFrame indexed by slice labels of the form
            ``YYYY-MM-DD-HH-m`` (compared as plain strings).

    Returns:
        ``(wifi_day, wifi_dayt)``: two lists of 16 DataFrames. ``wifi_day[k]``
        holds the full 04:00-to-04:00 window of day ``10 + k``;
        ``wifi_dayt[k]`` holds only the 15:00-17:50 slices of that date.
    """
    labels = wifi_all_split.index
    wifi_day = []
    wifi_dayt = []
    for day in range(10, 26):
        today = '2016-09-%02d' % day
        tomorrow = '2016-09-%02d' % (day + 1)
        whole_day = [a for a in labels if today + '-04-0' <= a <= tomorrow + '-04-0']
        afternoon = [a for a in labels if today + '-15-0' <= a <= today + '-17-5']
        # .loc replaces the removed .ix indexer; the selection is label-based either way
        wifi_day.append(wifi_all_split.loc[whole_day, :])
        wifi_dayt.append(wifi_all_split.loc[afternoon, :])
    return wifi_day, wifi_dayt
# Load the raw tables, average the WiFi records per 10-minute slice,
# then cut the result into per-day frames.
departure,flights,gates,security_check,wifi_records=imp_dat()
wifi_all_split=mean_wifi(wifi_records)
wifi_day,wifi_dayt=getWIFIday11_24(wifi_all_split)
# Merge the flights with their boarding-gate areas
def getFla_gat(flights,gates):
    """Join flights with gate information and derive per-flight time fields.

    Columns are addressed by name instead of by position, and the
    time-of-day text is formatted from the datetime values themselves rather
    than sliced out of their printed form (which differs between NumPy
    versions).

    Args:
        flights: DataFrame with ``scheduled_flt_time``, ``actual_flt_time``
            and ``BGATE_ID`` columns. As before, the columns
            ``scheduled_flt`` and ``acutal_flt`` are added to it in place.
        gates: DataFrame with a ``BGATE_ID`` column; left-joined onto the
            flights.

    Returns:
        The merged DataFrame without the two raw time columns and with
        ``scheduled_flt`` / ``acutal_flt`` (parsed time plus 8 hours, or the
        sentinel ``0`` when the raw cell is not a string), ``timeInDay``
        (``HH:MM:SS`` of the scheduled time), ``late_time/min`` (actual minus
        scheduled in minutes, ``-1`` when there is no actual time) and
        ``id_flt`` (``HH:MM:SS_<BGATE_ID>``).
    """
    # presumably converts UTC to local (UTC+8) time -- confirm against the data description
    shift = datetime.timedelta(hours=8)

    def _shifted(raw):
        # Non-string cells (e.g. NaN for a missing time) keep the sentinel 0.
        return parse(raw) + shift if isinstance(raw, str) else 0

    def _clock(value, fallback):
        # pandas Timestamps are datetime subclasses, so both kinds pass the check.
        if isinstance(value, datetime.datetime):
            return value.strftime('%H:%M:%S')
        return fallback

    flights['scheduled_flt'] = [_shifted(a) for a in flights['scheduled_flt_time'].values]
    flights['acutal_flt'] = [_shifted(a) for a in flights['actual_flt_time'].values]

    flight_gate = pd.merge(flights, gates, on='BGATE_ID', how='left')
    scheduled = list(flight_gate['scheduled_flt'])
    actual = list(flight_gate['acutal_flt'])

    flight_gate['timeInDay'] = [_clock(sch, '') for sch in scheduled]

    # Delay in minutes; flights without an actual time are marked with -1.
    flight_gate['late_time/min'] = [
        round((act - sch).total_seconds(), 0) / 60
        if isinstance(act, datetime.datetime) else -1
        for sch, act in zip(scheduled, actual)
    ]

    flight_gate['id_flt'] = [
        _clock(sch, str(sch)[-8:]) + '_' + str(gate)
        for sch, gate in zip(scheduled, flight_gate['BGATE_ID'].values)
    ]

    del flight_gate['scheduled_flt_time']
    del flight_gate['actual_flt_time']
    return flight_gate
def separate_flight(flight_gate):
    all_=[]
    plane_fight_dic=[]
    for i in xrange(10,26):
        logi=[flight_gate.ix[a,'scheduled_flt']>=datetime.datetime(2016,9,i,4,0) and flight_gate.ix[a,'scheduled_flt']
# Build the merged flight/gate table, then pass it through separate_flight.
flight_gate=getFla_gat(flights,gates)
fl_gt,plane_flight_dic=separate_flight(flight_gate)
D:\Anaconda2\lib\site-packages\ipykernel\__main__.py:56: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda2\lib\site-packages\ipykernel\__main__.py:57: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# res_new holds the merged security-check / flight table.
# The code below calls .groupby on res[0][i], so the file stores pickled Python
# objects; NumPy >= 1.16.3 refuses to load those unless allow_pickle=True.
# NOTE: unpickling can execute arbitrary code -- only load a res_new.npy that
# you produced yourself.
res=np.load('res_new.npy',allow_pickle=True)

def fli_gat_count():
    """Merge the security-check head-counts into every table of ``fl_gt``.

    Author's description of the result: each table has the columns
    id_flt, 0, BGATE_ID, scheduled_flt, acutal_flt, BGATE_AREA, timeInDay,
    late_time/min, area_gate, id_concat -- where column ``0`` is the total
    number of security-check passengers for that aircraft type (not for the
    single flight) and ``id_concat`` names the flights that belong to it.

    Reads the module-level ``res`` and ``fl_gt``.

    Returns:
        List with one merged, de-duplicated DataFrame per entry of ``res[0]``.
    """
    merged_tables = []
    for day in range(len(res[0])):
        # passengers per (security-check time, id_flt), one column per id_flt
        per_flight = res[0][day].groupby(['secTime', 'id_flt']).size().unstack()
        # 10-minute bins -> running total -> final total per id_flt
        running_total = per_flight.resample('10T').sum().fillna(0).cumsum()
        totals = running_total.max().reset_index()

        merged = pd.merge(totals, fl_gt[day], how='right', on='id_flt')
        del merged['flight_ID']

        # '_a_b_c' style key built from the members of flt_set
        merged['id_concat'] = [
            ''.join('_' + member for member in flt_set)
            for flt_set in merged['flt_set'].values
        ]
        # flt_set is removed before de-duplicating, as in the original order
        del merged['flt_set']
        merged_tables.append(merged.drop_duplicates())
    return merged_tables
# One merged count table per entry of res[0]
fl_count=fli_gat_count()

建立模型

假设每个机型对该wifi点的影响集中在起飞前3小时内:若该机型总人数为 $N$,则起飞前3小时内每10分钟的wifi连接比例为 $a_i\ (i=1,2,\dots,18)$。如果航班延误,则延误时间段内的wifi比例一直取 $a_{18}$,即最后一个比例值。将该候机厅所有机型对应的人数相加,与实际wifi连接数对应;最后,使用简单线性回归求取参数 $a_i$。

每个机型对全天wifi_count的影响函数

# Scratch cell: the symbolic parameter vector scaled by 15, then displayed
passAr=np.array(getpassAr())*15
passAr
array([15*a_0, 15*a_1, 15*a_2, 15*a_3, 15*a_4, 15*a_5, 15*a_6, 15*a_7,
       15*a_8, 15*a_9, 15*a_10, 15*a_11, 15*a_12, 15*a_13, 15*a_14,
       15*a_15, 15*a_16, 15*a_17, 15*a_18, 15*a_19, 15*a_20, 15*a_21,
       15*a_22, 15*a_23, 15*a_24], dtype=object)
# Scratch cell: sample values for trying out the slicing used in getpassCount
n=30;parN=25;rN=19;n_last=1;
#np.array([0]*(n-parN)+list(passAr[:rN])+list(passAr[rN])*n_last+list(passAr[-6:])+(144-n-n_last)*[0])
# element of passAr that getpassCount repeats for delayed slots
passAr[rN]
15*a_19
# N is the total number of passengers of the flight
from dateutil.parser import parse
parN=25# number of parameters
def getpassCount(sctN):
    """Lay one flight's symbolic WiFi-load profile onto a 144-slot day.

    The day consists of 144 ten-minute slots; slot 0 starts at 04:00 on the
    calendar date of the scheduled time. The profile is built as

        first ``parN - 6`` parameters,
        parameter ``parN - 6`` repeated once per delayed slot,
        last 6 parameters,

    every entry scaled by ``N``. It begins ``parN`` slots before the
    scheduled slot; whatever falls outside slots 0..143 is cut off and the
    remaining slots are zero.

    Building the profile once and clipping it replaces the six hand-written
    cases: one of them multiplied a list by a list (TypeError), a combined
    offset of exactly 150 slots matched no case and returned None, and
    flights in the first hour produced a vector of the wrong length.
    ``//`` keeps the slot counts integral on Python 3.

    Args:
        sctN: Sequence ``(sct, act, N)`` of scheduled time, actual time and
            passenger count. ``sct`` and ``act`` must be datetime-like; the
            ``0`` sentinel used for a missing actual time raises TypeError
            here, exactly as before.

    Returns:
        Object array of length 144 holding sympy expressions and plain zeros.
    """
    slots_per_day = 144
    sct, act, N = sctN
    passAr = np.array(getpassAr()) * N

    day_start = datetime.datetime.combine(sct.date(), datetime.time(4, 0))
    # ``.seconds`` wraps around for negative differences, so a scheduled time
    # before 04:00 lands late in the window instead of at a negative slot.
    n = (sct - day_start).seconds // 600
    n_last = (act - sct).seconds // 600
    rN = parN - 6

    profile = list(passAr[:rN]) + [passAr[rN]] * n_last + list(passAr[rN:])

    first_slot = n - parN
    if first_slot >= 0:
        profile = [0] * first_slot + profile
    else:
        # the profile would start before slot 0: drop the part that precedes it
        profile = profile[-first_slot:]

    profile = profile[:slots_per_day]
    profile += [0] * (slots_per_day - len(profile))
    return np.array(profile)

一个入口所有航班wifi_count

def allofgate(N=1,gate_id='A01'):
    """Sum the WiFi-load vectors of every flight at one gate in one table.

    Args:
        N: Index into the module-level ``fl_count`` list.
        gate_id: ``BGATE_ID`` whose flights are summed.

    Returns:
        Element-wise sum of ``getpassCount`` over the selected flights.
        A flight (other than the first) for which ``getpassCount`` fails is
        skipped and its position is printed.

    Raises:
        IndexError: if no flight in ``fl_count[N]`` uses ``gate_id``.
    """
    fl_counN = fl_count[N]
    at_gate = fl_counN['BGATE_ID'] == gate_id
    # Each row: [scheduled time, actual time, value of column 0].
    # .loc replaces the removed .ix indexer.
    gatN = fl_counN.loc[at_gate, ['scheduled_flt', 'acutal_flt', 0]].values

    all_ = getpassCount(gatN[0])
    for i in range(1, len(gatN)):
        try:
            all_ = all_ + getpassCount(gatN[i])
        except Exception:
            # best effort: report the position of the flight that could not be added
            print(i)
    return all_
# Stack all tables of fl_count into one frame. Every table after the first
# gets a 'day' column (first 10 characters of the scheduled time as text).
# NOTE(review): fl_count[0] never receives 'day', so its rows have no day in
# the grouping below and end up with day 0 after fillna -- confirm this is intended.
day_frames=[fl_count[0]]
for i in range(1,len(fl_count)):
    try:
        tmp=fl_count[i]
        tmp['day']=[a[0:10] for a in fl_count[i]['scheduled_flt'].astype(str)]
        day_frames.append(tmp)
    except Exception:
        # best effort: report the table that could not be prepared
        print(i)
# pd.concat replaces DataFrame.append, which current pandas no longer provides
fl_counNall=pd.concat(day_frames)

# number of rows per gate (rows) and day (columns)
fl_gb=fl_counNall.groupby(['BGATE_ID','day']).size().unstack().fillna(0)

fl_counNall.fillna(0,inplace=True)
# rows whose BGATE_ID is 0 after the fillna above
gate_nan=fl_counNall[fl_counNall['BGATE_ID']==0]

#fl_counNall.groupby(['BGATE_ID','day']).size().unstack().fillna(0).head(15)

一个区域对应的wifi点

def getwifiArea(wifiTag=None):
    """Group WiFi AP tags by area, i.e. by their first two characters.

    Args:
        wifiTag: Iterable of tag strings. When omitted, the column labels of
            the module-level ``wifi_day[1]`` are used, which is what the
            function always did before.

    Returns:
        Dict mapping each two-character prefix to the list of tags that start
        with it, in their original order.
    """
    if wifiTag is None:
        wifiTag = wifi_day[1].columns
    wi_dic = {}
    for tag in wifiTag:
        wi_dic.setdefault(tag[:2], []).append(tag)
    return wi_dic
# area prefix -> list of WiFi AP tags
wi_dic=getwifiArea()
### Define the count parameters for the 18 ten-minute slots before take-off
# NOTE(review): the line above speaks of 18 slots while parN below is 25 -- confirm which is current
from sympy import *
parN=25# number of parameters defined
def getpassAr():
    """Create the sympy symbols ``a_0`` .. ``a_<parN-1>`` used as model parameters.

    Returns:
        Whatever ``symbols`` yields for the list of ``parN`` names (callers
        treat it as a sequence with one symbol per name).
    """
    names = ['a_%d' % k for k in range(parN)]
    return symbols(names)
# Parameter symbols; bound as the default ``parms`` of getFactor below
parabc=getpassAr()
def getFactor(spy,parms=parabc):
    """Read the integer coefficient of every parameter out of one expression.

    Args:
        spy: A sympy expression (a single term or a sum of terms), or a plain
            number / string.
        parms: Parameter symbols to look up; defaults to ``parabc``.

    Returns:
        1-D integer array with one entry per element of ``parms``. Each
        coefficient is converted with ``int()``, so fractional parts are
        dropped. Plain numbers and strings give an all-zero row.
    """
    # isinstance (rather than an exact type comparison) also covers NumPy
    # scalars and subclasses, which carry no parameter either.
    if isinstance(spy, (int, float, str, np.number)):
        return np.array([0] * len(parms))

    # A sum contributes term by term; anything else is treated as one term.
    terms = spy.args if spy.is_Add else (spy,)
    # Sized by ``parms`` so a custom parameter list of another length works too.
    totals = np.array([0] * len(parms))
    for term in terms:
        totals = totals + np.array([int(term.coeff(p)) for p in parms])
    return totals
def factorMatrix(p1):
    """Stack the coefficient rows of a sequence of expressions into a matrix.

    Args:
        p1: Iterable of expressions accepted by ``getFactor``.

    Returns:
        Array with one ``getFactor`` row per entry that could be converted.
        An entry for which ``getFactor`` fails is printed and skipped.
    """
    # NOTE(review): a skipped entry makes the matrix shorter than ``p1``, so
    # later rows no longer line up with position-matched targets -- confirm
    # callers can tolerate that.
    rows = []
    for expr in p1:
        try:
            rows.append(getFactor(expr))
        except Exception:
            print(expr)
    return np.array(rows)

线性回归分析

前N天所有数据

# Collect the design rows (x) and the observed WiFi counts (y) of tables 1..14
x_list,y_list=[],[]
for n in range(1,15):
    x=factorMatrix(allofgate(N=n,gate_id='A101'))
    # .loc replaces the removed .ix indexer; [:-1] drops the last slice
    # (presumably the 04:00 boundary shared with the next day -- confirm)
    y=wifi_day[n].loc[:,'E1-3A'].values[:-1]
    # keep only the first 60 ten-minute slices of each table
    x,y=x[:60],y[:60]
    x_list.extend(x)
    y_list.extend(y)
from sklearn.linear_model import LinearRegression
rg=LinearRegression()
rg.fit(x_list,y_list)
# one fitted coefficient per parameter
pl.plot(rg.coef_)

你可能感兴趣的:(airport-天池)