import pandas as pd
import numpy as np
# Pre-processing for 2006-2008: stack the three yearly files, replace missing
# values with 0, and drop the columns the later stages never read.
df1, df2, df3 = (
    pd.read_csv(yearly_file)
    for yearly_file in ('2006.csv', '2007.csv', '2008.csv')
)
df = pd.concat([df1, df2, df3]).fillna(0)
df.to_csv('06-08.csv', index=None)

# The combined file is read back from disk before the columns are removed.
unused_columns = [
    'Year',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'UniqueCarrier',
    'FlightNum',
    'TailNum',
]
df = pd.read_csv('06-08.csv')
df = df.drop(labels=unused_columns, axis=1)
df.to_csv('r06-08.csv', index=None)
# Flight counts for 2006-2008: how often each airport appears as the origin
# and as the destination of a flight.
df = pd.read_csv('r06-08.csv')

# value_counts() keeps the airport code in the index.  The index is named
# 'airport' and turned into a regular column so that the files written here
# carry the exact header ('airport', 'Origin' / 'Dest') that the merge below
# joins on, independent of how the installed pandas labels value_counts().
df1 = df['Origin'].value_counts().rename_axis('airport').reset_index(name='Origin')
df2 = df['Dest'].value_counts().rename_axis('airport').reset_index(name='Dest')
df1.to_csv('ORIGIN.csv', index=False)
df2.to_csv('DEST.csv', index=False)

# Total number of departing plus arriving flights per airport.
df1 = pd.read_csv('ORIGIN.csv')
df2 = pd.read_csv('DEST.csv')
# Outer join: an airport seen only as origin (or only as destination) is kept,
# and its missing count becomes 0.
df = pd.merge(df1, df2, on=['airport'], how='outer')
df = df.fillna(0)
df['Sum'] = df['Origin'] + df['Dest']
df.to_csv('Airports.csv', index=False)
# Per-airport statistics written to 'Airport.csv':
#   ArrDelay10  - share of arriving flights delayed by more than 10 minutes
#   DepDelay10  - share of departing flights delayed by more than 10 minutes
#   Cancelled   - share of cancelled flights (arrivals and departures together)
#   AveDistance - mean distance of all flights that start or end at the airport
df = pd.read_csv('Airports.csv')
df0 = pd.read_csv('r06-08.csv')

# Flight-level conditions that do not depend on the airport are built once
# instead of once per airport.
arr_late10 = df0['ArrDelay'] > 10
dep_late10 = df0['DepDelay'] > 10
was_cancelled = df0['Cancelled'] == 1

late_arrivals = []
late_departures = []
cancellations = []
distance_totals = []
for airport in df['airport']:
    arriving = df0['Dest'] == airport
    departing = df0['Origin'] == airport
    late_arrivals.append((arriving & arr_late10).sum())
    late_departures.append((departing & dep_late10).sum())
    # Cancelled arrivals and cancelled departures are counted separately and
    # added, mirroring how 'Sum' adds the 'Origin' and 'Dest' counts.
    cancellations.append(
        (was_cancelled & arriving).sum() + (was_cancelled & departing).sum()
    )
    # Each flight touching the airport contributes its distance once.
    distance_totals.append(df0.loc[arriving | departing, 'Distance'].sum())

# Whole columns are assigned in one step.  The previous per-cell chained form
# (df[col].loc[i] = value) writes through an intermediate Series, which pandas
# does not guarantee to propagate back to the frame.  Series division yields
# NaN/inf rather than raising when an airport has a zero count.
df['ArrDelay10'] = pd.Series(late_arrivals, index=df.index) / df['Dest']
df['DepDelay10'] = pd.Series(late_departures, index=df.index) / df['Origin']
df['Cancelled'] = pd.Series(cancellations, index=df.index) / df['Sum']
df['AveDistance'] = pd.Series(distance_totals, index=df.index) / df['Sum']
df.to_csv('Airport.csv', index=False)
# Additional per-airport attributes written to 'Airports_extended.csv':
#   ArrDelay30   - share of arriving flights delayed by more than 30 minutes
#   DepDelay30   - share of departing flights delayed by more than 30 minutes
#   SumDelay10   - share of all flights (arrivals + departures) delayed > 10 min
#   SumDelay30   - share of all flights (arrivals + departures) delayed > 30 min
#   AveArrDelay  - mean arrival delay over arrivals with a positive delay
#   AveDepDelay  - mean departure delay over departures with a positive delay
#   AveArrDelay0 - mean arrival delay over all arrivals
#   AveDepDelay0 - mean departure delay over all departures
df = pd.read_csv('Airport.csv')
df = df.fillna(0)
df0 = pd.read_csv('r06-08.csv')

arr_delay = df0['ArrDelay']
dep_delay = df0['DepDelay']

# Flight-level conditions that do not depend on the airport are built once.
arr_late10 = arr_delay > 10
dep_late10 = dep_delay > 10
arr_late30 = arr_delay > 30
dep_late30 = dep_delay > 30
arr_positive = arr_delay > 0
dep_positive = dep_delay > 0

stat_names = [
    'arr30', 'dep30', 'arr10', 'dep10',
    'pos_arr_sum', 'pos_arr_n', 'pos_dep_sum', 'pos_dep_n',
    'all_arr_sum', 'all_arr_n', 'all_dep_sum', 'all_dep_n',
]
records = []
for airport in df['airport']:
    arriving = df0['Dest'] == airport
    departing = df0['Origin'] == airport
    pos_arrivals = arr_delay[arriving & arr_positive]
    pos_departures = dep_delay[departing & dep_positive]
    all_arrivals = arr_delay[arriving]
    all_departures = dep_delay[departing]
    records.append((
        (arriving & arr_late30).sum(),
        (departing & dep_late30).sum(),
        (arriving & arr_late10).sum(),
        (departing & dep_late10).sum(),
        pos_arrivals.sum(), len(pos_arrivals),
        pos_departures.sum(), len(pos_departures),
        all_arrivals.sum(), len(all_arrivals),
        all_departures.sum(), len(all_departures),
    ))
stats = pd.DataFrame(records, columns=stat_names, index=df.index)

# Whole columns are assigned in one step.  The previous per-cell chained form
# (df[col].loc[i] = value) writes through an intermediate Series, which pandas
# does not guarantee to propagate back to the frame.  Series division yields
# NaN/inf rather than raising when an airport has no matching flights.
df['ArrDelay30'] = stats['arr30'] / df['Dest']
df['DepDelay30'] = stats['dep30'] / df['Origin']
df['SumDelay10'] = (stats['arr10'] + stats['dep10']) / df['Sum']
df['SumDelay30'] = (stats['arr30'] + stats['dep30']) / df['Sum']
df['AveArrDelay'] = stats['pos_arr_sum'] / stats['pos_arr_n']
df['AveDepDelay'] = stats['pos_dep_sum'] / stats['pos_dep_n']
df['AveArrDelay0'] = stats['all_arr_sum'] / stats['all_arr_n']
df['AveDepDelay0'] = stats['all_dep_sum'] / stats['all_dep_n']
df.to_csv('Airports_extended.csv', index=False)
# Set-up for the next stage: reload the extended airport table with missing
# values replaced by 0, add six zero-initialised columns, and load the flight
# records together with 'airports.csv'.
# NOTE(review): the header comment previously at this position was a copy of
# the 10-minute-ratio stage's description and did not match these columns.
# What 'DelayforA'..'DelayforD', 'lat' and 'long' end up holding is decided by
# the code that follows this set-up - confirm there.
df = pd.read_csv('Airports_extended.csv').fillna(0)
ss = ['DelayforA', 'DelayforB', 'DelayforC', 'DelayforD', 'lat', 'long']
for s in ss:
    df[s] = 0

df0 = pd.read_csv('r06-08.csv')
dfa = pd.read_csv('airports.csv')

# Row counter consumed by the code that follows.
ii = 0