关注CSDN:程志伟的博客
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
colors=sns.color_palette("deep")
H:\Anaconda3\lib\site-packages\statsmodels\tools\_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
import pandas.util.testing as tm
# Read the airline on-time flight data into a DataFrame.
# NOTE(review): the path is a hard-coded absolute Windows path; adjust it for your machine.
data=pd.read_csv('H:/0date/airport-ontime.csv')
# Preview the first rows of the table.
data.head()
Out[2]:
FL_DATE UNIQUE_CARRIER ... DISTANCE_GROUP Unnamed: 16
0 2014-06-01 AA ... 10 NaN
1 2014-06-01 AA ... 10 NaN
2 2014-06-01 AA ... 10 NaN
3 2014-06-01 AA ... 10 NaN
4 2014-06-01 AA ... 11 NaN
[5 rows x 17 columns]
#查看数据的维度
data.shape
Out[3]: (502617, 17)
#查看数据的列名
data.columns
Out[4]:
Index(['FL_DATE', 'UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID',
'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN_STATE_ABR',
'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID',
'DEST_STATE_ABR', 'DEP_DELAY_NEW', 'DEP_DEL15', 'ARR_DELAY_NEW',
'ARR_DEL15', 'DISTANCE', 'DISTANCE_GROUP', 'Unnamed: 16'],
dtype='object')
#查看各列数据的信息
data.info()
RangeIndex: 502617 entries, 0 to 502616
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 FL_DATE 502617 non-null object
1 UNIQUE_CARRIER 502617 non-null object
2 ORIGIN_AIRPORT_ID 502617 non-null int64
3 ORIGIN_AIRPORT_SEQ_ID 502617 non-null int64
4 ORIGIN_CITY_MARKET_ID 502617 non-null int64
5 ORIGIN_STATE_ABR 502617 non-null object
6 DEST_AIRPORT_ID 502617 non-null int64
7 DEST_AIRPORT_SEQ_ID 502617 non-null int64
8 DEST_CITY_MARKET_ID 502617 non-null int64
9 DEST_STATE_ABR 502617 non-null object
10 DEP_DELAY_NEW 492974 non-null float64
11 DEP_DEL15 492974 non-null float64
12 ARR_DELAY_NEW 490716 non-null float64
13 ARR_DEL15 490716 non-null float64
14 DISTANCE 502617 non-null float64
15 DISTANCE_GROUP 502617 non-null int64
16 Unnamed: 16 0 non-null float64
dtypes: float64(6), int64(7), object(4)
memory usage: 65.2+ MB
# Drop every column whose values are all missing, in place
# (in this data set that is the empty 'Unnamed: 16' column shown by info() above).
data.dropna(how='all',inplace=True,axis=1)
# Re-check the column summary after the drop.
data.info()
RangeIndex: 502617 entries, 0 to 502616
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 FL_DATE 502617 non-null object
1 UNIQUE_CARRIER 502617 non-null object
2 ORIGIN_AIRPORT_ID 502617 non-null int64
3 ORIGIN_AIRPORT_SEQ_ID 502617 non-null int64
4 ORIGIN_CITY_MARKET_ID 502617 non-null int64
5 ORIGIN_STATE_ABR 502617 non-null object
6 DEST_AIRPORT_ID 502617 non-null int64
7 DEST_AIRPORT_SEQ_ID 502617 non-null int64
8 DEST_CITY_MARKET_ID 502617 non-null int64
9 DEST_STATE_ABR 502617 non-null object
10 DEP_DELAY_NEW 492974 non-null float64
11 DEP_DEL15 492974 non-null float64
12 ARR_DELAY_NEW 490716 non-null float64
13 ARR_DEL15 490716 non-null float64
14 DISTANCE 502617 non-null float64
15 DISTANCE_GROUP 502617 non-null int64
dtypes: float64(5), int64(7), object(4)
memory usage: 61.4+ MB
#计算各列的缺失数量
data.isnull().sum()
Out[7]:
FL_DATE 0
UNIQUE_CARRIER 0
ORIGIN_AIRPORT_ID 0
ORIGIN_AIRPORT_SEQ_ID 0
ORIGIN_CITY_MARKET_ID 0
ORIGIN_STATE_ABR 0
DEST_AIRPORT_ID 0
DEST_AIRPORT_SEQ_ID 0
DEST_CITY_MARKET_ID 0
DEST_STATE_ABR 0
DEP_DELAY_NEW 9643
DEP_DEL15 9643
ARR_DELAY_NEW 11901
ARR_DEL15 11901
DISTANCE 0
DISTANCE_GROUP 0
dtype: int64
# Show the first 5 values of the DEP_DEL15 column
data.DEP_DEL15.head()
Out[8]:
0 0.0
1 0.0
2 0.0
3 1.0
4 0.0
Name: DEP_DEL15, dtype: float64
# Share of flights for each DEP_DEL15 value (0.0 = not delayed, 1.0 = delayed).
# The denominator is len(data), i.e. all rows including those where DEP_DEL15 is
# missing, so the shares do not have to add up to 1.
data.DEP_DEL15.value_counts()/len(data)
Out[9]:
0.0 0.731812
1.0 0.249003
Name: DEP_DEL15, dtype: float64
# Same share computation for the arrival-delay flag ARR_DEL15 (denominator is all rows).
data.ARR_DEL15.value_counts()/len(data)
Out[10]:
0.0 0.718261
1.0 0.258061
Name: ARR_DEL15, dtype: float64
# Number of delayed departures per origin state.
# DEP_DEL15 only takes the values 0.0 / 1.0 here, so summing it per group counts the delayed flights.
by_origin_state=data.groupby('ORIGIN_STATE_ABR')
departure_delay_counts=by_origin_state.DEP_DEL15.sum()
# Display the states ordered from most to fewest delayed departures.
departure_delay_counts.sort_values(ascending=False)
Out[11]:
ORIGIN_STATE_ABR
TX 19016.0
CA 15976.0
IL 12174.0
FL 7862.0
GA 7799.0
CO 6042.0
NY 4732.0
NV 3995.0
AZ 3939.0
VA 3267.0
NC 3138.0
NJ 2955.0
MO 2628.0
MD 2426.0
MN 2369.0
MI 2301.0
WA 2009.0
PA 1999.0
TN 1916.0
MA 1796.0
OH 1594.0
LA 1473.0
UT 1305.0
WI 1165.0
OR 1057.0
OK 877.0
KY 861.0
IN 801.0
AR 789.0
NM 745.0
AL 641.0
SC 638.0
AK 524.0
NE 523.0
HI 496.0
PR 434.0
IA 418.0
CT 293.0
KS 265.0
MS 255.0
SD 250.0
ND 245.0
ID 218.0
MT 164.0
ME 146.0
RI 144.0
WY 123.0
VT 108.0
NH 99.0
WV 89.0
VI 46.0
DE 19.0
TT 9.0
Name: DEP_DEL15, dtype: float64
# Number of delayed arrivals per destination state (sum of the 0.0 / 1.0 ARR_DEL15 flag).
by_dest_state=data.groupby('DEST_STATE_ABR')
arrival_delay_counts=by_dest_state.ARR_DEL15.sum()
# Display the states ordered from most to fewest delayed arrivals.
arrival_delay_counts.sort_values(ascending=False)
Out[12]:
DEST_STATE_ABR
TX 17849.0
CA 17700.0
IL 11529.0
FL 7537.0
GA 6566.0
NY 6270.0
CO 5361.0
VA 3663.0
AZ 3572.0
NV 3562.0
NJ 3076.0
NC 2961.0
MO 2724.0
MN 2434.0
WA 2412.0
PA 2403.0
MA 2303.0
MI 2300.0
TN 2085.0
OH 2036.0
MD 1924.0
LA 1603.0
WI 1546.0
UT 1412.0
OR 1407.0
OK 1100.0
KY 1049.0
IN 981.0
AR 866.0
NM 845.0
AL 803.0
HI 735.0
SC 731.0
AK 717.0
NE 670.0
IA 595.0
PR 568.0
CT 526.0
ND 389.0
KS 383.0
SD 360.0
ID 328.0
MS 317.0
RI 301.0
MT 270.0
ME 192.0
NH 189.0
WY 188.0
VT 153.0
WV 99.0
VI 73.0
DE 30.0
TT 13.0
Name: ARR_DEL15, dtype: float64
# Combine the two per-state Series into one table: each Series becomes a row of the
# DataFrame, and the transpose turns them into the DEP_DEL15 / ARR_DEL15 columns
# with one row per state.
delay_df = pd.DataFrame([departure_delay_counts, arrival_delay_counts]).T
delay_df
Out[13]:
DEP_DEL15 ARR_DEL15
AK 524.0 717.0
AL 641.0 803.0
AR 789.0 866.0
AZ 3939.0 3572.0
CA 15976.0 17700.0
CO 6042.0 5361.0
CT 293.0 526.0
DE 19.0 30.0
FL 7862.0 7537.0
GA 7799.0 6566.0
HI 496.0 735.0
IA 418.0 595.0
ID 218.0 328.0
IL 12174.0 11529.0
IN 801.0 981.0
KS 265.0 383.0
KY 861.0 1049.0
LA 1473.0 1603.0
MA 1796.0 2303.0
MD 2426.0 1924.0
ME 146.0 192.0
MI 2301.0 2300.0
MN 2369.0 2434.0
MO 2628.0 2724.0
MS 255.0 317.0
MT 164.0 270.0
NC 3138.0 2961.0
ND 245.0 389.0
NE 523.0 670.0
NH 99.0 189.0
NJ 2955.0 3076.0
NM 745.0 845.0
NV 3995.0 3562.0
NY 4732.0 6270.0
OH 1594.0 2036.0
OK 877.0 1100.0
OR 1057.0 1407.0
PA 1999.0 2403.0
PR 434.0 568.0
RI 144.0 301.0
SC 638.0 731.0
SD 250.0 360.0
TN 1916.0 2085.0
TT 9.0 13.0
TX 19016.0 17849.0
UT 1305.0 1412.0
VA 3267.0 3663.0
VI 46.0 73.0
VT 108.0 153.0
WA 2009.0 2412.0
WI 1165.0 1546.0
WV 89.0 99.0
WY 123.0 188.0
# Bar chart of both delay counts per state, ordered by departure-delay count (largest first).
delay_df.sort_values('DEP_DEL15', ascending=False).plot(kind='bar', title='Number of delayed flights by state')
Out[14]:

data.ORIGIN_STATE_ABR.value_counts()
Out[15]:
CA 64034
TX 61504
IL 34573
FL 34502
GA 32328
NY 22438
CO 21562
AZ 15519
VA 14122
NC 13868
NV 13339
MI 11560
WA 11315
MN 10620
NJ 10045
MA 9829
UT 9791
MO 9432
PA 9424
HI 8364
MD 7996
TN 7221
OH 6915
LA 6007
OR 5935
WI 5088
AK 3642
IN 3527
KY 3517
OK 3444
AL 2671
NM 2610
SC 2577
PR 2540
AR 2535
NE 2052
CT 1871
MT 1716
IA 1643
ID 1589
ND 1456
MS 1147
SD 1096
KS 1071
RI 1007
WY 964
ME 696
NH 601
VT 472
VI 431
WV 294
DE 76
TT 41
Name: ORIGIN_STATE_ABR, dtype: int64
# Turn the per-state delay counts into delay rates: delayed flights divided by the
# total number of flights for the same state (the division aligns on the state index).
# NOTE(review): the results are fractions (count / count), although the plot titles say "%".
pct_departure_delay = departure_delay_counts / data.ORIGIN_STATE_ABR.value_counts()
pct_arrival_delay = arrival_delay_counts / data.DEST_STATE_ABR.value_counts()
# Bar chart of the departure-delay rate per origin state, highest first.
pct_departure_delay.sort_values(ascending=False).plot(kind='bar', title='% flights with departure delays by origin state')
Out[16]:

# Bar chart of the arrival-delay rate per destination state, highest first,
# drawn in the second color of the seaborn "deep" palette.
pct_arrival_delay.sort_values(ascending=False).plot(kind='bar', color=colors[1], title='% flights with arrival delay by destination state')
Out[17]:

# Put both delay rates side by side (one row per state) under explicit column names,
# then plot them together, ordered by the arrival-delay rate, for comparison.
pct_delay_df = pd.DataFrame([pct_departure_delay, pct_arrival_delay], index=['PCT_DEP_DEL15', 'PCT_ARR_DEL15']).T
pct_delay_df.sort_values('PCT_ARR_DEL15', ascending=False).plot(kind='bar', title='Overlapping % delay plots for comparison')
Out[18]:

#### Delay trends between airports (grouped here by origin / destination state) ####
# Sum the ARR_DEL15 flag for every (origin state, destination state) pair, giving the
# number of delayed arrivals on each route. The result has a two-level row index.
delay_counts_df = data[['ORIGIN_STATE_ABR', 'DEST_STATE_ABR', 'ARR_DEL15']].groupby(['ORIGIN_STATE_ABR', 'DEST_STATE_ABR']).sum()
delay_counts_df.head()
Out[19]:
ARR_DEL15
ORIGIN_STATE_ABR DEST_STATE_ABR
AK AK 351.0
AZ 5.0
CA 11.0
CO 21.0
GA 3.0
# "Support" of each route: its delayed-arrival count divided by the total number of
# rows in the data set (all flights, not just the flights on that route).
support = (delay_counts_df / len(data))
support.head()
Out[20]:
ARR_DEL15
ORIGIN_STATE_ABR DEST_STATE_ABR
AK AK 0.000698
AZ 0.000010
CA 0.000022
CO 0.000042
GA 0.000006
# Pivot the inner index level (destination state) into columns, producing an
# origin-by-destination matrix. Routes that never occur become NaN.
support = support.unstack()
support.head()
Out[21]:
ARR_DEL15 ...
DEST_STATE_ABR AK AL AR AZ ... WA WI WV WY
ORIGIN_STATE_ABR ...
AK 0.000698 NaN NaN 0.000010 ... 0.000209 NaN NaN NaN
AL NaN NaN NaN NaN ... NaN NaN NaN NaN
AR NaN NaN NaN 0.000008 ... NaN NaN NaN NaN
AZ 0.000026 NaN 0.000008 0.000129 ... 0.000290 0.000062 NaN NaN
CA 0.000056 NaN 0.000008 0.001846 ... 0.001423 0.000068 NaN 0.0
[5 rows x 53 columns]
# After unstack(), `support` has two-level (multi-level) columns:
# the outer level is ARR_DEL15 and the inner level is DEST_STATE_ABR.
# Drop the outer ARR_DEL15 level so the columns are just the destination states.
# The result is an origin-by-destination matrix of delay shares.
support = support.droplevel(0, axis=1)
support.head()
Out[22]:
DEST_STATE_ABR AK AL AR AZ ... WA WI WV WY
ORIGIN_STATE_ABR ...
AK 0.000698 NaN NaN 0.000010 ... 0.000209 NaN NaN NaN
AL NaN NaN NaN NaN ... NaN NaN NaN NaN
AR NaN NaN NaN 0.000008 ... NaN NaN NaN NaN
AZ 0.000026 NaN 0.000008 0.000129 ... 0.000290 0.000062 NaN NaN
CA 0.000056 NaN 0.000008 0.001846 ... 0.001423 0.000068 NaN 0.0
[5 rows x 53 columns]
def asymmatplot(plotmat, names=None, cmap="Greys", cmap_range=None, ax=None, **kwargs):
    '''
    Plot a (possibly asymmetric) matrix as a color-mapped image with a colorbar.

    A modification of the symmatplot() function in Seaborn; this version draws the
    whole matrix (no triangle is masked).
    See https://github.com/mwaskom/seaborn/blob/master/seaborn/linearmodels.py for the original.

    Parameters
    ----------
    plotmat : 2-D array-like or DataFrame
        Matrix to draw. ``len(plotmat)`` is used as the tick count on both axes,
        so the matrix is assumed to be square.
    names : sequence of str, optional
        Tick labels, used for both the x and the y axis. When None, the numeric
        tick labels are kept.
    cmap : str or Colormap
        Colormap forwarded to ``ax.matshow``.
    cmap_range : (vmin, vmax), optional
        Explicit color limits. When None, the limits are the NaN-ignoring
        min / max of ``plotmat``, each multiplied by 1.15.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to the current axes.
    **kwargs
        Extra keyword arguments forwarded to ``ax.matshow``.

    Returns
    -------
    The Axes that was drawn on.

    Raises
    ------
    ValueError
        If ``cmap_range`` is given but does not have exactly two elements.
    '''
    if ax is None:
        ax = plt.gca()

    nvars = len(plotmat)

    if cmap_range is None:
        # NOTE(review): the 1.15 factor is inherited from symmatplot, where values are
        # centered on zero; for an all-positive matrix it raises vmin above the data minimum.
        vmax = np.nanmax(plotmat) * 1.15
        vmin = np.nanmin(plotmat) * 1.15
    elif len(cmap_range) == 2:
        vmin, vmax = cmap_range
    else:
        raise ValueError("cmap_range argument not understood")

    mat_img = ax.matshow(plotmat, cmap=cmap, vmin=vmin, vmax=vmax, **kwargs)
    # Attach the colorbar to the figure that owns `ax`, not to whatever figure is current.
    ax.figure.colorbar(mat_img, ax=ax, shrink=.75)

    # Major ticks sit on the cell centers, minor ticks on the cell borders (for the grid).
    # `minor` must be passed by keyword: the second positional parameter is `labels`.
    major_ticks = np.arange(nvars)
    minor_ticks = major_ticks - 0.5
    ax.set_xticks(major_ticks)
    ax.set_yticks(major_ticks)
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_yticks(minor_ticks, minor=True)

    ax.xaxis.set_ticks_position("bottom")

    # Labels are applied only after the tick locations are fixed, so they map one-to-one.
    if names is not None:
        ax.set_xticklabels(names, rotation=90)
        ax.set_yticklabels(names)

    ax.grid(False, which="major")
    ax.grid(True, which="minor", linestyle="-")
    return ax
# Draw the origin-by-destination support matrix as a heat map on a large square figure.
# The destination-state column labels are used as tick names on both axes.
fig, ax = plt.subplots(figsize=(18,18))
asymmatplot(support, names=support.columns, ax=ax, cmap='OrRd')
Out[24]:

## Delay ratio per route
# Total number of flights for every (origin state, destination state) pair.
trip_counts_df = (
    data[['ORIGIN_STATE_ABR', 'DEST_STATE_ABR', 'FL_DATE']]
    .groupby(['ORIGIN_STATE_ABR', 'DEST_STATE_ABR'])
    .count()
    .rename(columns={'FL_DATE': 'COUNTS'})
)
# Give both tables the same column name so the division lines up column-wise.
delay_counts_df = delay_counts_df.rename(columns={'ARR_DEL15': 'COUNTS'})
## Divide the number of delayed flights by the total number of flights,
## then pivot destination states into columns (origin x destination matrix).
mat = (delay_counts_df / trip_counts_df)['COUNTS'].unstack()
fig, ax = plt.subplots(figsize=(18, 18))
asymmatplot(mat, names=mat.columns, ax=ax, cmap='OrRd', cmap_range=(0., 1.0))
Out[28]:

#### Daily distribution of arrival delays ####
# One box per flight date (x axis) showing the arrival delay in ARR_DELAY_NEW (y axis).
# x / y are passed by keyword: recent seaborn releases no longer accept the data
# vectors positionally, and with the old positional order the delay ended up on the
# x axis, which does not match the date-label formatting done by autofmt_xdate().
fig, ax = plt.subplots(figsize=(18,10))
sns.boxplot(x='FL_DATE', y='ARR_DELAY_NEW', data=data, ax=ax)
fig.autofmt_xdate()

# Same plot without the outlier points, so the boxes themselves are readable.
fig, ax = plt.subplots(figsize=(18,10))
sns.boxplot(x='FL_DATE', y='ARR_DELAY_NEW', data=data, ax=ax, showfliers=False)  # showfliers=False hides outliers
fig.autofmt_xdate()