python飞机_Python飞行案例分析,飞机,航班

关注CSDN:程志伟的博客

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

colors=sns.color_palette("deep")

H:\Anaconda3\lib\site-packages\statsmodels\tools\_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.

import pandas.util.testing as tm

#读取飞机航班数据

data=pd.read_csv('H:/0date/airport-ontime.csv')

data.head()

Out[2]:

FL_DATE UNIQUE_CARRIER  ...  DISTANCE_GROUP  Unnamed: 16

0  2014-06-01             AA  ...              10          NaN

1  2014-06-01             AA  ...              10          NaN

2  2014-06-01             AA  ...              10          NaN

3  2014-06-01             AA  ...              10          NaN

4  2014-06-01             AA  ...              11          NaN

[5 rows x 17 columns]

#查看数据的维度

data.shape

Out[3]: (502617, 17)

#查看数据的列名

data.columns

Out[4]:

Index(['FL_DATE', 'UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID',

'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN_STATE_ABR',

'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID',

'DEST_STATE_ABR', 'DEP_DELAY_NEW', 'DEP_DEL15', 'ARR_DELAY_NEW',

'ARR_DEL15', 'DISTANCE', 'DISTANCE_GROUP', 'Unnamed: 16'],

dtype='object')

#查看各列数据的信息

data.info()

RangeIndex: 502617 entries, 0 to 502616

Data columns (total 17 columns):

#   Column                 Non-Null Count   Dtype

---  ------                 --------------   -----

0   FL_DATE                502617 non-null  object

1   UNIQUE_CARRIER         502617 non-null  object

2   ORIGIN_AIRPORT_ID      502617 non-null  int64

3   ORIGIN_AIRPORT_SEQ_ID  502617 non-null  int64

4   ORIGIN_CITY_MARKET_ID  502617 non-null  int64

5   ORIGIN_STATE_ABR       502617 non-null  object

6   DEST_AIRPORT_ID        502617 non-null  int64

7   DEST_AIRPORT_SEQ_ID    502617 non-null  int64

8   DEST_CITY_MARKET_ID    502617 non-null  int64

9   DEST_STATE_ABR         502617 non-null  object

10  DEP_DELAY_NEW          492974 non-null  float64

11  DEP_DEL15              492974 non-null  float64

12  ARR_DELAY_NEW          490716 non-null  float64

13  ARR_DEL15              490716 non-null  float64

14  DISTANCE               502617 non-null  float64

15  DISTANCE_GROUP         502617 non-null  int64

16  Unnamed: 16            0 non-null       float64

dtypes: float64(6), int64(7), object(4)

memory usage: 65.2+ MB

#删除全部是缺失值的列

data.dropna(how='all',inplace=True,axis=1)

data.info()

RangeIndex: 502617 entries, 0 to 502616

Data columns (total 16 columns):

#   Column                 Non-Null Count   Dtype

---  ------                 --------------   -----

0   FL_DATE                502617 non-null  object

1   UNIQUE_CARRIER         502617 non-null  object

2   ORIGIN_AIRPORT_ID      502617 non-null  int64

3   ORIGIN_AIRPORT_SEQ_ID  502617 non-null  int64

4   ORIGIN_CITY_MARKET_ID  502617 non-null  int64

5   ORIGIN_STATE_ABR       502617 non-null  object

6   DEST_AIRPORT_ID        502617 non-null  int64

7   DEST_AIRPORT_SEQ_ID    502617 non-null  int64

8   DEST_CITY_MARKET_ID    502617 non-null  int64

9   DEST_STATE_ABR         502617 non-null  object

10  DEP_DELAY_NEW          492974 non-null  float64

11  DEP_DEL15              492974 non-null  float64

12  ARR_DELAY_NEW          490716 non-null  float64

13  ARR_DEL15              490716 non-null  float64

14  DISTANCE               502617 non-null  float64

15  DISTANCE_GROUP         502617 non-null  int64

dtypes: float64(5), int64(7), object(4)

memory usage: 61.4+ MB

#计算各列的缺失数量

data.isnull().sum()

Out[7]:

FL_DATE                      0

UNIQUE_CARRIER               0

ORIGIN_AIRPORT_ID            0

ORIGIN_AIRPORT_SEQ_ID        0

ORIGIN_CITY_MARKET_ID        0

ORIGIN_STATE_ABR             0

DEST_AIRPORT_ID              0

DEST_AIRPORT_SEQ_ID          0

DEST_CITY_MARKET_ID          0

DEST_STATE_ABR               0

DEP_DELAY_NEW             9643

DEP_DEL15                 9643

ARR_DELAY_NEW            11901

ARR_DEL15                11901

DISTANCE                     0

DISTANCE_GROUP               0

dtype: int64

#查看DEP_DEL15列(起飞是否延误15分钟以上)的前5行数据

data.DEP_DEL15.head()

Out[8]:

0    0.0

1    0.0

2    0.0

3    1.0

4    0.0

Name: DEP_DEL15, dtype: float64

#查看延误(1.0)与未延误(0.0)航班占全部航班的比例;分母len(data)包含缺失值的行,因此两个比例之和小于1

data.DEP_DEL15.value_counts()/len(data)

Out[9]:

0.0    0.731812

1.0    0.249003

Name: DEP_DEL15, dtype: float64

data.ARR_DEL15.value_counts()/len(data)

Out[10]:

0.0    0.718261

1.0    0.258061

Name: ARR_DEL15, dtype: float64

#查看每个出发地的飞机延迟数量

by_origin_state=data.groupby('ORIGIN_STATE_ABR')

departure_delay_counts=by_origin_state.DEP_DEL15.sum()

departure_delay_counts.sort_values(ascending=False)

Out[11]:

ORIGIN_STATE_ABR

TX    19016.0

CA    15976.0

IL    12174.0

FL     7862.0

GA     7799.0

CO     6042.0

NY     4732.0

NV     3995.0

AZ     3939.0

VA     3267.0

NC     3138.0

NJ     2955.0

MO     2628.0

MD     2426.0

MN     2369.0

MI     2301.0

WA     2009.0

PA     1999.0

TN     1916.0

MA     1796.0

OH     1594.0

LA     1473.0

UT     1305.0

WI     1165.0

OR     1057.0

OK      877.0

KY      861.0

IN      801.0

AR      789.0

NM      745.0

AL      641.0

SC      638.0

AK      524.0

NE      523.0

HI      496.0

PR      434.0

IA      418.0

CT      293.0

KS      265.0

MS      255.0

SD      250.0

ND      245.0

ID      218.0

MT      164.0

ME      146.0

RI      144.0

WY      123.0

VT      108.0

NH       99.0

WV       89.0

VI       46.0

DE       19.0

TT        9.0

Name: DEP_DEL15, dtype: float64

#查看目的地的飞机晚点数量

by_dest_state=data.groupby('DEST_STATE_ABR')

arrival_delay_counts=by_dest_state.ARR_DEL15.sum()

arrival_delay_counts.sort_values(ascending=False)

Out[12]:

DEST_STATE_ABR

TX    17849.0

CA    17700.0

IL    11529.0

FL     7537.0

GA     6566.0

NY     6270.0

CO     5361.0

VA     3663.0

AZ     3572.0

NV     3562.0

NJ     3076.0

NC     2961.0

MO     2724.0

MN     2434.0

WA     2412.0

PA     2403.0

MA     2303.0

MI     2300.0

TN     2085.0

OH     2036.0

MD     1924.0

LA     1603.0

WI     1546.0

UT     1412.0

OR     1407.0

OK     1100.0

KY     1049.0

IN      981.0

AR      866.0

NM      845.0

AL      803.0

HI      735.0

SC      731.0

AK      717.0

NE      670.0

IA      595.0

PR      568.0

CT      526.0

ND      389.0

KS      383.0

SD      360.0

ID      328.0

MS      317.0

RI      301.0

MT      270.0

ME      192.0

NH      189.0

WY      188.0

VT      153.0

WV       99.0

VI       73.0

DE       30.0

TT       13.0

Name: ARR_DEL15, dtype: float64

#将上面的数据合并

delay_df = pd.DataFrame([departure_delay_counts, arrival_delay_counts]).T

delay_df

Out[13]:

DEP_DEL15  ARR_DEL15

AK      524.0      717.0

AL      641.0      803.0

AR      789.0      866.0

AZ     3939.0     3572.0

CA    15976.0    17700.0

CO     6042.0     5361.0

CT      293.0      526.0

DE       19.0       30.0

FL     7862.0     7537.0

GA     7799.0     6566.0

HI      496.0      735.0

IA      418.0      595.0

ID      218.0      328.0

IL    12174.0    11529.0

IN      801.0      981.0

KS      265.0      383.0

KY      861.0     1049.0

LA     1473.0     1603.0

MA     1796.0     2303.0

MD     2426.0     1924.0

ME      146.0      192.0

MI     2301.0     2300.0

MN     2369.0     2434.0

MO     2628.0     2724.0

MS      255.0      317.0

MT      164.0      270.0

NC     3138.0     2961.0

ND      245.0      389.0

NE      523.0      670.0

NH       99.0      189.0

NJ     2955.0     3076.0

NM      745.0      845.0

NV     3995.0     3562.0

NY     4732.0     6270.0

OH     1594.0     2036.0

OK      877.0     1100.0

OR     1057.0     1407.0

PA     1999.0     2403.0

PR      434.0      568.0

RI      144.0      301.0

SC      638.0      731.0

SD      250.0      360.0

TN     1916.0     2085.0

TT        9.0       13.0

TX    19016.0    17849.0

UT     1305.0     1412.0

VA     3267.0     3663.0

VI       46.0       73.0

VT      108.0      153.0

WA     2009.0     2412.0

WI     1165.0     1546.0

WV       89.0       99.0

WY      123.0      188.0

#绘制条形图

delay_df.sort_values('DEP_DEL15', ascending=False).plot(kind='bar', title='Number of delayed flights by state')

Out[14]:

data.ORIGIN_STATE_ABR.value_counts()

Out[15]:

CA    64034

TX    61504

IL    34573

FL    34502

GA    32328

NY    22438

CO    21562

AZ    15519

VA    14122

NC    13868

NV    13339

MI    11560

WA    11315

MN    10620

NJ    10045

MA     9829

UT     9791

MO     9432

PA     9424

HI     8364

MD     7996

TN     7221

OH     6915

LA     6007

OR     5935

WI     5088

AK     3642

IN     3527

KY     3517

OK     3444

AL     2671

NM     2610

SC     2577

PR     2540

AR     2535

NE     2052

CT     1871

MT     1716

IA     1643

ID     1589

ND     1456

MS     1147

SD     1096

KS     1071

RI     1007

WY      964

ME      696

NH      601

VT      472

VI      431

WV      294

DE       76

TT       41

Name: ORIGIN_STATE_ABR, dtype: int64

#根据飞机晚点的数量求百分比

pct_departure_delay = departure_delay_counts / data.ORIGIN_STATE_ABR.value_counts()

pct_arrival_delay = arrival_delay_counts / data.DEST_STATE_ABR.value_counts()

pct_departure_delay.sort_values(ascending=False).plot(kind='bar', title='% flights with departure delays by origin state')

Out[16]:

pct_arrival_delay.sort_values(ascending=False).plot(kind='bar', color=colors[1], title='% flights with arrival delay by destination state')

Out[17]:

pct_delay_df = pd.DataFrame([pct_departure_delay, pct_arrival_delay], index=['PCT_DEP_DEL15', 'PCT_ARR_DEL15']).T

pct_delay_df.sort_values('PCT_ARR_DEL15', ascending=False).plot(kind='bar', title='Overlapping % delay plots for comparison')

Out[18]:

#### 出发州与到达州之间的延迟趋势 ####

delay_counts_df = data[['ORIGIN_STATE_ABR', 'DEST_STATE_ABR', 'ARR_DEL15']].groupby(['ORIGIN_STATE_ABR', 'DEST_STATE_ABR']).sum()

delay_counts_df.head()

Out[19]:

ARR_DEL15

ORIGIN_STATE_ABR DEST_STATE_ABR

AK               AK                  351.0

AZ                    5.0

CA                   11.0

CO                   21.0

GA                    3.0

support = (delay_counts_df / len(data))

support.head()

Out[20]:

ARR_DEL15

ORIGIN_STATE_ABR DEST_STATE_ABR

AK               AK               0.000698

AZ               0.000010

CA               0.000022

CO               0.000042

GA               0.000006

support = support.unstack()

support.head()

Out[21]:

ARR_DEL15                          ...

DEST_STATE_ABR          AK  AL        AR        AZ  ...        WA        WI  WV   WY

ORIGIN_STATE_ABR                                    ...

AK                0.000698 NaN       NaN  0.000010  ...  0.000209       NaN NaN  NaN

AL                     NaN NaN       NaN       NaN  ...       NaN       NaN NaN  NaN

AR                     NaN NaN       NaN  0.000008  ...       NaN       NaN NaN  NaN

AZ                0.000026 NaN  0.000008  0.000129  ...  0.000290  0.000062 NaN  NaN

CA                0.000056 NaN  0.000008  0.001846  ...  0.001423  0.000068 NaN  0.0

[5 rows x 53 columns]

'''

上面的support是有2级index的列(multilevel index),

我们如果想去掉外面的ARR_DEL15可以先转列到行,然后去掉index,

之后再转换回去。最终我们就可以得到一个出发州-到达州的延迟比例矩阵

'''

support = support.T.reset_index(level=0, drop=True).T

support.head()

Out[22]:

DEST_STATE_ABR          AK  AL        AR        AZ  ...        WA        WI  WV   WY

ORIGIN_STATE_ABR                                    ...

AK                0.000698 NaN       NaN  0.000010  ...  0.000209       NaN NaN  NaN

AL                     NaN NaN       NaN       NaN  ...       NaN       NaN NaN  NaN

AR                     NaN NaN       NaN  0.000008  ...       NaN       NaN NaN  NaN

AZ                0.000026 NaN  0.000008  0.000129  ...  0.000290  0.000062 NaN  NaN

CA                0.000056 NaN  0.000008  0.001846  ...  0.001423  0.000068 NaN  0.0

[5 rows x 53 columns]

def asymmatplot(plotmat, names=None, cmap="Greys", cmap_range=None, ax=None, **kwargs):
    """Draw a square matrix as a colour-mapped grid with one cell per entry.

    Adapted from the ``symmatplot()`` helper that used to live in seaborn
    (https://github.com/mwaskom/seaborn/blob/master/seaborn/linearmodels.py).
    No mask is applied here, so every cell of the matrix is drawn and
    asymmetric matrices are shown in full.

    Parameters
    ----------
    plotmat : 2-D array-like (e.g. a pandas DataFrame)
        Matrix to display. ``len(plotmat)`` is used as the number of ticks on
        both axes, so the matrix is expected to be square.
    names : sequence of str, optional
        Tick labels, applied to both the x and the y axis. When ``None`` the
        default tick labels are left untouched.
    cmap : str or Colormap
        Colormap handed to ``Axes.matshow``.
    cmap_range : (vmin, vmax) pair, optional
        Explicit colour limits. When ``None`` the limits are derived from the
        NaN-ignoring minimum and maximum of ``plotmat``, each scaled by 1.15.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to the current pyplot axes.
    **kwargs
        Forwarded unchanged to ``Axes.matshow``.

    Returns
    -------
    The Axes that was drawn on.

    Raises
    ------
    ValueError
        If ``cmap_range`` is given but does not have exactly two elements.
    """
    if ax is None:
        ax = plt.gca()

    nvars = len(plotmat)

    if cmap_range is None:
        # NOTE(review): scaling by 1.15 widens the range only for a negative
        # minimum; for a positive minimum it raises vmin above the smallest
        # value. Kept as in the original so existing plots do not change.
        vmax = np.nanmax(plotmat) * 1.15
        vmin = np.nanmin(plotmat) * 1.15
    elif len(cmap_range) == 2:
        vmin, vmax = cmap_range
    else:
        raise ValueError("cmap_range argument not understood")

    mat_img = ax.matshow(plotmat, cmap=cmap, vmin=vmin, vmax=vmax, **kwargs)
    # Attach the colorbar to the axes we actually drew on instead of relying
    # on pyplot's notion of the "current" axes.
    ax.figure.colorbar(mat_img, ax=ax, shrink=.75)

    ax.xaxis.set_ticks_position("bottom")

    # Fix the major tick positions first, then label them: labels assigned
    # before the positions exist can end up attached to the wrong ticks.
    major_ticks = np.linspace(0, nvars - 1, nvars)
    ax.set_xticks(major_ticks)
    ax.set_yticks(major_ticks)
    if names is not None:
        ax.set_xticklabels(names, rotation=90)
        ax.set_yticklabels(names)

    # Minor ticks sit on the cell boundaries and carry the grid lines.
    # ``minor`` must be passed by keyword: the second positional parameter of
    # set_xticks/set_yticks is ``labels`` in current Matplotlib.
    minor_ticks = np.linspace(-.5, nvars - 1.5, nvars)
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_yticks(minor_ticks, minor=True)

    ax.grid(False, which="major")
    ax.grid(True, which="minor", linestyle="-")

    return ax

fig, ax = plt.subplots(figsize=(18,18))

asymmatplot(support, names=support.columns, ax=ax, cmap='OrRd')

Out[24]:

## Delay ratio per (origin state, destination state) pair
route_keys = ['ORIGIN_STATE_ABR', 'DEST_STATE_ABR']

# Counting FL_DATE per group gives the number of flights on each route
# (data.info() above reports no missing values in FL_DATE).
trip_counts_df = (
    data[route_keys + ['FL_DATE']]
    .groupby(route_keys)
    .count()
    .rename(columns={'FL_DATE': 'COUNTS'})
)

# Give the delay totals the same column name so the division below lines
# the two frames up on both index and column.
delay_counts_df = delay_counts_df.rename(columns={'ARR_DEL15': 'COUNTS'})

## Divide the number of delayed flights by the total number of flights
delay_ratio = delay_counts_df / trip_counts_df

# Selecting the single COUNTS column before unstacking leaves plain
# DEST_STATE_ABR column labels, with no outer column level to strip.
mat = delay_ratio['COUNTS'].unstack()

fig, ax = plt.subplots(figsize=(18, 18))

asymmatplot(mat, names=mat.columns, ax=ax, cmap='OrRd', cmap_range=(0., 1.0))

Out[28]:

#### Distribution of arrival delays per day ####

# One box per flight date on the x axis, arrival delay on the y axis.
# x and y are passed by keyword: current seaborn no longer accepts them
# positionally, and the original positional order put the delay on x and
# the date on y, which does not match the wide figure or the
# fig.autofmt_xdate() call that formats the x tick labels.
fig, ax = plt.subplots(figsize=(18, 10))

sns.boxplot(x=data.FL_DATE, y=data.ARR_DELAY_NEW, ax=ax)

fig.autofmt_xdate()

# Same plot with showfliers=False, which hides the outlier points so the
# boxes themselves are easier to compare.
fig, ax = plt.subplots(figsize=(18, 10))

sns.boxplot(x=data.FL_DATE, y=data.ARR_DELAY_NEW, ax=ax, showfliers=False)

fig.autofmt_xdate()

你可能感兴趣的:(python飞机)