# airline: exploratory analysis and K-means segmentation of airline member data

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline

# Plot configuration: select the SimHei font (the category labels used later
# are Chinese) and turn off the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the raw member table and print its column/dtype summary.
df = pd.read_csv('C:/Users/xh/Desktop/air_data.csv')
df.info()

# Report every column that has at least one missing value, with its count.
columns = df.columns
for i in columns:
    missing = df[i].isnull().sum()
    if missing != 0:
        print(i, missing)

# 北京 (Beijing) and 上海 (Shanghai) are used here as both a province value and
# a city value, so a record naming one of them in either field gets the other
# field filled in to match.
municipalities = ('北京', '上海')

# Province known -> fill the city.
for municipality in municipalities:
    df.loc[df['WORK_PROVINCE'] == municipality, 'WORK_CITY'] = municipality

# City known -> fill the province.
# The original wrote to a misspelled 'WORK_PROCINCE' column, which silently
# created a new trailing column instead of updating WORK_PROVINCE, and then
# dropped that stray column with df.iloc[:, :-1]. With the name corrected no
# stray column is created, so the positional drop is removed (it would now
# discard a real data column).
for municipality in municipalities:
    df.loc[df['WORK_CITY'] == municipality, 'WORK_PROVINCE'] = municipality

# Remaining number of records without a city (notebook-style inspection).
df['WORK_CITY'].isnull().sum()

# Summary statistics of the numeric columns (notebook-style inspection).
df.describe()

# Parse the enrolment date and derive a month-resolution copy of it, used as
# the grouping key for the enrolment trend plot.
enrol_date = pd.to_datetime(df['FFP_DATE'])
df['FFP_DATE'] = enrol_date
df['MONTH'] = enrol_date.values.astype('datetime64[M]')

# Bucket AGE into labelled bands; pd.cut's default makes each band
# right-inclusive, e.g. (0, 10].
bins = [0, 10, 20, 30, 40, 50, 60, 110]
age_labels = ['10岁以下', '10-20', '20-30', '30-40', '40-50', '50-60', '60+']
df['AGE_CUT'] = pd.cut(df['AGE'], bins=bins, labels=age_labels)

# 2x2 overview of member counts: by enrolment month (line), and by gender,
# age band and membership tier (bars).
plt.figure(figsize=(12, 8))
panels = (
    (221, 'MONTH', 'line'),
    (222, 'GENDER', 'bar'),
    (223, 'AGE_CUT', 'bar'),
    (224, 'FFP_TIER', 'bar'),
)
for position, key, kind in panels:
    plt.subplot(position)
    df.groupby(key)['MEMBER_NO'].count().plot(kind=kind)

# Fraction of records with a known WORK_COUNTRY whose value is 'CN'
# (notebook-style inspection).
(df['WORK_COUNTRY'] == 'CN').sum() / df['WORK_COUNTRY'].count()

# Per-member tier and the two yearly SUM_YR_* columns; rows missing any of
# these are dropped.
member = df[['MEMBER_NO', 'FFP_TIER', 'SUM_YR_1', 'SUM_YR_2']].dropna()

# One scatter panel per tier, for each of the two yearly columns.
for revenue_column in ('SUM_YR_1', 'SUM_YR_2'):
    g = sns.FacetGrid(member, col='FFP_TIER')
    g.map(plt.scatter, 'MEMBER_NO', revenue_column)

# Histogram of positive SUM_YR_2 values, one panel per tier.
# The original combined the two conditions with `|`, which selected every
# member with positive SUM_YR_2 regardless of tier and made the three panels
# nearly identical; `&` restricts each panel to its own tier.
plt.figure(figsize=(5, 5))
for position, tier in zip((131, 132, 133), (4, 5, 6)):
    plt.subplot(position)
    in_tier_with_revenue = (member['FFP_TIER'] == tier) & (member['SUM_YR_2'] > 0)
    member.loc[in_tier_with_revenue, 'SUM_YR_2'].hist(bins=50)

# Mean flight count and mean AVG_INTERVAL per membership tier
# (notebook-style inspections).
df.groupby('FFP_TIER').FLIGHT_COUNT.mean()
df.groupby('FFP_TIER').AVG_INTERVAL.mean()

# Overall mean AVG_INTERVAL among members with a positive SUM.
# The original read this from `data1`, which is only created further down the
# script, so a top-to-bottom run failed with NameError. The same row filter
# (SUM present and > 0) is applied to df directly instead.
df.loc[df['SUM'] > 0, 'AVG_INTERVAL'].mean()

# Tier values, ordered from most to least frequent.
ffp = member['FFP_TIER'].value_counts().index

# Share of members that belongs to each tier.
member_total = member['FFP_TIER'].count()
for i in ffp:
    in_tier = member['FFP_TIER'] == i
    print(member.loc[in_tier, 'FFP_TIER'].count() / member_total)

# Share of the summed SUM_YR_2 that comes from each tier.
revenue_total = member['SUM_YR_2'].sum()
for i in ffp:
    in_tier = member['FFP_TIER'] == i
    print(member.loc[in_tier, 'SUM_YR_2'].sum() / revenue_total)

# Per-member SUM_YR_2 total, sorted ascending, with a running total.
memb = member.groupby('MEMBER_NO')['SUM_YR_2'].sum().sort_values().reset_index()
memb['CUMSUM'] = memb['SUM_YR_2'].cumsum()

# Last rows of the running total (notebook-style inspection).
memb.tail()

# Normalise the running total by its maximum. Series.max() replaces the
# builtin max(), which iterated the Series element by element in Python.
total = memb['CUMSUM'].max()
memb['P'] = memb['CUMSUM'] / total

# Draw on a fresh figure: without it, a top-to-bottom run draws this curve
# onto whichever axes were used last.
plt.figure()
memb.P.plot()

# Correlation heatmap over the fully populated rows.
sns.set_style('darkgrid')
plt.figure(figsize=(25, 15))

data = df.dropna()

# Restrict to numeric columns explicitly: recent pandas versions raise when
# corr() is given string/datetime columns instead of silently skipping them.
column = data.select_dtypes(include='number').columns.tolist()
corr = data[column].corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` replaces `np.bool`, an alias that was removed from NumPy.
zero = np.zeros_like(corr, dtype=bool)
zero[np.triu_indices_from(zero)] = True

sns.heatmap(corr, mask=zero, square=True, annot=True, fmt='.2f')

# Keep only members that have a recorded, positive SUM.
data1 = df.dropna(subset=['SUM'])
data1 = data1[data1['SUM'] > 0]

# Take an explicit copy so the column assignments below act on an independent
# frame rather than on a slice of data1 (avoids SettingWithCopyWarning).
data = data1[['LOAD_TIME', 'FFP_DATE', 'LAST_TO_END', 'FLIGHT_COUNT',
              'SEG_KM_SUM', 'avg_discount']].copy()
data['LOAD_TIME'] = pd.to_datetime(data['LOAD_TIME'])

# L: months between FFP_DATE and LOAD_TIME. Dividing by np.timedelta64(1, 'M')
# is not supported by current pandas, so the month length is spelled out as
# 30.436875 days (365.2425 / 12), the value that unit used to stand for.
data['L'] = (data['LOAD_TIME'] - data['FFP_DATE']) / pd.Timedelta(days=30.436875)
data['R'] = data['LAST_TO_END']
data['F'] = data['FLIGHT_COUNT']
data['M'] = data['SEG_KM_SUM']
data['C'] = data['avg_discount']

# Select the five model features by name instead of by column position.
da = data[['L', 'R', 'F', 'M', 'C']].copy()

# Round L to two decimals, then truncate it to whole months.
da['L'] = round(da['L'], 2)
da['L'] = da['L'].astype('int')

# Standardise every feature to zero mean and unit (sample) standard deviation,
# then hand a plain array to the clustering step.
da = (da - da.mean(axis=0)) / da.std(axis=0)
da = np.array(da)

from sklearn.cluster import KMeans

k = 5
# One colour per cluster; defined before first use (the original referenced
# it in the bar plot several lines before assigning it).
color = ['r', 'g', 'b', 'c', 'y']

# Fixed random_state makes the cluster numbering and centres reproducible
# between runs; n_init is set explicitly rather than left to the
# version-dependent default.
kmean = KMeans(n_clusters=k, n_init=10, random_state=0)
kmean.fit(da)

kmean.cluster_centers_  # inspect the cluster centres (notebook-style)
print(kmean.labels_)

# Number of members per cluster, indexed by cluster id. The original applied
# value_counts twice (yielding counts of counts) and passed the invalid
# keyword `colors=` to the bar plot.
label = pd.Series(kmean.labels_).value_counts().sort_index()

plt.figure()
label.plot(kind='bar', color=color)

# One line per cluster centre across the feature axes, on its own figure so
# it is not drawn over the bar chart.
center = kmean.cluster_centers_
x = list(range(1, center.shape[1] + 1))

plt.figure()
for i in range(k):
    plt.plot(x, center[i], label='cluster'+str(i)+'  '+str(label[i]),
             color=color[i], marker='o')
plt.xlabel('LRFMC')
plt.legend()
plt.show()

# You may also be interested in: (airline)