import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Select the SimHei font so the Chinese labels used below can be rendered,
# and turn off the Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the raw member table and print its column / dtype / non-null summary.
df = pd.read_csv('C:/Users/xh/Desktop/air_data.csv')
df.info()
# Print the name and null count of every column that has missing values.
columns = df.columns
# Count nulls once for the whole frame instead of rescanning each column twice.
null_counts = df.isnull().sum()
for i in columns:
    if null_counts[i] != 0:
        print(i, null_counts[i])
# Make province and city agree for 北京 / 上海: first copy the province name
# into WORK_CITY, then copy the city name back into WORK_PROVINCE.
for municipality in ('北京', '上海'):
    df.loc[df['WORK_PROVINCE'] == municipality, 'WORK_CITY'] = municipality
for municipality in ('北京', '上海'):
    # The original wrote to the misspelled 'WORK_PROCINCE', which silently
    # appended a new column and left WORK_PROVINCE untouched.
    df.loc[df['WORK_CITY'] == municipality, 'WORK_PROVINCE'] = municipality

# Remaining rows whose city is still unknown.
df['WORK_CITY'].isnull().sum()

# NOTE: the former `df = df.iloc[:, :-1]` existed only to drop the stray
# misspelled column; with the typo fixed it would drop a real data column,
# so it has been removed.
# Summary statistics of the table (shown when this is the last line of a cell).
df.describe()

# Parse FFP_DATE as datetimes and derive MONTH, the same dates truncated to
# month resolution, for the per-month grouping below.
df['FFP_DATE'] = pd.to_datetime(df['FFP_DATE'])
ffp_dates = df['FFP_DATE'].values
df['MONTH'] = ffp_dates.astype('datetime64[M]')

# Bucket AGE into labelled intervals (pd.cut intervals are right-closed by
# default, so an age of exactly 10 falls in the first bucket).
bins = [0, 10, 20, 30, 40, 50, 60, 110]
age_labels = ['10岁以下', '10-20', '20-30', '30-40', '40-50', '50-60', '60+']
df['AGE_CUT'] = pd.cut(df['AGE'], bins=bins, labels=age_labels)
# 2x2 overview of member counts: by MONTH (line), then by GENDER, AGE_CUT and
# FFP_TIER (bars).
plt.figure(figsize=(12, 8))
panels = (
    (221, 'MONTH', 'line'),
    (222, 'GENDER', 'bar'),
    (223, 'AGE_CUT', 'bar'),
    (224, 'FFP_TIER', 'bar'),
)
for position, group_key, chart_kind in panels:
    plt.subplot(position)
    member_counts = df.groupby(group_key).MEMBER_NO.count()
    member_counts.plot(kind=chart_kind)

# Fraction of members with a known WORK_COUNTRY whose value is 'CN'.
is_cn = df['WORK_COUNTRY'] == 'CN'
is_cn.sum() / df['WORK_COUNTRY'].count()
# Keep only the columns needed for the per-tier analysis and drop rows
# with any of them missing.
member = df[['MEMBER_NO', 'FFP_TIER', 'SUM_YR_1', 'SUM_YR_2']].dropna()

# One row of scatter plots (one facet per FFP_TIER) for each of the two
# SUM_YR_* columns, plotted against MEMBER_NO.
for yearly_sum in ('SUM_YR_1', 'SUM_YR_2'):
    g = sns.FacetGrid(member, col='FFP_TIER')
    g.map(plt.scatter, 'MEMBER_NO', yearly_sum)
# Histogram of positive SUM_YR_2 values, one panel per FFP_TIER (4, 5, 6).
plt.figure(figsize=(5, 5))
for position, tier in ((131, 4), (132, 5), (133, 6)):
    plt.subplot(position)
    # Both conditions must hold. The original combined them with `|`, which
    # selected "this tier OR any positive value" and therefore drew nearly the
    # same population in all three panels.
    in_tier_with_spend = (member['FFP_TIER'] == tier) & (member['SUM_YR_2'] > 0)
    member.loc[in_tier_with_spend, 'SUM_YR_2'].hist(bins=50)
# Per-tier averages of FLIGHT_COUNT and AVG_INTERVAL.
df.groupby('FFP_TIER')['FLIGHT_COUNT'].mean()
df.groupby('FFP_TIER')['AVG_INTERVAL'].mean()
# Overall AVG_INTERVAL mean as the baseline for the per-tier values above.
# The original read `data1`, which is not defined until later in the file
# (NameError on a top-to-bottom run); `df` is the frame the per-tier means use.
df['AVG_INTERVAL'].mean()
# Tiers ordered by how many members they contain (most populous first).
ffp = member['FFP_TIER'].value_counts().index

# Share of members that belongs to each tier.
member_total = member['FFP_TIER'].count()
for tier in ffp:
    tier_rows = member[member['FFP_TIER'] == tier]
    print(tier_rows['FFP_TIER'].count() / member_total)

# Share of the SUM_YR_2 total that each tier contributes.
sum_yr_2_total = member['SUM_YR_2'].sum()
for tier in ffp:
    tier_rows = member[member['FFP_TIER'] == tier]
    print(tier_rows['SUM_YR_2'].sum() / sum_yr_2_total)
# Total SUM_YR_2 per member, sorted ascending, with a running cumulative sum.
per_member_sum = member.groupby('MEMBER_NO')['SUM_YR_2'].sum()
memb = per_member_sum.sort_values().reset_index()
memb['CUMSUM'] = memb['SUM_YR_2'].cumsum()
memb.tail()

# Normalise the cumulative sum by its maximum and plot the cumulative share
# curve (x axis = member rank after sorting).
total = max(memb['CUMSUM'])
memb['P'] = memb['CUMSUM'] / total
memb['P'].plot()
# Lower-triangle correlation heatmap over the fully populated rows.
sns.set_style('darkgrid')
plt.figure(figsize=(25, 15))
data = df.dropna()
# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns (dates, categories, strings) silently in corr().
column = data.select_dtypes(include='number').columns.tolist()
corr = data[column].corr()
# Mask the upper triangle (diagonal included) so each pair is shown once.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the replacement.
zero = np.zeros_like(corr, dtype=bool)
zero[np.triu_indices_from(zero)] = True
sns.heatmap(corr, mask=zero, square=True, annot=True, fmt='.2f')
# Build the five clustering features L, R, F, M, C and standardise them.

# Average month length in days (365.2425 / 12). pandas >= 2.0 rejects the
# ambiguous 'M' timedelta unit, so the month conversion is done explicitly
# with the same factor older pandas used for that unit.
DAYS_PER_MONTH = 365.2425 / 12

# Keep only rows with a known, positive SUM.
data1 = df.dropna(subset=['SUM'])
data1 = data1[data1['SUM'] > 0]

# Work on an explicit copy so the column assignments below do not write into
# a view of data1 (SettingWithCopyWarning).
data = data1[['LOAD_TIME', 'FFP_DATE', 'LAST_TO_END', 'FLIGHT_COUNT',
              'SEG_KM_SUM', 'avg_discount']].copy()
data['LOAD_TIME'] = pd.to_datetime(data['LOAD_TIME'])

# L: months between FFP_DATE and LOAD_TIME.
elapsed_days = (data['LOAD_TIME'] - data['FFP_DATE']) / np.timedelta64(1, 'D')
data['L'] = elapsed_days / DAYS_PER_MONTH
data['R'] = data['LAST_TO_END']
data['F'] = data['FLIGHT_COUNT']
data['M'] = data['SEG_KM_SUM']
data['C'] = data['avg_discount']

# Select the feature columns by name rather than by position.
da = data[['L', 'R', 'F', 'M', 'C']].copy()
# Round to 2 decimals, then truncate to whole months.
da['L'] = da['L'].round(2).astype('int')
# Z-score standardisation per column, then hand a plain ndarray to sklearn.
da = (da - da.mean(axis=0)) / da.std(axis=0)
da = np.array(da)
from sklearn.cluster import KMeans

# Fit k-means on the standardised feature matrix `da`.
k = 5
kmean = KMeans(n_clusters=k)
kmean.fit(da)
kmean.cluster_centers_  # inspect the cluster centres
print(kmean.labels_)

# Number of rows per cluster, indexed by cluster id. The original applied
# value_counts twice (plotting counts of counts), passed the invalid keyword
# `colors=`, and referenced `color` before it was defined.
label = pd.Series(kmean.labels_).value_counts().sort_index()
color = ['r', 'g', 'b', 'c', 'y']
# Draw on a fresh figure so the bars do not land on the previous plot's axes.
plt.figure()
label.plot(kind='bar', color=color)
# Plot each cluster centre as a line across the five features; the legend
# entry shows the cluster id followed by label[i].
k = 5
center = kmean.cluster_centers_
color = ['r', 'g', 'b', 'c', 'y']
x = [1, 2, 3, 4, 5]
for i, line_color in zip(range(5), color):
    legend_text = 'cluster' + str(i) + ' ' + str(label[i])
    plt.plot(x, center[i], label=legend_text, color=line_color, marker='o')
plt.xlabel('LRFMC')
plt.legend()
plt.show()