# K-means clustering (airline customer analysis)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Missing-value and outlier handling

data = pd.read_csv(r'air_data.csv', encoding='gb18030')
print(data.shape)

# Drop every row whose fare is missing in either of the two fare columns.
data = data.dropna(subset=['SUM_YR_1', 'SUM_YR_2'], how='any', axis=0)
print(data.shape)

# Drop every row whose fare is zero in both fare columns.
zero_fare = (data['SUM_YR_1'] == 0) & (data['SUM_YR_2'] == 0)
data = data.drop(index=data.index[zero_fare])
print(data.shape)

# Shapes printed by the three print() calls in the original author's run:
#   (62988, 44)
#   (62299, 44)
#   (62044, 44)



# 2. Derive the five LRFMC features

# (1) L: months between the membership start date and the end of the
#     observation window, i.e. L = LOAD_TIME - FFP_DATE with a month
#     approximated as 30 days.
# normalize() drops any time-of-day component while keeping a datetime64
# dtype, so the subtraction below is vectorised and yields a proper
# timedelta64 series (no round trip through Python ``date`` objects).
FFP_DATE = pd.to_datetime(data['FFP_DATE']).dt.normalize()
LOAD_TIME = pd.to_datetime(data['LOAD_TIME']).dt.normalize()
# Whole days floor-divided by 30 gives whole months.
# Sample from the original author's run (first rows): 90, 86, 87
data['L'] = (LOAD_TIME - FFP_DATE).dt.days // 30

# (2) R: months between the customer's most recent flight and the end of
#     the observation window, taken from LAST_TO_END (presumably recorded
#     in days -- confirm against the data dictionary).
# Floor division keeps whole months only. The original author's samples for
# the first four rows:
#   LAST_TO_END / 30   -> 0.033333, 0.233333, 0.366667, 3.233333
#   LAST_TO_END // 30  -> 0, 0, 0, 3
data['R'] = data['LAST_TO_END'] // 30

# (3) F: number of flights taken within the observation window.
data['F'] = data['FLIGHT_COUNT']

# (4) M: total distance flown within the observation window (kilometres).
data['M'] = data['SEG_KM_SUM']

# (5) C: average discount rate of the cabins flown (unitless).
data['C'] = data['avg_discount']



#三.数据标准化
def data_scal(data):
    """Min-max scale the five LRFMC feature columns.

    For each of the columns ``L``, ``R``, ``F``, ``M`` and ``C`` a new column
    named ``<feature>标`` is added, holding ``(x - min) / (max - min)``.

    A constant column has ``max == min``, which would otherwise evaluate
    ``0 / 0`` and fill the scaled column with NaN. Such a column is scaled
    to 0.0 instead; missing input values stay NaN.

    Args:
        data: DataFrame containing numeric columns ``L``, ``R``, ``F``,
            ``M`` and ``C``.

    Returns:
        The same DataFrame object, modified in place with the five scaled
        columns added (in the order L, R, F, M, C).
    """
    for feature in ('L', 'R', 'F', 'M', 'C'):
        column = data[feature]
        low = column.min()
        span = column.max() - low
        shifted = column - low
        if span == 0:
            # Multiplying (rather than assigning 0.0) preserves NaN entries.
            data[feature + '标'] = shifted * 0.0
        else:
            data[feature + '标'] = shifted / span
    return data
data=data_scal(data)


# 4. K-means clustering on the scaled features
from sklearn.cluster import KMeans
x=data[['L标','R标','F标','M标','C标']]
# random_state fixes the centroid initialisation so cluster numbering and the
# printed centres are reproducible between runs. n_init is pinned explicitly
# because its library default differs between scikit-learn releases.
kms=KMeans(n_clusters=5,n_init=10,random_state=0)
y=kms.fit_predict(x)
data['index1']=y  # cluster label assigned to each row
# Mean of every scaled feature per cluster, i.e. the cluster centres.
center=data[['L标','R标','F标','M标','C标','index1']].groupby(by='index1').mean()
# Repeat the first feature as a trailing column so that a row of `center`
# can be drawn as a closed polygon.
center['L标2']=center['L标']
print(center)
# Sample output from the original author's (unseeded) run. Cluster numbering
# and values may differ from a seeded run:
#
#               L标        R标        F标        M标        C标       L标2
# index1
# 0       0.155161  0.615177  0.010573  0.011052  0.420774  0.155161
# 1       0.775075  0.083877  0.078814  0.045059  0.450950  0.775075
# 2       0.125223  0.106254  0.046397  0.028963  0.418198  0.125223
# 3       0.440348  0.097372  0.060490  0.035447  0.435148  0.440348
# 4       0.640012  0.598472  0.012530  0.011577  0.432559  0.640012


# Draw a radar chart with one polygon per cluster centre.
plt.rcParams['font.sans-serif'] = 'SimHei'  # SimHei font, so the Chinese tick labels can be rendered
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign displaying correctly
plt.figure()
labels = ['L标','R标','F标','M标','C标']
dataLength=len(labels)
# One spoke per feature, evenly spaced around the circle.
angles=np.linspace(0,2*np.pi,dataLength,endpoint=False)
# Append the first angle again so the outline returns to its starting point.
angles2=np.concatenate((angles,angles[:1]))
# Select exactly the labelled feature columns and close each polygon here,
# so the plot does not depend on how many clusters or helper columns
# `center` happens to contain.
for values in center[labels].values:
    closed=np.concatenate((values,values[:1]))
    plt.polar(angles2,closed)
    plt.fill(angles2,closed,alpha=0.25)  # translucent fill of the polygon
plt.xticks(angles,labels)
plt.show()

# You may also be interested in: data analysis