from sklearn.cluster import KMeans #导入K均值聚类算法
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
# Directory that contains the data file. Leave it empty to keep the current
# working directory: os.chdir('') raises FileNotFoundError, so the call is
# only made when a directory has actually been configured.
data_dir = r''
if data_dir:
    os.chdir(data_dir)

datafile = r'air_data-utf8.csv'  # raw data file
# Read the raw data; the first CSV row is used as the column header.
data = pd.read_csv(datafile, encoding='utf-8', header=0)
print(data.columns)
# Output:
# Index(['MEMBER_NO', 'FFP_DATE', 'FIRST_FLIGHT_DATE', 'GENDER', 'FFP_TIER',
# 'WORK_CITY', 'WORK_PROVINCE', 'WORK_COUNTRY', 'AGE', 'LOAD_TIME',
# 'FLIGHT_COUNT', 'BP_SUM', 'EP_SUM_YR_1', 'EP_SUM_YR_2', 'SUM_YR_1',
# 'SUM_YR_2', 'SEG_KM_SUM', 'WEIGHTED_SEG_KM', 'LAST_FLIGHT_DATE',
# 'AVG_FLIGHT_COUNT', 'AVG_BP_SUM', 'BEGIN_TO_FIRST', 'LAST_TO_END',
# 'AVG_INTERVAL', 'MAX_INTERVAL', 'ADD_POINTS_SUM_YR_1',
# 'ADD_POINTS_SUM_YR_2', 'EXCHANGE_COUNT', 'avg_discount',
# 'P1Y_Flight_Count', 'L1Y_Flight_Count', 'P1Y_BP_SUM', 'L1Y_BP_SUM',
# 'EP_SUM', 'ADD_Point_SUM', 'Eli_Add_Point_Sum', 'L1Y_ELi_Add_Points',
# 'Points_Sum', 'L1Y_Points_Sum', 'Ration_L1Y_Flight_Count',
# 'Ration_P1Y_Flight_Count', 'Ration_P1Y_BPS', 'Ration_L1Y_BPS',
# 'Point_NotFlight'],
# dtype='object')
# Preview the first five rows of the loaded table as a sanity check.
print(data.head(n=5))
# Output:
# MEMBER_NO FFP_DATE FIRST_FLIGHT_DATE GENDER FFP_TIER WORK_CITY \
# 0 54993 2006/11/02 2008/12/24 男 6 .
# 1 28065 2007/02/19 2007/08/03 男 6 NaN
# 2 55106 2007/02/01 2007/08/30 男 6 .
# 3 21189 2008/08/22 2008/08/23 男 5 Los Angeles
# 4 39546 2009/04/10 2009/04/15 男 6 贵阳
# WORK_PROVINCE WORK_COUNTRY AGE LOAD_TIME ... \
# 0 北京 CN 31.0 2014/03/31 ...
# 1 北京 CN 42.0 2014/03/31 ...
# 2 北京 CN 40.0 2014/03/31 ...
# 3 CA US 64.0 2014/03/31 ...
# 4 贵州 CN 48.0 2014/03/31 ...
# ADD_Point_SUM Eli_Add_Point_Sum L1Y_ELi_Add_Points Points_Sum \
# 0 39992 114452 111100 619760
# 1 12000 53288 53288 415768
# 2 15491 55202 51711 406361
# 3 0 34890 34890 372204
# 4 22704 64969 64969 338813
# L1Y_Points_Sum Ration_L1Y_Flight_Count Ration_P1Y_Flight_Count \
# 0 370211 0.509524 0.490476
# 1 238410 0.514286 0.485714
# 2 233798 0.518519 0.481481
# 3 186100 0.434783 0.565217
# 4 210365 0.532895 0.467105
# Ration_P1Y_BPS Ration_L1Y_BPS Point_NotFlight
# 0 0.487221 0.512777 50
# 1 0.489289 0.510708 33
# 2 0.481467 0.518530 26
# 3 0.551722 0.448275 12
# 4 0.469054 0.530943 39
# [5 rows x 44 columns]
# Keep only the rows where both yearly fare columns are present.
data = data[data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()]

# Keep a row when either yearly fare is non-zero, OR when the average
# discount and the total flown distance are both zero.
index1 = data['SUM_YR_1'] != 0
index2 = data['SUM_YR_2'] != 0
index3 = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)  # "and" rule
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original table (avoids SettingWithCopyWarning).
data = data[index1 | index2 | index3].copy()  # "or" rule

# Columns used to build the clustering features:
#   LOAD_TIME    - observation window
#   FFP_DATE     - membership date
#   LAST_TO_END  - time from the last flight to the observation window
#   FLIGHT_COUNT - accumulated number of flights
#   SEG_KM_SUM   - accumulated flown distance
data['LOAD_TIME'] = pd.to_datetime(data['LOAD_TIME'])
data['FFP_DATE'] = pd.to_datetime(data['FFP_DATE'])

# Time between the membership date and the observation window, first as a
# timedelta and then as its underlying integer representation. The absolute
# unit is irrelevant once the column is standardised.
data['Lassociator'] = data['LOAD_TIME'] - data['FFP_DATE']
data['Lassociator'] = pd.to_numeric(data['Lassociator'])

data_selected = data[['Lassociator', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM']]
# A bare expression displays nothing when run as a script, so print it.
print(data_selected.dtypes)
# Output:
# Lassociator int64
# LAST_TO_END int64
# FLIGHT_COUNT int64
# SEG_KM_SUM int64
# dtype: object
# Z-score standardisation: subtract each column's mean and divide by its
# standard deviation so all features are on a comparable scale.
data_mean = data_selected.mean(axis=0)
data_std = data_selected.std(axis=0)
data_std_scale = (data_selected - data_mean) / data_std
# Any NaN left after scaling is replaced by 0 so KMeans receives a fully
# numeric matrix.
data_std_scale = data_std_scale.replace(np.nan, 0)

k = 3  # number of clusters
iteration = 1000  # maximum number of clustering iterations

# `n_jobs` is no longer accepted by KMeans (deprecated in scikit-learn 0.23,
# removed in 1.0), so it is not passed. `iteration` was declared but never
# used; it is now supplied as `max_iter`. `n_init=10` is set explicitly to
# match the value shown in the recorded run.
kmodel = KMeans(n_clusters=k, max_iter=iteration, n_init=10)
kmodel.fit(data_std_scale)  # train the model
# Output (as recorded):
# KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
# n_clusters=3, n_init=10, n_jobs=8, precompute_distances='auto',
# random_state=None, tol=0.0001, verbose=0)
# Per-cluster summary: the cluster centres (in standardised units) next to
# the number of samples assigned to each cluster. concat aligns both pieces
# on the cluster label, so the value_counts ordering does not matter.
r1 = pd.Series(kmodel.labels_).value_counts()  # samples per cluster
r2 = pd.DataFrame(kmodel.cluster_centers_)  # cluster centres
# Stored under its own name: previously this summary was assigned to `r` and
# immediately overwritten below without ever being shown.
cluster_summary = pd.concat([r2, r1], axis=1)
cluster_summary.columns = list(data_std_scale.columns) + [u'类别数目']
print(cluster_summary)

# Per-sample result: the standardised features plus the assigned cluster
# label. The label Series reuses the feature frame's index so rows line up.
r = pd.concat([data_std_scale, pd.Series(kmodel.labels_, index=data_std_scale.index)], axis=1)
r.columns = list(data_std_scale.columns) + [u'聚类类别']  # rename the header
print(r.head())
# Output:
# Lassociator LAST_TO_END FLIGHT_COUNT SEG_KM_SUM 聚类类别
# 0 1.435707 -0.944948 14.034016 26.761154 2
# 1 1.307152 -0.911894 9.073213 13.126864 2
# 2 1.328381 -0.889859 8.718869 12.653481 2
# 3 0.658476 -0.416098 0.781585 12.540622 2
# 4 0.386032 -0.922912 9.923636 13.898736 2
# Draw one row of histograms per cluster, one panel per standardised feature.
# The loop body was not indented in the original, which is a syntax error.
# NOTE(review): the original nesting is not recoverable; legend/show are
# placed inside the loop so each cluster's figure is rendered in turn.
for i in range(k):
    # Use kind='kde' instead of kind='hist' for density curves.
    data_std_scale[r[u'聚类类别'] == i].plot(
        kind='hist',
        linewidth=2,
        subplots=True,
        sharex=False,
        layout=(1, data_std_scale.shape[1]),
        figsize=(16, 2),
    )
    plt.legend()
    plt.show()
# nrsy
# 数据