Python聚类分析

Python聚类分析

导入类模块

from sklearn.cluster import KMeans #导入K均值聚类算法
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
# Directory that holds the data file. It is left empty in this article, in
# which case the current working directory is used as-is: calling
# os.chdir('') would raise FileNotFoundError.
work_dir = r''
if work_dir:
    os.chdir(work_dir)
datafile = r'air_data-utf8.csv'  # raw data file

读数据

# Load the raw records; row 0 of the CSV supplies the column names.
data = pd.read_csv(
    datafile,
    header=0,
    encoding='utf-8',
)

查看数据列

# Show the column labels of the loaded DataFrame.
print(data.columns)
Index(['MEMBER_NO', 'FFP_DATE', 'FIRST_FLIGHT_DATE', 'GENDER', 'FFP_TIER',
       'WORK_CITY', 'WORK_PROVINCE', 'WORK_COUNTRY', 'AGE', 'LOAD_TIME',
       'FLIGHT_COUNT', 'BP_SUM', 'EP_SUM_YR_1', 'EP_SUM_YR_2', 'SUM_YR_1',
       'SUM_YR_2', 'SEG_KM_SUM', 'WEIGHTED_SEG_KM', 'LAST_FLIGHT_DATE',
       'AVG_FLIGHT_COUNT', 'AVG_BP_SUM', 'BEGIN_TO_FIRST', 'LAST_TO_END',
       'AVG_INTERVAL', 'MAX_INTERVAL', 'ADD_POINTS_SUM_YR_1',
       'ADD_POINTS_SUM_YR_2', 'EXCHANGE_COUNT', 'avg_discount',
       'P1Y_Flight_Count', 'L1Y_Flight_Count', 'P1Y_BP_SUM', 'L1Y_BP_SUM',
       'EP_SUM', 'ADD_Point_SUM', 'Eli_Add_Point_Sum', 'L1Y_ELi_Add_Points',
       'Points_Sum', 'L1Y_Points_Sum', 'Ration_L1Y_Flight_Count',
       'Ration_P1Y_Flight_Count', 'Ration_P1Y_BPS', 'Ration_L1Y_BPS',
       'Point_NotFlight'],
      dtype='object')

查看数据

# Preview the first rows of the data (five rows in the output shown below).
print(data.head())
   MEMBER_NO    FFP_DATE FIRST_FLIGHT_DATE GENDER  FFP_TIER    WORK_CITY  \
0      54993  2006/11/02        2008/12/24      男         6            .   
1      28065  2007/02/19        2007/08/03      男         6          NaN   
2      55106  2007/02/01        2007/08/30      男         6            .   
3      21189  2008/08/22        2008/08/23      男         5  Los Angeles   
4      39546  2009/04/10        2009/04/15      男         6           贵阳   

  WORK_PROVINCE WORK_COUNTRY   AGE   LOAD_TIME       ...         \
0            北京           CN  31.0  2014/03/31       ...          
1            北京           CN  42.0  2014/03/31       ...          
2            北京           CN  40.0  2014/03/31       ...          
3            CA           US  64.0  2014/03/31       ...          
4            贵州           CN  48.0  2014/03/31       ...          

   ADD_Point_SUM  Eli_Add_Point_Sum  L1Y_ELi_Add_Points  Points_Sum  \
0          39992             114452              111100      619760   
1          12000              53288               53288      415768   
2          15491              55202               51711      406361   
3              0              34890               34890      372204   
4          22704              64969               64969      338813   

   L1Y_Points_Sum  Ration_L1Y_Flight_Count  Ration_P1Y_Flight_Count  \
0          370211                 0.509524                 0.490476   
1          238410                 0.514286                 0.485714   
2          233798                 0.518519                 0.481481   
3          186100                 0.434783                 0.565217   
4          210365                 0.532895                 0.467105   

   Ration_P1Y_BPS Ration_L1Y_BPS  Point_NotFlight  
0        0.487221       0.512777               50  
1        0.489289       0.510708               33  
2        0.481467       0.518530               26  
3        0.551722       0.448275               12  
4        0.469054       0.530943               39  

[5 rows x 44 columns]

数据过滤预处理

# Keep only the records whose fare value is present for both years.
fare_present = data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()
data = data[fare_present]

# Then keep a record when at least one yearly fare is non-zero ("or"), or
# when the total flown kilometres and the average discount are both zero
# ("and").
nonzero_fare = (data['SUM_YR_1'] != 0) | (data['SUM_YR_2'] != 0)
zero_km_and_discount = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)
data = data[nonzero_fare | zero_km_and_discount]

数据预处理

# Columns used to build the clustering features (per the original notes):
#   LOAD_TIME     observation window
#   FFP_DATE      membership date
#   LAST_TO_END   time from the last flight to the end of the window
#   FLIGHT_COUNT  cumulative number of flights
#   SEG_KM_SUM    cumulative flown distance
#
# `data` is a filtered selection of the loaded frame at this point. Take an
# explicit copy so the column assignments below do not trigger
# SettingWithCopyWarning, and assign with ['col'] rather than attribute
# syntax, which cannot create new columns and is easy to get wrong.
data = data.copy()
data['LOAD_TIME'] = pd.to_datetime(data['LOAD_TIME'])
data['FFP_DATE'] = pd.to_datetime(data['FFP_DATE'])

# Membership length in whole days. Converting the timedelta with
# pd.to_numeric() produced raw nanosecond counts and can turn a missing date
# (NaT) into a huge negative integer; .dt.days keeps missing values as NaN.
# The later z-score standardisation is scale-invariant, so the change of
# unit does not alter the standardised feature.
data['Lassociator'] = (data['LOAD_TIME'] - data['FFP_DATE']).dt.days

data_selected = data[['Lassociator', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM']]

查看数据类型

# Print the dtype of every selected feature. A bare expression is only
# displayed in an interactive session and produces no output when the code
# runs as a script, so print it explicitly like the other inspection steps.
print(data_selected.dtypes)
Lassociator     int64
LAST_TO_END     int64
FLIGHT_COUNT    int64
SEG_KM_SUM      int64
dtype: object

数据标准化

# Z-score standardisation: centre each column on its mean, then divide by
# its standard deviation (both computed column-wise).
data_mean = data_selected.mean()
data_std = data_selected.std()
centered = data_selected - data_mean
data_std_scale = centered / data_std

聚类参数设置

k = 3                       # number of clusters to form
iteration = 1000             # maximum number of clustering iterations

聚类模型

# Replace undefined standardised values (NaN) with 0 before fitting.
data_std_scale = data_std_scale.replace(np.nan, 0)

# Build and train the K-means model.
# - `n_jobs` is no longer accepted: it was deprecated in scikit-learn 0.23
#   and removed in 1.0, so passing it raises TypeError on current releases.
# - `iteration` was defined as the iteration cap but never handed to the
#   model; pass it as `max_iter`.
# - `n_init=10` pins the number of restarts to the value shown in the
#   article's output, independent of the library default.
kmodel = KMeans(n_clusters=k, max_iter=iteration, n_init=10)
kmodel.fit(data_std_scale)  # train the model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=8, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

聚类结果处理

# Cluster label of every sample, aligned with the rows of the feature table.
labels = pd.Series(kmodel.labels_, index=data_std_scale.index)

# Per-cluster summary: centre coordinates plus the number of members.
# It previously lived in `r` and was overwritten straight away by the
# per-sample table below; it now has its own name so it stays available.
r1 = labels.value_counts().sort_index()         # members per cluster
r2 = pd.DataFrame(kmodel.cluster_centers_)      # cluster centres
cluster_summary = pd.concat([r2, r1], axis=1)   # aligned on the cluster id
cluster_summary.columns = list(data_std_scale.columns) + [u'类别数目']

# Per-sample table: the standardised features plus the assigned cluster.
r = pd.concat([data_std_scale, labels], axis=1)
r.columns = list(data_std_scale.columns) + [u'聚类类别']  # rename the header
print(r.head())
   Lassociator  LAST_TO_END  FLIGHT_COUNT  SEG_KM_SUM  聚类类别
0     1.435707    -0.944948     14.034016   26.761154     2
1     1.307152    -0.911894      9.073213   13.126864     2
2     1.328381    -0.889859      8.718869   12.653481     2
3     0.658476    -0.416098      0.781585   12.540622     2
4     0.386032    -0.922912      9.923636   13.898736     2

聚类结果分析

# For every cluster, draw one row of histograms with one panel per
# standardised feature. (Use kind='kde' instead of 'hist' for density
# curves.)
n_features = data_std_scale.shape[1]
for cluster_id in range(k):
    in_cluster = r[u'聚类类别'] == cluster_id
    members = data_std_scale[in_cluster]
    members.plot(
        kind='hist',
        linewidth=2,
        subplots=True,
        sharex=False,
        layout=(1, n_features),
        figsize=(16, 2),
    )
    plt.legend()
plt.show()

图1：各聚类类别标准化特征的直方图（Python聚类分析）

nrsy
数据

你可能感兴趣的:(Python)