1、处理缺失、异常值
缺失值直接补0
异常值可以设置一个阈值,比如小于数据的1分位数,或者大于95分位数,就把数据进行截断(盖帽)处理,用相应的分位数赋值,这样可以减少异常值对于聚类的影响。因为聚类一般计算的是距离,有异常值影响会比较大
2、最大最小归一化,消除特征量纲影响,或者标准化也可以
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature into [0, 1] so that no feature dominates the
# distance computation purely because of its units/scale.
min_max_scaler = MinMaxScaler()
feature_array = np.array(feature)
feature_minmax = min_max_scaler.fit_transform(feature_array)
3、如果特征维度较多,存在共线性,可以用pca进行降维
def get_pca(data_array):
    """Standardize, PCA-reduce (keeping 90% of variance), then min-max scale.

    Steps:
      1. z-score each column so PCA is not dominated by large-scale features;
      2. project onto enough principal components to retain 90% variance;
      3. rescale the resulting components into [0, 1].

    Returns the scaled component matrix (n_samples x n_kept_components).
    """
    standardized = StandardScaler().fit_transform(data_array)
    pca_model = PCA(n_components=0.9)  # fraction => keep 90% explained variance
    components = pca_model.fit_transform(standardized)
    range_scaler = MinMaxScaler()
    scaled_components = range_scaler.fit_transform(components)
    print('pca保留的维度:', scaled_components.shape)
    return scaled_components
# Inspect pairwise feature correlation (collinearity check before PCA).
plt.rcParams['figure.figsize'] = (20, 15)
corr_matrix = pd.DataFrame(data[:, 0:50]).corr()
sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap="YlGnBu")
plt.title('Correlation between features', fontsize=30)
plt.tight_layout()
比较重要的是选择合适的类别K
# Clustering: elbow method to pick a candidate number of clusters K.
# BUG FIX: the loop constructs `KMeans`, but only `MiniBatchKMeans` was
# imported, which raises NameError — import both.
from sklearn.cluster import KMeans, MiniBatchKMeans

ks = range(1, 10)
inertias = []
for k in ks:
    # For large datasets MiniBatchKMeans is a faster drop-in alternative:
    # model = MiniBatchKMeans(n_clusters=k, init='k-means++', random_state=123, batch_size=512, max_iter=200)
    model = KMeans(n_clusters=k, init='k-means++', random_state=123)
    # Fit on the full feature matrix and record the within-cluster
    # sum of squares; the "elbow" in this curve suggests a good K.
    model.fit(data)
    inertias.append(model.inertia_)

plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
这里的肘部K值仅供参考,聚类的类别数需要结合业务需要来定
通过图像可以看出各个类别之间的关系,以及选择的K值是否合适
#先用tsne降维
#看看聚类的结果分布情况,每个类随机抽1000个用户画图
import random
def get_tsne_data(data_df, kmean_num, data_feature, sample_num=1000):
    """Sample up to `sample_num` rows per cluster and t-SNE them to 2-D.

    Args:
        data_df: DataFrame whose 'label' column holds the cluster id of
            each row (index positions must align with `data_feature`).
        kmean_num: number of clusters (labels are assumed to be 0..kmean_num-1).
        data_feature: feature matrix indexable by the sampled positions.
        sample_num: per-cluster sample cap (was hard-coded; the condition
            below previously compared against a literal 1000 instead of
            this value, so changing one without the other broke sampling).

    Returns:
        DataFrame with columns 'l1', 'l2' (t-SNE coordinates) and 'labels'.
    """
    sample_label = []
    for i in range(kmean_num):
        # Hoisted: the original filtered data_df twice per iteration.
        cluster_index = data_df[data_df['label'] == i].index.tolist()
        if len(cluster_index) > sample_num:
            sample_label.append(random.sample(cluster_index, sample_num))
        else:
            sample_label.append(cluster_index)
    # Flatten the per-cluster lists into one index list.
    sample_label_list = list(chain.from_iterable(sample_label))
    # Reduce the sampled features to two dimensions for plotting.
    tsne = TSNE(n_components=2).fit_transform(data_feature[sample_label_list])
    tmp_data_concat = pd.DataFrame(tsne, columns=['l1', 'l2'])
    tmp_data_concat['labels'] = data_df['label'][sample_label_list].tolist()
    return tmp_data_concat
#画图
def get_picture_kmeans(tmp_tsne_data, filename):
%matplotlib inline
#set font size of labels on matplotlib plots
plt.rc('font', size=16)
#set style of plots
sns.set_style('white')
#create a new figure
plt.figure(figsize=(8,8))
maker=['o','v','^','s','p','*','<','>','D','d','h','H']#设置散点形状
colors = ['black','tomato','cyan','blue', 'lime', 'r', 'violet','m','peru','olivedrab','hotpink']#设置散点颜色
#loop through labels and plot each cluster
for i, label in enumerate(tmp_tsne_data['labels']):
#add data points
#print(label)
plt.scatter(x=tmp_tsne_data.loc[tmp_tsne_data['labels']==label,'l1'], y=tmp_tsne_data.loc[tmp_tsne_data['labels']==label,'l2'],color=colors[label], s=1)
#add label
plt.annotate(label,
tmp_tsne_data.loc[tmp_tsne_data['labels']==label,('l1','l2')].mean(),
horizontalalignment='center',
verticalalignment='center',
size=20, weight='bold',color=colors[-label])
plt.savefig(filename)
def get_picture_kmeans_label(tmp_tsne_data, filename):
    """Seaborn scatter (lmplot) of the t-SNE embedding colored by cluster
    label, with a legend whose text colors match the point colors; saves
    the plot to `filename`.

    Fixes vs. the original:
      * `plt.figure(figsize=(20, 20))` was orphaned — sns.lmplot always
        creates its own figure, leaving an empty 20x20 figure behind; size
        the lmplot via its `height` parameter instead;
      * `colors[i]` raised IndexError past 11 legend entries; zip truncates
        safely;
      * save through the FacetGrid so the correct figure is written.
    """
    # plot data with seaborn (legend added manually below so we can recolor it)
    facet = sns.lmplot(data=tmp_tsne_data, x='l1', y='l2', hue='labels',
                       fit_reg=False, legend=False, height=10)
    # add a legend
    leg = facet.ax.legend(bbox_to_anchor=[1, 0.75],
                          title="label", fancybox=True)
    # change colors of labels to match the scatter colors
    colors = ['black','tomato','cyan','blue', 'lime', 'r', 'violet','m','peru','olivedrab','hotpink']
    for text, color in zip(leg.get_texts(), colors):
        plt.setp(text, color=color)
    facet.savefig(filename)
可以选择跟业务强相关的指标,对用户的聚类结果进行刻画,看看每个用户群有哪些突出的特点。类别型指标可以计算TGI的值,连续型可以计算指标的均值、分位数等,比较用户群之间的差异。