tSNE—高维数据降维可视化(实践部分)-Kmeans聚类


import pandas as pd
cust_sale=pd.read_excel('C:/Users/XI/fzql.xls')
cust_sale=pd.merge(temp1,temp2,on='CUST_ID',how='inner')
cust_sale=cust_sale.dropna()
cust_sale.head()

import sklearn.preprocessing as preprocessing#方法一
min_max_scaler = preprocessing.MinMaxScaler()
cust_sale.loc[:,['特征一','特征二','特征三','特征四','特征五']]=min_max_scaler.fit_transform(cust_sale[['特征一','特征二','特征三','特征四','特征五']].values)#数据归一化,按比例映射到(01)区间
cust_sale.head()

from sklearn import preprocessing#方法二
cust_sale=cust_sale.dropna()
cust_sale[['特征一','特征二','特征三','特征四','特征五']] = preprocessing.scale(cust_sale[['特征一','特征二','特征三','特征四','特征五']])#指定均值方差按列标准化!(默认mean=0,std=1)
print('mean:', cust_sale[["特征一","特征二"]].mean(axis=0), '\nstd:', cust_sale[["特征一","特征二"]].std(axis=0))
cust_sale.head()




#-------KMeans聚类-------
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
#df_onehot_pred = clf.fit_predict(df_onehot)
RS = 20180101
Silhouette_scores=[]
Calinski_Harabaz_scores=[]
for i in np.arange(2,11):
    kmeans_model = KMeans(n_clusters=i).fit(cust_sale["特征一"].values.reshape(-1,1))   #单个特征要reshape(-1,1)
    labels = kmeans_model.labels_
    Silhouette_scores.append(metrics.silhouette_score(cust_sale["特征一"].values.reshape(-1,1),labels,metric='euclidean'))
    Calinski_Harabaz_scores.append(metrics.calinski_harabaz_score(cust_sale["特征一"].values.reshape(-1,1),labels))
print('Silhouette:%f' %metrics.silhouette_score(cust_sale["特征一"].values.reshape(-1,1),labels,metric='euclidean'))
print('Calinski_Harabaz:%0.3f'% metrics.calinski_harabaz_score(cust_sale["特征一"].values.reshape(-1,1),labels))
pd.DataFrame(Silhouette_scores,index=np.arange(2,11),columns=['metric']).plot(subplots=True,figsize=(10,5),grid=True,title="silhouette_score")
pd.DataFrame(Calinski_Harabaz_scores,index=np.arange(2,11),columns=['metric']).plot(subplots=True,figsize=(10,5),grid=True,title="Calinski_Harabaz_scores")
pd.merge(cust_sale,pd.DataFrame(labels,index=cust_sale.index,columns=['Y']),left_index=True,right_index=True).to_csv('C:/Users/4-6_second.csv', index=None)
plt.show()




print(pd.DataFrame(labels,index=cust_sale.index,columns=['Y'])['Y'].value_counts())
import numpy as np
import sklearn
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import matplotlib
RS = 20190101 #Random state
import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('muted') #调色板颜色温和
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})

X = cust_sale.iloc[:,1:6].values
y = labels
digits_proj = TSNE(random_state=RS).fit_transform(X)    #将X降到2维

def scatter(x, colors):
    palette = np.array(sns.color_palette("hls", 5))

    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40,
                    c=palette[colors.astype(np.int)])
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    ax.axis('tight')

    #给类群点加文字说明
    txts = []
    for i in range(5):
        xtext, ytext = np.median(x[colors == i, :], axis=0)    #中心点
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        txt.set_path_effects([PathEffects.Stroke(linewidth=5, foreground="w"),PathEffects.Normal()])    #线条效果
        txts.append(txt)
    return f, ax, sc, txts

scatter(digits_proj, y)
#plt.savefig('digits_tsne-generated.png', dpi=120)
plt.show()
tSNE—高维数据降维可视化(实践部分)-Kmeans聚类_第1张图片

你可能感兴趣的:(Machine,Learning)