Python K中心点聚类kmediods聚类代码

import numpy as np
from numpy import *
import sys
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt

def fit_plot_Kmean_model(X):
    data = pd.read_csv(X, header=None, error_bad_lines=False)
    # 必须添加header=None,否则默认把第一行数据处理成列名导致缺失
    data_list = data.values.tolist()
    data_list = np.array(data_list)
    return data_list


def choice_center(data,k):
    k=int(k)
    centers = []
    for i in np.random.choice(len(data), k):
        centers.append(data[i])
    # print("随机选取的中心点(第一次):\n", centers)
    return centers


def distance(a, b):
    dis = []
    for i in range(len(a)):
        dis.append(pow(a[i] - b[i], 2))
    return sqrt(sum(dis))


def k_center(data_list,center,n_clusters,savePath):
    flag = True
    i = 0
    a=len(data_list[0])-1
    n_clusters=int(n_clusters)
    while flag:
        flag = False
        for i in range(len(data_list)):                       # 遍历所有样本,最后一列标记该样本所属簇
            min_index = -2
            min_dis = inf
            for j in range(len(center)):
                dis = distance(data_list[i][1:n_clusters],center[j][1:n_clusters])
                if dis < min_dis:
                    min_dis = dis
                    min_index = j
            if data_list[i][-1] != min_index:
                flag = True
            data_list[i][-1] = min_index
        # print("分类结果111:",data_list)
        # 重新计算簇中心
        for k in range(len(center)):                      # 遍历中心向量,取出属于当前中心向量簇的样本
            current_k = []
            for i in range(len(data_list)):
                if data_list[i][-1] == k:
                    current_k.append(data_list[i])
#            print(k, ":", current_k)
            old_dis = 0.0
            for i in range(len(current_k)):
                old_dis += distance(current_k[i][1:n_clusters], center[k][1:n_clusters])
            for m in range(len(current_k)):
                new_dis = 0.0
                for n in range(len(current_k)):
                    new_dis += distance(current_k[m][1:n_clusters], current_k[n][1:n_clusters])
                if new_dis < old_dis:
                    old_dis = new_dis
                    center[k][:] = current_k[m][:]

    # print("选中的最终中心点", center)
    for i in range(len(data_list)):  # 遍历所有样本,最后一列标记该样本所属簇
        min_index = -2
        min_dis = inf
        for j in range(len(center)):
            dis = distance(data_list[i][1:n_clusters], center[j][1:n_clusters])
            if dis < min_dis:
                min_dis = dis
                min_index = j
        data_list[i][-1] = min_index
    # print("分类结果222:", data_list)

    y_pred=data_list[:,a]
    # print("y_pred:", y_pred)
    score=metrics.calinski_harabaz_score(data_list,y_pred)
    plt.figure(figsize=(12, 12))
    plt.xticks(())
    plt.yticks(())
    plt.subplot(221)
    plt.scatter(data_list[:, 0], data_list[:, 1])
    plt.title("Raw data")
    plt.subplot(222)
    plt.scatter(data_list[:, 0], data_list[:, 1], c=y_pred)
    plt.title("Kmediods:k={},score={}".format(n_clusters, int(score)))
    # plt.title("DBSCAN:k={},eps={},min_samples={}".format(n_clusters, eps, min_samples))

    # 字典中的key值即为csv中列名
    dataframe = pd.DataFrame(y_pred)

    # 将DataFrame存储为csv,index表示是否显示行名,default=True
    savePath = savePath + "\\Kmediods_result.csv"
    dataframe.to_csv(savePath, header=False, index=False, sep=',')
    plt.show()
    a = 'OK'
    return a


if __name__ == '__main__':
    data_list=fit_plot_Kmean_model(sys.argv[2])
    # 1为选取的中心点个数,2为文件打开路径,3为文件保存路径
    centers = choice_center(data_list,sys.argv[1])
    print(k_center(data_list, centers,sys.argv[1],sys.argv[3]))

    # data_list = fit_plot_Kmean_model('C:\\Users\\upc\\Desktop\\testdata.csv')
    # # 1为选取的中心点个数,2为文件打开路径,3为文件保存路径
    # centers = choice_center(data_list, 4)
    # print(k_center(data_list, centers,4, 'C:\\Users\\upc\\Desktop'))

    # 1为选取的中心点个数,2为文件打开路径,3为文件保存路径
    # print(k_center(fit_plot_Kmean_model(sys.argv[2]), choice_center(fit_plot_Kmean_model(sys.argv[2]),sys.argv[1]),sys.argv[1],sys.argv[3]))


    # print(k_center(fit_plot_Kmean_model('C:\\Users\\upc\\Desktop\\testdata.csv'), choice_center(fit_plot_Kmean_model('C:\\Users\\upc\\Desktop\\testdata.csv'),4),4,'C:\\Users\\upc\\Desktop'))
         hiddenimports=['pandas','cython','numpy', 'sklearn','sklearn.metrics','sklearn.metrics.get_scorer','sklearn.metrics.cluster','sklearn.ensemble','sklearn.neighbors.typedefs','sklearn.neighbors.quad_tree','sklearn.tree._utils','scipy._lib.messagestream','email.mime.message', 'email.mime.image', 'email.mime.multipart', 'email.mime.audio', 'email.mime.text'],

你可能感兴趣的:(机器学习,数据挖掘)