# 多特征聚类,依据坐标显示 (Multi-feature clustering, displayed by coordinates)

from conf.config import *
# Module-level database handle used by read_data_mysql() below.
# MYSQL is not defined in this file; presumably it is provided by the
# star-import from conf.config above -- TODO confirm.
mysql = MYSQL
# NOTE: open() runs at import time, so merely importing this module calls it.
mysql.open()


def _int_or_zero(row, index):
    """Return ``int(row[index])``, or 0 when the cell is missing or not integer-like."""
    try:
        return int(row[index])
    except (IndexError, TypeError, ValueError, OverflowError):
        # Best-effort parsing: short rows, NULLs and non-numeric text all
        # fall back to 0 instead of aborting the whole load.
        return 0


def read_data_mysql(table_name, n, label, num_features):
    """Read every row of ``table_name`` and derive a numeric feature list per row.

    The query is issued through the module-level ``mysql`` handle.

    Parameters
    ----------
    table_name : str
        Table to select from. It is interpolated directly into the SQL text
        (identifiers cannot be bound as parameters), so it must come from
        trusted code and never from user input.
    n : int
        Index of the first feature column. Only used when ``label == 1``.
    label : int
        ``1`` selects the column-range layout: columns ``n`` up to (not
        including) ``num_features`` are converted with ``int`` and column
        ``num_features`` is appended as a ``float``; a value that cannot be
        converted raises. Any other value selects the fixed layout: columns
        9, 10, 13 and 14 are converted with ``int`` and fall back to 0 when
        the conversion fails.
    num_features : int
        Index of the trailing float column. Only used when ``label == 1``.

    Returns
    -------
    tuple
        ``(x_list, data_list)`` where ``x_list`` holds one feature list per
        row and ``data_list`` holds every fetched row converted to a list.
    """
    select_sql = "SELECT * FROM %s " % table_name
    record_list = mysql.select(select_sql, "")

    # Rows are converted to lists so that callers can modify them.
    data_list = [list(record) for record in record_list]

    if label == 1:
        x_list = []
        for line in data_list:
            one_list = [int(value) for value in line[n:num_features]]
            one_list.append(float(line[num_features]))
            x_list.append(one_list)
    else:
        fixed_columns = (9, 10, 13, 14)
        x_list = [
            [_int_or_zero(line, column) for column in fixed_columns]
            for line in data_list
        ]
    return x_list, data_list


# if __name__ == "__main__":
#     main()
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


# 正规化数据集 X
def normalize(X, axis=-1, p=2):
    """Divide X by its L-p norm taken along ``axis``.

    Vectors whose norm is exactly 0 are divided by 1 instead, so they are
    returned unchanged rather than producing a division by zero.
    """
    norms = np.linalg.norm(X, ord=p, axis=axis)
    # A 1-D input yields a scalar norm; promote it so it can be masked.
    norms = np.atleast_1d(norms)
    np.putmask(norms, norms == 0, 1)
    divisor = np.expand_dims(norms, axis)
    return X / divisor


# 计算一个样本与数据集中所有样本的欧氏距离的平方
def euclidean_distance(one_sample, X):
    one_sample = one_sample.reshape(1, -1)

    X = X.reshape(X.shape[0], -1)
    distances = np.power(np.tile(one_sample, (X.shape[0], 1)) - X, 2).sum(axis=1)
    return distances


class Kmeans():
    """K-means clustering.

    Parameters
    ----------
    k : int
        Number of clusters.
    max_iterations : int
        Maximum number of assign/update rounds.
    varepsilon : float
        Convergence tolerance. Iteration stops as soon as every coordinate of
        every centroid moved by less than ``varepsilon`` between two rounds.
    """

    def __init__(self, k=2, max_iterations=500, varepsilon=0.0001):
        self.k = k
        self.max_iterations = max_iterations
        self.varepsilon = varepsilon

    def init_random_centroids(self, X):
        """Pick ``self.k`` rows of ``X`` at random as the initial centroids.

        Rows are drawn without replacement so that the same sample is not
        used for two centroids; only when there are fewer samples than
        clusters does the draw fall back to sampling with replacement.

        Returns a float array of shape ``(k, n_features)``.
        Raises ``ValueError`` if ``X`` is not a non-empty 2-D array.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape (n_samples, n_features)"
            )
        n_samples = X.shape[0]
        chosen = np.random.choice(
            n_samples, size=self.k, replace=n_samples < self.k
        )
        return X[chosen]

    @staticmethod
    def _nearest_centroids(centroids, X):
        """Return, for every row of ``X``, the index of its nearest centroid.

        Distances are squared Euclidean distances, computed one centroid at a
        time so the temporary is only ``(n_samples, k)``. Ties go to the
        lowest centroid index.
        """
        X = np.asarray(X)
        distances = np.stack(
            [np.sum((X - centroid) ** 2, axis=1) for centroid in np.asarray(centroids)],
            axis=1,
        )
        return np.argmin(distances, axis=1)

    def _closest_centroid(self, sample, centroids):
        """Return the index in ``[0, k)`` of the centroid nearest to ``sample``."""
        sample_row = np.reshape(sample, (1, -1))
        return self._nearest_centroids(centroids, sample_row)[0]

    def create_clusters(self, centroids, X):
        """Assign every sample to its nearest centroid.

        Returns a list of ``self.k`` lists; list ``i`` holds the row indices
        of the samples assigned to centroid ``i``, in ascending order.
        """
        nearest = self._nearest_centroids(centroids, X)
        clusters = [[] for _ in range(self.k)]
        for sample_i, centroid_i in enumerate(nearest.tolist()):
            clusters[centroid_i].append(sample_i)
        return clusters

    def update_centroids(self, clusters, X, previous_centroids=None):
        """Recompute each centroid as the mean of the samples assigned to it.

        An empty cluster has no mean. When ``previous_centroids`` is given,
        such a cluster keeps its previous position; otherwise its row is NaN,
        which is what averaging an empty selection produces.
        """
        X = np.asarray(X)
        n_features = np.shape(X)[1]
        centroids = np.zeros((self.k, n_features))
        for i, cluster in enumerate(clusters):
            if len(cluster) > 0:
                centroids[i] = np.mean(X[cluster], axis=0)
            elif previous_centroids is not None:
                centroids[i] = previous_centroids[i]
            else:
                centroids[i] = np.nan
        return centroids

    def get_cluster_labels(self, clusters, X):
        """Turn the cluster membership lists into one label per sample.

        Returns a float array of length ``n_samples`` whose entry ``j`` is
        the index of the cluster that contains sample ``j``.
        """
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, cluster in enumerate(clusters):
            y_pred[np.asarray(cluster, dtype=int)] = cluster_i
        return y_pred

    def predict(self, X):
        """Run K-means on ``X`` and return the cluster label of every sample.

        Starts from randomly chosen samples, then alternates between
        assigning samples to the nearest centroid and moving each centroid to
        the mean of its samples, until no centroid coordinate moves by
        ``varepsilon`` or more, or ``max_iterations`` rounds have run.
        """
        X = np.asarray(X)
        centroids = self.init_random_centroids(X)

        for _ in range(self.max_iterations):
            clusters = self.create_clusters(centroids, X)
            former_centroids = centroids
            centroids = self.update_centroids(clusters, X, former_centroids)

            # Converged once every coordinate moved by less than the tolerance.
            if np.all(np.abs(centroids - former_centroids) < self.varepsilon):
                break

        # Label against the final centroids; this also covers max_iterations == 0.
        return self.get_cluster_labels(self.create_clusters(centroids, X), X)


def main():
    """Cluster the rows of ``actual.typhoon_route`` with K-means and plot them.

    Loads the feature rows through ``read_data_mysql`` (imported from the
    ``mysql_data`` module), clusters them into four groups, prints the label
    of every row together with the distinct (first feature, label) pairs, and
    shows a 3-D scatter plot with one colour per cluster.
    """
    # For a database-free smoke test, a synthetic X (for example from
    # sklearn.datasets.make_blobs) can be substituted for the query below.
    from mysql_data import read_data_mysql
    table_name = "actual.typhoon_route"
    n = 1
    label = 0
    num_features = 5
    X, data_list = read_data_mysql(table_name, n, label, num_features)

    # The clustering code indexes X with NumPy fancy indexing.
    X = np.array(X)

    clf = Kmeans(k=4)
    y_pred = clf.predict(X)

    # Collect the distinct (first feature, label) pairs in first-seen order;
    # the set keeps the membership test O(1) per row.
    cluster_list = []
    seen_pairs = set()
    for x, y in zip(X, y_pred):
        print(x[0], y)
        pair = (x[0], y)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            cluster_list.append(pair)
    print("所有台风分类:", cluster_list)

    # Draw on the 3-D axes itself. pyplot.scatter is 2-D and would read the
    # third and fourth positional arguments as marker size and colour.
    # The module-level Axes3D import registers the "3d" projection on older
    # matplotlib releases.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_axes([0, 0, 1, 1], projection="3d")
    ax.view_init(elev=30, azim=20)
    for cluster_i in range(clf.k):
        members = X[y_pred == cluster_i]
        # Only the first three features fit on the axes; the fourth feature
        # takes part in the clustering but is not drawn.
        ax.scatter(
            members[:, 0],
            members[:, 1],
            members[:, 2],
            label="cluster %d" % cluster_i,
        )
    ax.legend()
    plt.show()


    #结合台风路径,打印坐标
    # fig = plt.figure(1,figsize=(12, 8))
    # # ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=10, azim=11)
    #
    # y_pred = y_pred.tolist()
    # n = len(y_pred)
    #
    # color = ['b','g','r','m','orange']
    # for i in range(n):
    #     j = int(y_pred[i])
    #     plt.scatter(data_list[i][5], data_list[i][6], c=color[j])
    #     # print(i)
    #     if i % 1000 == 0:
    #         print(i)
    #         plt.show()
    # plt.show()



# Run the clustering demo only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

# 你可能感兴趣的:(IT,python) (You may also be interested in: IT, python)