多进程辅助聚类小程序

#!/usr/bin/env python3.6
# coding: utf-8

import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn import metrics

import time
import multiprocessing

class KClusterProcess(multiprocessing.Process):
    """Worker process that clusters a dataset for one value of k and scores it.

    Args:
        data: Training data, passed unchanged to the estimator's ``fit`` and
            ``predict`` and to the scoring function.
        k: Number of clusters; also the key under which the result is stored.
        sd: Shared mapping (for example ``multiprocessing.Manager().dict()``)
            that receives the result.
        method: Clustering algorithm, ``'KMeans'`` or ``'MiniBatchKMeans'``.
        scoring: Evaluation metric, ``'CH'`` (Calinski-Harabasz) or ``'SH'``
            (silhouette).

    Returns:
        Nothing directly. ``run`` stores
        ``{"time_elapsed": <seconds>, "score": <score>}`` in ``sd[k]``.
    """

    _METHODS = ('KMeans', 'MiniBatchKMeans')
    _SCORINGS = ('CH', 'SH')

    def __init__(self, data, k, sd, method='KMeans', scoring='CH'):
        super().__init__()

        self._k = k
        self._data = data
        self.daemon = True
        self._method = method
        self._scoring = scoring
        self._sd = sd

    def run(self):
        """Cluster the data, score the labelling and publish the result.

        Raises:
            ValueError: If ``method`` or ``scoring`` is not a supported name.
        """
        # Validate both options before doing any work, so a bad scoring name
        # does not cost a full clustering run. The method is checked first,
        # matching the order in which errors were originally reported.
        if self._method not in self._METHODS:
            raise ValueError("no matched method")
        if self._scoring not in self._SCORINGS:
            raise ValueError("no matched scoring")

        start = time.time()
        if self._method == 'KMeans':
            pred = self._do_KMeans()
        else:
            pred = self._do_MiniBatchKMeans()

        if self._scoring == 'CH':
            score = self._calinski_harabasz(pred)
        else:
            score = metrics.silhouette_score(self._data, pred)
        end = time.time()

        self._sd[self._k] = {
            "time_elapsed": end - start,
            "score": score
        }

    def _calinski_harabasz(self, pred):
        """Return the Calinski-Harabasz score of ``pred`` on the training data."""
        # scikit-learn renamed the misspelled ``calinski_harabaz_score`` to
        # ``calinski_harabasz_score`` and later removed the old name; prefer
        # the new one and fall back so older installations keep working.
        scorer = getattr(metrics, 'calinski_harabasz_score', None)
        if scorer is None:
            scorer = metrics.calinski_harabaz_score
        return scorer(self._data, pred)

    def _fit_and_predict(self, estimator):
        """Fit ``estimator`` on the training data and return its predictions."""
        estimator.fit(self._data)
        return estimator.predict(self._data)

    def _do_MiniBatchKMeans(self):
        """Cluster with ``MiniBatchKMeans`` and return the predicted labels."""
        return self._fit_and_predict(MiniBatchKMeans(n_clusters=self._k))

    def _do_KMeans(self):
        """Cluster with ``KMeans`` and return the predicted labels."""
        return self._fit_and_predict(KMeans(n_clusters=self._k))

def main():
    """Cluster the user feature table for several k values in parallel.

    Starts one ``KClusterProcess`` per candidate k, waits for all of them,
    then prints the elapsed time and score each worker stored in the shared
    dict.
    """
    # Load the training data.
    data = pd.read_csv('./data/users_FE.csv')
    X_data = data.values
    Ks = range(10, 30, 10)

    # The manager hosts the dict shared between processes; using it as a
    # context manager shuts its server process down once we are finished.
    with multiprocessing.Manager() as manager:
        sd = manager.dict()

        ps = []
        for k in Ks:
            p = KClusterProcess(X_data, k, sd, scoring='CH')
            p.start()
            ps.append(p)

        for p in ps:
            p.join()  # wait for every worker to finish

        # Read the results while the manager is still alive; the dict proxy
        # cannot be used after the manager has shut down.
        for k in Ks:
            print("k:%s,time elapsed:%d,score:%.2f" % (k,sd[k]['time_elapsed'],sd[k]['score']))


# Guard the entry point: under the "spawn" start method each child re-imports
# the main module, and unguarded top-level code would run again in every child.
if __name__ == '__main__':
    main()

你可能感兴趣的:(机器学习)