MiniBatchKMeans简单应用

MiniBatchKMeans 比 KMeans 快很多，效果也不错，应用于文本聚类的示例如下：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

import logging
import os
import re
from collections import defaultdict
from time import time

import jieba
from gensim.utils import to_utf8
from six.moves import xrange
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer

logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


def load_stopwords(path='/data/zhaojun/local_projects/stopwords.txt'):
    """Load the stop-word list from a UTF-8 text file, one word per line.

    :param path: location of the stop-word file; defaults to the path the
        script has always used, so existing zero-argument calls are unchanged.
    :return: frozenset of unicode strings, one per line of the file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(path, 'rb') as fin:
        return frozenset(fin.read().decode('utf-8').splitlines())


# Stop words: loaded once when the module is executed and consulted by
# chinese_non_stopwords() for every candidate token.
stopwords = load_stopwords()
# Matches strings made up entirely of characters in the range U+4E00..U+9FA5.
# A plain u'' literal is used instead of ur'': the ur prefix is a SyntaxError
# on Python 3, and since the pattern has no backslash sequences other than the
# \u escapes, the compiled pattern is the same on Python 2 and 3.
chinese = re.compile(u'^[\u4e00-\u9fa5]+$')


def chinese_non_stopwords(word):
    """Return True when *word* matches the ``chinese`` pattern and is not a stop word."""
    # Reject anything the module-level ``chinese`` pattern does not match.
    if chinese.match(word) is None:
        return False
    return word not in stopwords


def sentence_tokenizer(sentence):
    """Segment one input line and keep only Chinese, non-stop-word tokens.

    The line is stripped and the text after its first tab is segmented with
    ``jieba.lcut``. If the stripped line contains no tab, the placeholder
    ``u'呢'`` is segmented instead. Tokens are kept only when
    ``chinese_non_stopwords`` accepts them.

    :param sentence: one raw input line.
    :return: list of kept tokens. A real list is returned on both Python 2 and
        Python 3; the builtin ``filter`` would yield a one-shot iterator on
        Python 3.
    """
    try:
        content = sentence.strip().split('\t', 1)[1]
    except IndexError:
        # No tab-separated payload on this line: fall back to the placeholder.
        content = u'呢'
    return [token for token in jieba.lcut(content) if chinese_non_stopwords(token)]


def load_documents(path):
    """Read one input shard and return its lines as unicode strings.

    :param path: shard suffix appended to ``/data/zhaojun/part100/part-``
        (the caller passes zero-padded numbers such as ``'00000'``).
    :return: list of the lines of the UTF-8 decoded file, without line endings.
    """
    path = '/data/zhaojun/part100/part-' + path
    logger.info('processing file: %s', path)
    # Context manager closes the handle as soon as the shard has been read;
    # the caller loads many shards in a loop.
    with open(path, 'rb') as fin:
        return fin.read().decode('utf-8').splitlines()


if __name__ == '__main__':
    # Load all 100 input shards (suffixes 00000 .. 00099) into one list.
    docs = []
    for shard in xrange(100):
        docs.extend(load_documents('{:05d}'.format(shard)))

    # Step 1: TF-IDF features, using the jieba-based tokenizer defined above.
    t0 = time()
    logger.info('TfidfVectorizer...')
    vectorizer = TfidfVectorizer(tokenizer=sentence_tokenizer, min_df=5, max_df=0.1)
    X = vectorizer.fit_transform(docs)
    logger.info('vectorizer: %fs', time() - t0)

    # Step 2: cluster the documents.
    t0 = time()
    logger.info('MiniBatchKMeans...')
    km = MiniBatchKMeans(n_clusters=100, batch_size=1000)
    km.fit(X)
    logger.info('kmeans: %fs', time() - t0)

    # Step 3: group the original lines by predicted label and write one file
    # per cluster.
    t0 = time()
    logger.info('collecting result')
    result = defaultdict(list)
    for idx, label in enumerate(km.labels_):
        # Coerce the label to a plain int so it is a well-behaved dict key
        # and works with the '{:05d}' format spec below.
        result[int(label)].append(docs[idx])

    out_dir = '/tmp/cluster'
    # Create the output directory up front; otherwise a missing directory
    # would only surface after the expensive steps above have finished.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for label, members in result.items():
        out_path = os.path.join(out_dir, 'res-{:05d}'.format(label))
        # Binary mode: to_utf8() returns encoded bytes, so the newline must be
        # bytes as well (str + bytes raises TypeError on Python 3).
        with open(out_path, 'wb') as fout:
            logger.info('writing %s', out_path)
            for doc in members:
                fout.write(to_utf8(doc) + b'\n')
    logger.info('finished: %fs', time() - t0)

    # Optional diagnostic, left disabled: print the ten highest-weighted terms
    # of each cluster centre.
    # sorted_indices = km.cluster_centers_.argsort()[:, ::-1]
    # id2words = vectorizer.get_feature_names()
    # for i in range(km.n_clusters):
    #     print('cluster: %i' % i)
    #     for idx in sorted_indices[i, :10]:
    #         print(' %s' % id2words[idx])
    #     print()


你可能感兴趣的:(MiniBatchKMeans简单应用)