gensim: http://radimrehurek.com/gensim/index.html
Gensim is a free Python framework designed to automatically extract semantic topics from documents, as efficiently (computer-wise) and painlessly (human-wise) as possible.
sudo apt-get install python-numpy python-scipy
pip install gensim
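The code below also relies on the jieba segmenter for Chinese word segmentation, which the commands above do not install; it can be added the same way:
pip install jieba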
First, prepare the data: I crawled about 20,000 Douban notes as the data for this experiment. The data and code can be found at https://github.com/largetalk/yaseg.
The main code is as follows:
import jieba
from gensim import corpora, models, similarities
import os
import random
from pprint import pprint
import re

RESULT_DIR = 'douban_result'
# keep CJK characters (\u4e00-\u9fa5), ASCII letters and digits; everything else is stripped
regex = re.compile(ur"[^\u4e00-\u9fa5a-zA-Z0-9]")
class DoubanDoc(object):
    """Iterate over the crawled Douban notes, yielding (file name, title, raw text)."""
    def __init__(self, root_dir='douban'):
        self.root_dir = root_dir

    def __iter__(self):
        for name in os.listdir(self.root_dir):
            if os.path.isfile(os.path.join(self.root_dir, name)):
                data = open(os.path.join(self.root_dir, name), 'rb').read()
                title = data[:data.find('\r\n')]  # the first line of each file is the title
                yield (name, title, data)
class DoubanCorpus(object):
    """Stream bag-of-words vectors; tokens missing from the dictionary are ignored by doc2bow."""
    def __init__(self, root_dir, dictionary):
        self.root_dir = root_dir
        self.dictionary = dictionary

    def __iter__(self):
        for name, title, data in DoubanDoc(self.root_dir):
            yield self.dictionary.doc2bow(jieba.cut(data, cut_all=False))
def random_doc():
    name = random.choice(os.listdir('douban'))
    data = open('douban/%s' % name, 'rb').read()
    print 'random choice ', name
    return name, data
def etl(s):
    # remove punctuation and special characters
    return regex.sub('', s)

# segment every note with jieba and drop tokens that become empty after cleaning
texts = []
for name, title, data in DoubanDoc():
    seg = filter(lambda x: len(x) > 0, map(etl, jieba.cut(data, cut_all=False)))
    texts.append(seg)

# remove words that appear only once in the whole corpus
all_tokens = sum(texts, [])
token_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in token_once] for text in texts]
# build the dictionary, the bag-of-words corpus, a TF-IDF model, and a 30-topic LSI model on top of it
dictionary = corpora.Dictionary(texts)
corpus = list(DoubanCorpus('douban', dictionary))
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=30)
for i, t in enumerate(lsi.print_topics(30)):
    print '[topic #%s]: ' % i, t
# build a similarity index in LSI space and query it with a randomly chosen note
index = similarities.MatrixSimilarity(lsi[corpus])

_, doc = random_doc()
vec_bow = dictionary.doc2bow(jieba.cut(doc, cut_all=False))
vec_lsi = lsi[vec_bow]
print 'topic probability:'
pprint(vec_lsi)

sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
print 'top 10 similar notes:'
pprint(sims[:10])
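RESULT_DIR is defined at the top of the script but never used in this listing; presumably the trained artifacts are meant to be saved there. As a minimal sketch continuing from the listing above (the file names under RESULT_DIR are my own choice, not from the original code), gensim objects can be persisted with their save/load methods so nothing has to be retrained on the next run:

if not os.path.exists(RESULT_DIR):
    os.makedirs(RESULT_DIR)
# save the trained artifacts (file names are hypothetical)
dictionary.save(os.path.join(RESULT_DIR, 'douban.dict'))
corpora.MmCorpus.serialize(os.path.join(RESULT_DIR, 'douban.mm'), corpus)
lsi.save(os.path.join(RESULT_DIR, 'douban.lsi'))
index.save(os.path.join(RESULT_DIR, 'douban.index'))
# later, load them back without retraining
dictionary = corpora.Dictionary.load(os.path.join(RESULT_DIR, 'douban.dict'))
lsi = models.LsiModel.load(os.path.join(RESULT_DIR, 'douban.lsi'))
index = similarities.MatrixSimilarity.load(os.path.join(RESULT_DIR, 'douban.index'))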
The pipeline, in short: segment each note with jieba, strip punctuation and special characters, drop words that occur only once, build a dictionary and bag-of-words corpus, apply TF-IDF, train a 30-topic LSI model, and build a similarity index for querying. The 30 topics the model prints look like this:
[topic #0]: 0.277*"我" + 0.268*"你" + 0.196*"的" + 0.165*"他" + 0.146*"了" + 0.138*"她" + 0.124*"是" + 0.116*"自己" + 0.111*"在" + 0.107*"人"
[topic #1]: 0.504*"the" + 0.303*"to" + 0.268*"and" + 0.265*"of" + 0.235*"I" + 0.235*"a" + 0.219*"you" + 0.178*"in" + 0.175*"is" + 0.139*"that"
[topic #2]: -0.732*"你" + -0.172*"我" + -0.119*"爱" + 0.107*"的" + 0.088*"中国" + 0.076*"和" + 0.075*"年" + 0.065*"与" + 0.061*"他们" + 0.061*"中"
[topic #3]: -0.620*"她" + -0.288*"他" + 0.281*"你" + -0.160*"我" + -0.099*"说" + -0.098*"了" + -0.092*"啊" + 0.089*"与" + 0.080*"的" + 0.067*"中国"
[topic #4]: 0.524*"她" + -0.264*"我" + 0.246*"他" + 0.186*"你" + -0.160*"啊" + -0.138*"了" + 0.110*"女人" + 0.097*"爱" + 0.095*"男人" + 0.093*"与"
[topic #5]: -0.741*"他" + 0.459*"她" + 0.155*"你" + 0.097*"月" + 0.076*"日" + 0.072*"啊" + 0.068*"1" + 0.067*"2" + 0.062*"年" + -0.062*"我"
[topic #6]: -0.367*"他" + -0.331*"你" + 0.188*"自己" + 0.140*"她" + 0.130*"生活" + -0.128*"啊" + -0.128*"月" + -0.119*"日" + -0.117*"1" + 0.116*"我"
[topic #7]: 0.162*"自己" + -0.153*"着" + -0.138*"在" + 0.120*"做" + -0.116*"它" + 0.113*"别人" + -0.112*"我们" + -0.112*"里" + 0.109*"工作" + 0.104*"啊"
[topic #8]: 0.521*"I" + 0.445*"you" + -0.386*"the" + -0.253*"of" + 0.190*"me" + 0.160*"my" + 0.144*"t" + 0.128*"love" + -0.113*"and" + 0.092*"your"
[topic #9]: 0.302*"說" + 0.198*"我們" + 0.193*"對" + 0.187*"來" + 0.181*"一個" + 0.166*"會" + 0.164*"於" + 0.156*"後" + 0.145*"沒" + 0.136*"為"
[topic #10]: -0.300*"月" + -0.287*"日" + -0.215*"年" + -0.176*"爱" + -0.141*"2012" + 0.140*"啊" + -0.132*"2011" + -0.129*"他" + 0.124*"你" + -0.119*"11"
[topic #11]: -0.547*"我" + 0.202*"爱情" + 0.189*"男人" + 0.186*"女人" + 0.174*"吃" + -0.141*"中国" + 0.125*"爱" + 0.123*"啊" + -0.107*"企业" + 0.092*"不要"
[topic #12]: -0.376*"爱" + -0.290*"啊" + -0.240*"爱情" + 0.194*"孩子" + 0.183*"妈妈" + -0.153*"或者" + -0.140*"我" + 0.131*"你" + -0.127*"女人" + -0.124*"男人"
[topic #13]: 0.264*"啊" + -0.245*"爱" + -0.231*"或者" + -0.188*"妈妈" + -0.178*"吃" + -0.177*"那里" + -0.176*"孩子" + -0.167*"我" + -0.119*"不念" + -0.118*"不增"
[topic #14]: -0.349*"孩子" + -0.300*"妈妈" + -0.244*"我们" + -0.220*"啊" + -0.206*"你们" + 0.204*"喜欢" + -0.179*"他们" + -0.131*"父母" + -0.130*"爸爸" + 0.119*"他"
[topic #15]: 0.322*"我们" + -0.210*"孩子" + 0.161*"爱情" + -0.152*"日" + 0.148*"企业" + -0.145*"月" + 0.138*"客户" + 0.133*"元" + 0.126*"产品" + -0.123*"或者"
[topic #16]: 0.347*"我" + -0.249*"我们" + -0.212*"或者" + 0.188*"女人" + 0.165*"男人" + -0.165*"那里" + -0.116*"工作" + -0.111*"不见" + -0.110*"不念" + -0.109*"不增"
[topic #17]: 0.281*"妈妈" + -0.257*"女人" + -0.251*"男人" + 0.239*"豆瓣" + 0.239*"爱" + 0.231*"孩子" + 0.212*"喜欢" + 0.130*"啊" + 0.128*"电影" + -0.125*"月"
[topic #18]: 0.404*"啊" + -0.325*"男人" + -0.324*"女人" + -0.202*"喜欢" + -0.165*"豆瓣" + -0.136*"电影" + 0.116*"她" + -0.109*"孩子" + -0.104*"妈妈" + 0.100*"他"
[topic #19]: -0.357*"我们" + 0.254*"啊" + -0.192*"你们" + 0.163*"女人" + 0.152*"企业" + 0.146*"男人" + -0.139*"喜欢" + -0.136*"吃" + 0.120*"自己" + -0.113*"他们"
[topic #20]: -0.312*"豆瓣" + 0.259*"爱情" + 0.219*"妈妈" + -0.218*"你们" + 0.179*"中国" + -0.169*"男人" + -0.168*"女人" + 0.160*"爱" + -0.153*"您" + -0.138*"我们"
[topic #21]: 0.395*"爱情" + -0.341*"喜欢" + 0.231*"豆瓣" + -0.171*"啊" + -0.143*"中国" + -0.143*"元" + -0.135*"人" + -0.112*"你们" + 0.110*"阅读" + 0.106*"了"
[topic #22]: -0.304*"你们" + 0.296*"爱情" + 0.288*"孩子" + -0.240*"吃" + -0.220*"2012" + -0.167*"爱" + -0.158*"豆瓣" + -0.135*"一年" + 0.113*"他们" + 0.092*"元"
[topic #23]: 0.305*"我们" + 0.261*"妈妈" + -0.237*"爱" + -0.189*"爱情" + 0.188*"女人" + -0.160*"他们" + -0.159*"工作" + 0.140*"男人" + -0.126*"孩子" + -0.123*"我"
[topic #24]: 0.275*"爱" + -0.269*"啊" + 0.240*"豆瓣" + 0.231*"中国" + -0.213*"爱情" + -0.182*"工作" + -0.159*"喜欢" + -0.155*"我" + -0.123*"生活" + -0.109*"2012"
[topic #25]: 0.355*"你们" + -0.210*"我们" + 0.205*"孩子" + -0.166*"妈妈" + 0.142*"2012" + -0.139*"我" + -0.134*"啊" + -0.128*"爱" + -0.110*"电影" + -0.109*"人生"
[topic #26]: -0.304*"豆瓣" + 0.277*"孩子" + -0.270*"妈妈" + -0.168*"日" + -0.166*"他们" + 0.150*"2012" + -0.132*"您" + -0.130*"月" + -0.126*"元" + -0.113*"生活"
[topic #27]: -0.361*"元" + -0.214*"您" + 0.188*"豆瓣" + 0.172*"啊" + 0.167*"喜欢" + 0.141*"他们" + 0.117*"月" + 0.115*"日" + -0.114*"原价" + 0.112*"你们"
[topic #28]: -0.340*"2012" + 0.321*"你们" + -0.315*"您" + -0.226*"爱" + 0.195*"爱情" + -0.168*"我们" + 0.163*"中国" + 0.151*"妈妈" + -0.133*"孩子" + -0.115*"它"
[topic #29]: 0.276*"你们" + 0.245*"妈妈" + 0.219*"2012" + -0.186*"孩子" + -0.162*"豆瓣" + -0.156*"吃" + -0.154*"中国" + -0.143*"生活" + 0.131*"电影" + -0.113*"啊"
These are the 30 topics the model extracted. The separation between them does not look very good, which is probably related to the nature of Douban notes themselves.
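Almost every topic above is dominated by function words (我, 你, 的, 了, plus English the/of/to), which also hurts the separation. One possible improvement, not part of the original experiment, is to filter a stop-word list before building the dictionary; a minimal sketch with a small hand-picked stop-word set (the set below is illustrative only):

# illustrative stop-word filtering, applied to `texts` before corpora.Dictionary(texts);
# STOPWORDS is a hypothetical hand-picked set, not part of the original code
STOPWORDS = set(u'我 你 他 她 我们 你们 他们 的 了 是 在 和 与 着 啊 就 也 the to and of a I you is in that'.split())
texts = [[word for word in text if word not in STOPWORDS] for text in texts]
dictionary = corpora.Dictionary(texts)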