利用gensim主题模型寻找相似的coursera课程

参考http://www.52nlp.cn/如何计算两个文档的相似度三

#encoding=utf-8
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

# Load the corpus, one course per line. Each line is split on tabs and the
# first field is kept as the course name.
# The context manager closes the file handle as soon as the lines are read;
# the previous bare `file(...)` call never closed it explicitly.
with open('coursera_corpus') as corpus_file:
    courses = [line.strip() for line in corpus_file]
courses_name = [course.split('\t')[0] for course in courses]

# Tokenize each full course line (the name field included) and lower-case
# every token. Lines are decoded from UTF-8 before being tokenized.
texts_tokenized = [[word.lower() for word in word_tokenize(document.decode('utf-8'))]
                   for document in courses]

# Filter stop words out of the course corpus using NLTK's English stop-word list.
english_stopwords = stopwords.words('english')
# Every token is tested for membership, so look it up in a set built once
# instead of scanning the list for each token. The list itself is kept
# under its original name.
_stopword_lookup = frozenset(english_stopwords)
texts_filtered_stopwords = [[word for word in document if word not in _stopword_lookup]
                            for document in texts_tokenized]

# Filter out punctuation tokens.
english_punctuations = [',', '.', ':', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
_punctuation_lookup = frozenset(english_punctuations)
texts_filted = [[word for word in document if word not in _punctuation_lookup]
                for document in texts_filtered_stopwords]

# Reduce every remaining token to its stem with the Lancaster stemmer.
st = LancasterStemmer()
texts_stemmed = []
for document in texts_filted:
    texts_stemmed.append([st.stem(word) for word in document])

# Drop tokens that occur only once across the whole corpus.
from collections import defaultdict

# First pass: count how often each stem appears over all documents.
frequency = defaultdict(int)
for token in (stem for text in texts_stemmed for stem in text):
    frequency[token] += 1

# Second pass: keep, per document, only the stems counted more than once.
texts = []
for text in texts_stemmed:
    texts.append([token for token in text if frequency[token] > 1])

from gensim import corpora, models, similarities
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Build the token <-> integer id mapping and convert every course into a
# bag-of-words vector.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Re-weight the bag-of-words vectors with TF-IDF, then fit a 10-topic LSI
# model on the weighted corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
# The LSI model was fitted on TF-IDF vectors, so the documents that go into
# the similarity index must be TF-IDF weighted as well. Indexing the raw
# counts (lsi[corpus]) would project them with a model trained on a
# differently weighted input.
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

# Position of the query course in the corpus and number of results to show.
QUERY_INDEX = 174
TOP_N = 10
if not 0 <= QUERY_INDEX < len(texts):
    raise IndexError("query index %d is out of range for a corpus of %d courses"
                     % (QUERY_INDEX, len(texts)))

# Single-argument print calls emit the same text as the former print
# statements and also parse under Python 3.
query_header = "the query course is:"
print(query_header + " " + courses_name[QUERY_INDEX])
ml_course = texts[QUERY_INDEX]
ml_bow = dictionary.doc2bow(ml_course)
# Push the query through the same TF-IDF -> LSI pipeline as the indexed
# documents so that both live in the same vector space.
ml_lsi = lsi[tfidf[ml_bow]]
sims = index[ml_lsi]
# Rank all courses by descending similarity to the query.
# NOTE(review): the query course is itself part of the index, so it is
# expected to show up in its own result list - confirm this is intended.
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
courses_nameTop = [tup[0] for tup in sort_sims[0:TOP_N]]
courses_sim = [courses_name[num] for num in courses_nameTop]

print("the similarity courses are:")
for doc in courses_sim:
    print(doc)


你可能感兴趣的:(利用gensim主题模型寻找相似的coursera课程)