TF-IDF 实现文本向量化和关键词提取——Python 版本

import math
from collections import Counter

import jieba
# Sample corpus used by the demo at the bottom of the file: six raw,
# unsegmented Chinese sentences. The tuple is the canonical definition; the
# individual ``textN`` names are unpacked from it so both forms stay in sync.
texts = (
    "女排北京奥运会夺冠",
    "北京奥运会的羽毛球男单决赛",
    "中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排",
    "公告显示闻泰科技与格力创投将共同出资设立珠海得尔塔科技有限公司作为指定收购主体然后以现金方式购买",
    "资料显示格力创投成立于2017年格力集团全资控股的一家投资机构共有1次公开投资事件参投企业主要以科技、医药为主",
    "欧菲光主要依靠这一相关业务成为苹果产业链为iPhone供货",
)
text1, text2, text3, text4, text5, text6 = texts


class ExtractKeyWord(object):
    """Vectorise a small corpus with TF-IDF and report each text's keywords.

    Every positional argument is one raw document (a string). The constructor
    segments the documents with ``jieba`` and eagerly computes the following
    instance attributes:

    * ``args``       -- the raw documents, as passed in
    * ``texts``      -- list of token lists, one per document
    * ``dictionary`` -- list of distinct tokens, in first-appearance order
    * ``tf``         -- ``tf[d][w]``: term frequency of word ``w`` in doc ``d``
    * ``idf``        -- ``idf[w]``: inverse document frequency of word ``w``
    * ``tfidf``      -- ``tfidf[d]``: list of ``(word, score)`` pairs

    All per-word lists are aligned with ``dictionary`` by index.
    """

    def __init__(self, *args):
        self.args = args
        # Tokenised corpus.
        self.texts = self.token()
        # Vocabulary. The attribute keeps its historical name, which hides the
        # class-level ``dictionary`` alias on instances; use
        # ``build_dictionary()`` to rebuild it after construction.
        self.dictionary = self.build_dictionary()
        # Term frequencies.
        self.tf = self.compute_tf()
        # Inverse document frequencies.
        self.idf = self.compute_idf()
        # TF-IDF scores.
        self.tfidf = self.compute_tfidf()

    def token(self):
        """Segment every raw document with ``jieba.lcut``.

        Returns:
            A list with one token list per document in ``self.args``.
        """
        return [jieba.lcut(content) for content in self.args]

    def build_dictionary(self):
        """Collect the distinct tokens of the corpus.

        ``dict.fromkeys`` is used instead of ``set`` so that the vocabulary
        order (first appearance in the corpus) is the same on every run;
        string hashing is randomised per process, so a ``set`` would make
        tie-breaking between equally scored keywords non-deterministic.

        Returns:
            A list of unique tokens.
        """
        return list(dict.fromkeys(word for text in self.texts for word in text))

    # Backward-compatible name of the vocabulary builder on the class.
    dictionary = build_dictionary

    def compute_tf(self):
        """Compute the term frequency of every vocabulary word per document.

        The frequency is the word's count divided by the number of tokens in
        that document. An empty document yields all zeros rather than a
        division by zero.

        Returns:
            A list (one entry per document) of lists of floats aligned with
            ``self.dictionary``.
        """
        tf = []
        for text in self.texts:
            counts = Counter(text)  # one pass instead of list.count per word
            total = len(text)
            tf.append(
                [counts[word] / total if total else 0.0 for word in self.dictionary]
            )
        return tf

    def compute_idf(self):
        """Compute the inverse document frequency of every vocabulary word.

        Uses ``log10(N / df + 1)`` where ``N`` is the number of documents and
        ``df`` the number of documents containing the word. Every vocabulary
        word built by ``build_dictionary`` occurs in at least one document,
        so ``df`` is never zero for it.

        Returns:
            A list of floats aligned with ``self.dictionary``.
        """
        # Sets give O(1) membership tests; presence, not count, matters here.
        doc_vocabularies = [set(text) for text in self.texts]
        doc_count = len(doc_vocabularies)
        idf = []
        for word in self.dictionary:
            doc_freq = sum(1 for vocab in doc_vocabularies if word in vocab)
            idf.append(math.log10(doc_count / doc_freq + 1))
        return idf

    def compute_tfidf(self):
        """Multiply each document's TF vector by the IDF vector.

        Returns:
            A list (one entry per document) of ``(word, score)`` tuples in
            vocabulary order.
        """
        return [
            [
                (word, tf_value * idf_value)
                for word, tf_value, idf_value in zip(self.dictionary, doc_tf, self.idf)
            ]
            for doc_tf in self.tf
        ]

    def top_keywords(self, top_k=3):
        """Return the highest-scoring words of every document.

        Only words with a positive score are considered: a zero score means
        the word does not occur in the document, so it cannot be one of its
        keywords. A document may therefore yield fewer than ``top_k`` words.

        Args:
            top_k: Maximum number of keywords per document.

        Returns:
            A list (one entry per document) of keyword lists, best first.
        """
        result = []
        for scores in self.tfidf:
            present = [pair for pair in scores if pair[-1] > 0]
            # sorted() is stable, so ties keep vocabulary order.
            ranked = sorted(present, key=lambda pair: pair[-1], reverse=True)
            result.append([word for word, _ in ranked[:top_k]])
        return result

    def extractKeyWord(self, top_k=3):
        """Print the top keywords of every document, one line per document.

        Args:
            top_k: Maximum number of keywords printed per document.
        """
        for sentence_index, key_word in enumerate(self.top_keywords(top_k)):
            print(f'第{sentence_index+1}句话的关键词为{key_word}')


# Demo: build the extractor over the sample corpus (the constructor tokenises
# the texts and computes every table), then print each sentence's keywords.
key = ExtractKeyWord(*texts)
key.extractKeyWord()

你可能感兴趣的:(tf-idf,python)