# Build a term-frequency matrix and derive TF and IDF from it.

""" 构建词频矩阵, 从而得到TF、IDF"""

import csv
import math
import re
import jieba
import numpy as np
import scipy.sparse as ss

# Load the supplementary user dictionary for jieba.
jieba.load_userdict("./words/dict.txt")

# Stop words to drop from the segmentation result: one entry per line of the
# stop-word file (text up to the first newline), plus the empty string and a
# single space.
with open("./words/stop_words", 'r') as stop_file:
    stopwords = [stop.split('\n')[0] for stop in stop_file]
# The blank tokens are added once, not once per line read, so the list does
# not fill up with duplicates.
stopwords.extend(['', ' '])

# Substrings that text_filter replaces with a separator before segmentation.
stop_word = [u'的', u'是', u'和', u'丨']


# Pre-split a sentence so that segmentation does not receive long runs of
# meaningless characters.
def text_filter(line):
    """Yield the non-empty fragments of ``line`` made of Chinese or ASCII letters.

    Every entry of the module-level ``stop_word`` list is first replaced by
    ``#``. The text is then split on every run of characters that are neither
    in the range U+4E00..U+9FA5 nor ASCII letters, so digits, punctuation,
    whitespace and the ``#`` placeholder all act as separators.

    Args:
        line: Text to split.

    Yields:
        str: Each non-empty fragment, in order of appearance.
    """
    placeholder = u'#'
    for token in stop_word:
        line = line.replace(token, placeholder)

    fragments = re.split(u'[^\u4e00-\u9fa5a-zA-Z]+', line)
    # re.split produces empty strings at the edges when the text starts or
    # ends with a separator; those are skipped.
    yield from (fragment for fragment in fragments if fragment)


def cut(txt):
    """Segment ``txt`` with jieba and return the words that are kept.

    The text is split into fragments by ``text_filter`` and each fragment is
    segmented with ``jieba.cut``. Tokens that appear in the module-level
    ``stopwords`` collection, or that are shorter than two characters, are
    dropped.

    Args:
        txt: Text to segment.

    Returns:
        list: The retained tokens, in order of appearance.
    """
    # Build the lookup set once per call: membership tests on the raw list
    # would scan it linearly for every single token.
    blocked = set(stopwords)
    word_list = []
    for text in text_filter(txt):
        word_list.extend(
            x for x in jieba.cut(text) if len(x) > 1 and x not in blocked
        )
    return word_list


def get_col():
    """Return the number of rows in ``20.csv``.

    Rows are counted as parsed by ``csv.reader``, so a quoted field that
    spans several physical lines still counts as one row. The rows are
    streamed rather than loaded into an array, which also means rows with
    differing numbers of fields are counted without error.

    Returns:
        int: Number of CSV rows (0 for an empty file).
    """
    # newline='' is what the csv module expects for files it reads itself.
    with open('20.csv', 'r', newline='') as handle:
        return sum(1 for _ in csv.reader(handle))


def word_matrix(col):
    """Build the word-by-document count matrix for ``20.csv``.

    Each CSV row is one document; its text is the concatenation of the first
    two fields (rows with fewer fields contribute whatever they have). The
    text is segmented with ``cut`` and only tokens consisting of two or more
    characters in the range U+4E00..U+9FA5 are counted.

    Counts are accumulated in a plain dict and the sparse matrix is built
    once at the end, instead of stacking a new row onto the matrix for every
    new word.

    Args:
        col: Number of documents, i.e. the number of matrix columns.

    Returns:
        tuple: ``(matrix, dictionary)`` where ``matrix`` is a
        ``scipy.sparse.csr_matrix`` of dtype int16 and shape
        ``(len(dictionary), col)`` whose entry ``[i, j]`` is the number of
        times word ``i`` occurs in document ``j``, and ``dictionary`` maps
        each word to its row index (assigned in order of first appearance).

    Raises:
        IndexError: If a document containing countable words has an index
            that does not fit into ``col`` columns.
    """
    chinese_word = re.compile("^[\u4e00-\u9fa5]+[\u4e00-\u9fa5]$")
    dictionary = {}
    counts = {}  # (word row, document column) -> occurrences

    with open('20.csv', 'r', newline='') as handle:
        for doc_idx, txt in enumerate(csv.reader(handle)):
            # For rows with at least two fields this equals txt[0] + txt[1].
            words = [x for x in cut(''.join(txt[:2])) if chinese_word.match(x)]
            if words and doc_idx >= col:
                raise IndexError(
                    'document index %d out of range for %d columns'
                    % (doc_idx, col)
                )
            for word in words:
                # len(dictionary) is evaluated before insertion, so a new
                # word receives the next free row index.
                row = dictionary.setdefault(word, len(dictionary))
                key = (row, doc_idx)
                counts[key] = counts.get(key, 0) + 1

    shape = (len(dictionary), col)
    if not counts:
        return ss.csr_matrix(shape, dtype=np.int16), dictionary

    rows, cols = zip(*counts)
    # Accumulate in int64 and cast afterwards so the stored dtype stays int16.
    data = np.fromiter(
        counts.values(), dtype=np.int64, count=len(counts)
    ).astype(np.int16)
    matrix = ss.csr_matrix((data, (rows, cols)), shape=shape, dtype=np.int16)
    return matrix, dictionary


# Number of rows (documents) in the CSV file. If it is already known, this
# call can be skipped and the number passed to word_matrix directly.
col = get_col()
print(col)

mat, dictionary = word_matrix(col)
print(mat.shape, '\n', dictionary)

for word in dictionary:
    idx = dictionary[word]
    row = mat[idx]
    # Term frequency: total number of occurrences over all documents.
    tf = int(row.sum())
    # Number of documents containing the word, plus one.
    nonz = row.nonzero()
    nonsum = len(nonz[0]) + 1
    idf = math.log(col / nonsum)
    print(word, tf, idf)

 

# You may also be interested in: (NLP)