基于TF-IDF的关键字提取

TF-IDF:衡量某个词对文章的重要性,由 TF 和 IDF 两部分相乘组成
TF:词频(因素:某词在同一文章中出现次数)
IDF:反文档频率(因素:某词是否在不同文章中出现)
TF-IDF = TF*IDF

词频(TF)=某个词在当前文章中出现的次数 / 当前文章的总词数
反文档频率(IDF)=Log( 语料库的文档总数 / ( 包含该词的文档数+1 ) )

"""
Step 1: preprocess the raw corpus into one line per document
Step 2: build the IDF table from the preprocessed corpus
Step 3: extract keywords (tf-idf scores) for a new sentence
"""
file_path_dir = "./data"   # input directory: one file per document
raw_path = './raw.data'    # intermediate output: "<doc_id>\t<content>" per line
idf_path = './idf.data'    # final table: "<token>\t<idf score>" per line


def read_file_handler(file_path):
    """Open *file_path* for reading as UTF-8 text and return the file object.

    The caller is responsible for closing the returned handle
    (e.g. by using it in a ``with`` statement).
    """
    return open(file_path, 'r', encoding='utf-8')


# Step 1: flatten every document under ``file_path_dir`` into one line of
# ``raw.data`` formatted as "<doc_id>\t<space-joined content>".
import os  # needed for os.listdir; the original script never imported it

file_name = 0  # running document id; after the loop it is the document count
with open(raw_path, 'w', encoding='utf-8') as file_raw_out:
    for fd in os.listdir(file_path_dir):
        file_path = file_path_dir + '/' + fd
        # read_file_handler returns an open file object; close it via ``with``
        # (the original leaked one handle per document).
        with read_file_handler(file_path) as file_fd:
            # strip() removes surrounding whitespace/tabs from each line
            content_list = [line.strip() for line in file_fd]
        content = '\t'.join([str(file_name), ' '.join(content_list)]) + '\n'
        # write(), not writelines(): we are emitting a single string
        file_raw_out.write(content)
        file_name += 1

# Step 2 (map phase): emit a (word, '1') pair once per distinct word per
# document, mimicking a MapReduce word-count over documents.
docs_cnt = file_name  # total number of documents written in step 1
wc_tulist = []

with open(raw_path, 'r', encoding='utf-8') as fd:
    for line in fd:
        parts = line.strip().split('\t')
        if len(parts) != 2:  # skip malformed lines
            continue
        # NOTE: the original unpacked into ``file_name``, clobbering the
        # module-level document counter with a string; use a local name.
        doc_id, doc_content = parts
        # de-duplicate within a document so each document contributes at
        # most 1 to a word's document frequency
        for word in set(doc_content.strip().split(' ')):
            wc_tulist.append((word, '1'))

# Step 2 (reduce phase): sort so equal words are adjacent, count each
# word's document frequency, and write its IDF score to ``idf.data``.
import math  # needed for math.log; the original script never imported it

wc_sort_tulist = sorted(wc_tulist, key=lambda x: x[0])

with open(idf_path, 'w', encoding='utf-8') as file_idf_out:
    current_word = None
    doc_freq = 0  # renamed from ``sum`` — don't shadow the builtin
    for word, val in wc_sort_tulist:
        if current_word is None:  # ``is None``, not ``== None``
            current_word = word
        if current_word != word:
            # IDF = log(total docs / (docs containing word + 1))
            idf = math.log(float(docs_cnt) / (float(doc_freq) + 1.0))
            file_idf_out.write('\t'.join([current_word, str(idf)]) + '\n')
            current_word = word
            doc_freq = 0
        # Accumulating is correct because step 2a de-duplicated words per
        # document: duplicates here always come from distinct documents.
        doc_freq += int(val)
    # Flush the final word.  Guard against an empty corpus — the original
    # crashed here joining ``None`` when wc_sort_tulist was empty.
    if current_word is not None:
        idf = math.log(float(docs_cnt) / (float(doc_freq) + 1.0))
        file_idf_out.write('\t'.join([current_word, str(idf)]) + '\n')

# Step 3: score a new (pre-segmented) sentence against the IDF table.
input_str = "我们 带来 阿里巴巴 希望 差 差 差"

token_idf_dict = {}

# Load the idf table into memory: one "token\tidf" pair per line.
with open(idf_path, 'r', encoding='utf-8') as fd:
    for line in fd:
        fields = line.strip().split('\t')
        if len(fields) != 2:  # skip malformed lines
            continue
        token, idf_score = fields
        # key: token, value: its idf score (kept as str; cast on use)
        token_idf_dict[token] = idf_score


def get_tfidf(input_str, idf_dict=None):
    """Yield (token, tf-idf) pairs for each scorable token of *input_str*.

    TF is the normalized term frequency count(token) / total tokens,
    matching the TF formula documented at the top of this file (the
    original code used the raw count, contradicting that formula).
    Tokens absent from the idf table are skipped.

    Args:
        input_str: whitespace-separated tokens (a pre-segmented sentence).
        idf_dict: optional mapping token -> idf score (str or float);
            defaults to the module-level ``token_idf_dict``.

    Yields:
        (token, tf_idf) tuples, one per distinct token found in the table.
    """
    if idf_dict is None:
        idf_dict = token_idf_dict

    tokens = input_str.strip().split(' ')
    total = len(tokens)

    # count occurrences of each token
    token_counts = {}
    for t in tokens:
        token_counts[t] = token_counts.get(t, 0) + 1

    for token, cnt in token_counts.items():
        if token not in idf_dict:
            continue
        tf_score = cnt / total  # TF = count / total tokens
        yield (token, tf_score * float(idf_dict[token]))


# Print every (token, tf-idf) pair for the demo sentence.
for token, score in get_tfidf(input_str):
    print(token, score)

你可能感兴趣的:(大数据)