文本分类(3)——文本转词向量

001 常见词向量表达

https://blog.csdn.net/sinat_26917383/article/details/52162589
https://blog.csdn.net/hubin232/article/details/81272126 【比较新】

我直接用 BoW(词袋模型)的词频作为词的权重。
下面将测试文本用 BoW 表示:

import json
import os
from collections import Counter
import time
#每篇文档用bow表示
def gen_vector(path, out_path='C:/lyr/DM/feature_reduction/testVector.json'):
    """Build a bag-of-words vector for every document under *path* and save it as JSON.

    The corpus is expected to be laid out as ``path/<category>/<file>``. Each
    file is read as UTF-8 text and split on whitespace; every distinct word is
    weighted by its relative frequency (occurrences / total words in the file).

    Args:
        path: Root directory of the corpus. A trailing path separator is
            accepted but not required.
        out_path: File the resulting mapping is written to as JSON. Defaults
            to the location the original script used.

    Returns:
        The mapping that was written:
        ``{category: {file_name: {word: relative_frequency}}}``.
        Files without any words map to an empty dict.
    """
    test_vector = {}
    for category in os.listdir(path):
        category_dir = os.path.join(path, category)
        # Stray files in the corpus root are not categories; listing them would raise.
        if not os.path.isdir(category_dir):
            continue
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), '>', '=' * 30 + '[' + category + ']' + '=' * 30)
        doc_vectors = test_vector.setdefault(category, {})
        for file_name in os.listdir(category_dir):
            full_path = os.path.join(category_dir, file_name)
            # Nested directories cannot be opened as documents; skip them.
            if not os.path.isfile(full_path):
                continue
            with open(full_path, 'r', encoding='utf-8') as f:
                words = f.read().split()
            total_len = len(words)
            # most_common() keeps the words ordered by descending count, so the
            # JSON key order matches the frequency ranking. An empty file yields
            # an empty Counter, so no division by zero can occur here.
            doc_vectors[file_name] = {
                word: count / total_len
                for word, count in Counter(words).most_common()
            }
    with open(out_path, 'w', encoding='utf-8') as fp:
        json.dump(test_vector, fp)
    return test_vector

# Run the bag-of-words vectorisation over the test corpus directory.
test_path = 'C:\\lyr\\DM\\test_cut\\'
gen_vector(path=test_path)

你可能感兴趣的:(学习学习学习)