python信息熵的计算

信息熵:信息熵越大,表示消息的不确定性越高、所含信息量越大,用词也越多样、独特。

参考:
信息熵求解部分:https://www.jianshu.com/p/468e2af86d59

导入包

import numpy as np
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

计算信息熵的方法

def calc_ent(x):
    """Compute the Shannon entropy (base 2) of a 1-D array of samples.

    Args:
        x: 1-D numpy array of hashable values (e.g. token strings).

    Returns:
        float: entropy in bits; 0.0 for an empty or single-valued array.
    """
    total = x.shape[0]
    ent = 0.0
    # Iterate over the distinct values; set(x) replaces the original
    # verbose set-comprehension over indices.
    for value in set(x):
        p = float(x[x == value].shape[0]) / total
        # p > 0 for every value actually present, so log2 is defined.
        ent -= p * np.log2(p)
    return ent

文本预处理

# Load the ideas corpus; each row's "Ideas" column holds one document.
# NOTE(review): hard-coded Windows path — adjust for your environment.
urls=pd.read_csv(r"E:\IdeasTest.csv",encoding="utf-8")
m=len(urls)  # number of documents
print(m)
texts=[]  # one list of stemmed tokens per document, filled by the loop below

# Default character set stripped by removePunctuation; identical to the
# ``punctuation`` string assigned inside the preprocessing loop below.
_DEFAULT_PUNCTUATION = '.!,;:?"\'"&/()+*=~@#$%^_{}[]|`°...、——【】‘’“”?《》,。·!……:;-<>'

def removePunctuation(text, chars=None):
    """Remove punctuation characters from *text*, then strip and lowercase.

    Args:
        text: input string.
        chars: optional string of characters to delete; defaults to the
            same punctuation set the preprocessing loop uses.

    Returns:
        Lower-cased *text* with the punctuation characters removed and
        surrounding whitespace stripped.
    """
    if chars is None:
        chars = _DEFAULT_PUNCTUATION
    # re.escape prevents characters like ']', '-' or '^' from acting as
    # regex metacharacters inside the character class (the original
    # unescaped format call produced a broken/incorrect pattern).
    pattern = '[{}]+'.format(re.escape(chars))
    return re.sub(pattern, '', text).strip().lower()

# create English stop words list
en_stop = stopwords.words('english')
# English stemming (Porter stemmer)
p_stemmer = PorterStemmer()

# Per-document preprocessing: lowercase, strip punctuation and digits,
# tokenize, drop residual punctuation tokens and stop words, then stem.
for j in range(0,m):
    print(urls['Ideas'][j])
    csvfile=urls['Ideas'][j]
    # Punctuation set covering both ASCII and full-width CJK symbols.
    # NOTE(review): loop-invariant — could be hoisted above the loop.
    punctuation = '.!,;:?"\'"&/()+*=~@#$%^_{}[]|`°...、——【】‘’“”?《》,。·!……:;-<>'
    lower = csvfile.lower()
    # str.maketrans builds a translation table that deletes every
    # character in string.punctuation
    remove = str.maketrans('','',string.punctuation) 
    without_punctuation = lower.translate(remove)
    without_punctuation1=removePunctuation(without_punctuation)
    for c in string.digits: # remove digit characters
        without_punctuation1 = without_punctuation1.replace(c, '')
    tokens=word_tokenize(without_punctuation1)

    # Delete any leftover punctuation tokens in place; the index and the
    # cached length are stepped back after each deletion so no element
    # is skipped.
    length1=len(tokens)
    x1=0
    while x1 < length1:
        if tokens[x1] in ['...','°','—','_','?','。',',','》','《','”','“',';',':','、','【','】','=','-','——','……','!','·','~','‘','’','/','^','-','+','<','>','{','}','*','//',',','.',':',';','?','(',')','[',']','&','!','$','%','*','@','|','`','#']:
            del tokens[x1]
            x1 -= 1
            length1 -= 1
        x1 += 1
    print(tokens)

    stopped_tokens = [w for w in tokens if not w in en_stop]
    print(stopped_tokens)
    # stem the surviving tokens (Porter stemmer)
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    print(stemmed_tokens)
    
    # add tokens to list
    texts.append(stemmed_tokens)
print(texts)

文档信息熵计算

# Compute the Shannon entropy of each preprocessed document and save the
# results to a CSV file (one row per document).
content=[]
for x in range(0,m):
    # Convert the token list into a numpy array so calc_ent can use
    # boolean indexing on it.
    textnp = np.array(texts[x])
    entropy=calc_ent(textnp)
    content.append([entropy])
# Build the DataFrame and write the file once, after the loop: the
# original rebuilt the DataFrame and rewrote the CSV on every iteration,
# which is redundant I/O with an identical final result.
dd=pd.DataFrame(content,columns=['entropy'])
dd.to_csv(r"E:\entropy.csv")

你可能感兴趣的:(文本挖掘,python,信息熵)