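"""
Information entropy: the larger the entropy, the more information a message
carries and the more distinctive its wording is compared with earlier messages.

Reference:
Entropy computation part: https://www.jianshu.com/p/468e2af86d59
"""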
import numpy as np
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
def calc_ent(x):
    """Return the Shannon entropy (in bits) of the values in ``x``.

    The probability of each distinct value is its relative frequency in
    ``x``; the entropy is ``-sum(p * log2(p))`` over the distinct values.

    Args:
        x: One-dimensional NumPy array of values (for example tokens).

    Returns:
        The entropy as a float. An empty array yields ``0.0``.
    """
    total = x.shape[0]
    if total == 0:
        # No observations means no uncertainty; also avoids dividing by zero.
        return 0.0

    # One pass yields the count of every distinct value.
    _, counts = np.unique(x, return_counts=True)
    probs = counts / total

    # Subtract from 0.0 rather than negating so that a single-valued input
    # yields 0.0 and not negative zero.
    return 0.0 - float(np.sum(probs * np.log2(probs)))
# Load the input spreadsheet into a DataFrame.
urls = pd.read_csv(r"E:\IdeasTest.csv", encoding="utf-8")

# Row count, echoed as a quick sanity check.
m = urls.shape[0]
print(m)

# Accumulates one list of processed tokens per row.
texts = list()
def removePunctuation(text, chars=None):
    """Delete punctuation characters from ``text`` and normalise it.

    Args:
        text: String to clean.
        chars: Characters to delete. When omitted, the module-level
            ``punctuation`` string is used (looked up at call time).

    Returns:
        ``text`` with every listed character removed, stripped of leading
        and trailing whitespace and converted to lower case.
    """
    if chars is None:
        chars = punctuation
    if chars:
        # The characters are placed inside a regex character class, so they
        # must be escaped: an unescaped ']' would close the class early and
        # '^', '-' or '\\' would change its meaning.
        text = re.sub('[{}]+'.format(re.escape(chars)), '', text)
    return text.strip().lower()
# English stop-word list from NLTK, plus a set view of it so the per-token
# membership test below is O(1).
en_stop = stopwords.words('english')
stop_set = frozenset(en_stop)

# Porter stemmer used to reduce every remaining token to its stem.
p_stemmer = PorterStemmer()

# Loop-invariant cleaning tables, built once instead of once per row.
# `punctuation` has to stay a module-level name: removePunctuation() reads it.
punctuation = '.!,;:?"\'"&/()+*=~@#$%^_{}[]|`°...、——【】‘’“”?《》,。·!……:;-<>'
# str.maketrans with three arguments builds a table that deletes characters.
remove = str.maketrans('', '', string.punctuation)
remove_digits = str.maketrans('', '', string.digits)
# Tokens that consist only of punctuation and are dropped after tokenizing.
punctuation_tokens = frozenset(['...','°','—','_','?','。',',','》','《','”','“',';',':','、','【','】','=','-','——','……','!','·','~','‘','’','/','^','-','+','<','>','{','}','*','//',',','.',':',';','?','(',')','[',']','&','!','$','%','*','@','|','`','#'])

for csvfile in urls['Ideas']:
    print(csvfile)
    # An empty CSV cell is read as NaN (a float); treat it as empty text so
    # the row still produces an (empty) token list instead of crashing.
    if pd.isna(csvfile):
        csvfile = ''
    lower = str(csvfile).lower()

    # Strip ASCII punctuation, then the extended punctuation set, then digits.
    without_punctuation = lower.translate(remove)
    without_punctuation1 = removePunctuation(without_punctuation)
    without_punctuation1 = without_punctuation1.translate(remove_digits)

    # Tokenize and discard any token that is just punctuation.
    tokens = [
        token
        for token in word_tokenize(without_punctuation1)
        if token not in punctuation_tokens
    ]
    print(tokens)

    stopped_tokens = [w for w in tokens if w not in stop_set]
    print(stopped_tokens)

    # Stem the remaining tokens.
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    print(stemmed_tokens)

    # Keep one token list per input row, in row order.
    texts.append(stemmed_tokens)

print(texts)
# One entropy value per row: each token list is converted to a NumPy array
# before being measured, and each result is wrapped in a single-element list
# so that it becomes one row of the output table.
content = [[calc_ent(np.array(texts[row]))] for row in range(m)]

# Write the results out as a one-column table.
dd = pd.DataFrame(content, columns=['entropy'])
dd.to_csv(r"E:\entropy.csv")