从excel读取数据,利用情感词典进行文本分类

#从excel读取数据,利用情感词典进行文本分类
数据集:从微博爬取的美妆博主的评论
情感词典:BosonNLP_情感词典
准确率:预测了3000条数据,准确率约53%。对中性评论不敏感。


import openpyxl

import codecs
from collections import defaultdict
import jieba
import xlrd

#Tokenize a sentence and remove stop words
# Raw string: the Windows path contains backslashes that must stay literal.
_STOPWORDS_PATH = r'D:\个人资料\数据集\情感词典_微博专用\chineseStopWords.txt'


def seg_word(sentence, stopwords_path=_STOPWORDS_PATH):
    """Split *sentence* into words with jieba and drop the stop words.

    Args:
        sentence: Text to tokenize.
        stopwords_path: UTF-8 text file with one stop word per line.
            Defaults to the original hard-coded location.

    Returns:
        A list of the tokens produced by ``jieba.cut`` (in order) that do
        not appear in the stop-word file.
    """
    # Materialize the generator returned by jieba.cut.
    seg_result = list(jieba.cut(sentence))

    # The context manager closes the file even if reading/decoding fails.
    # NOTE: the stop-word file is re-read on every call.
    with codecs.open(stopwords_path, 'r', 'utf-8') as fr:
        stopwords = {line.strip() for line in fr}

    return [word for word in seg_result if word not in stopwords]

#Classify tokenized words into sentiment words, negation words and degree adverbs
# Raw strings: the Windows paths contain backslashes that must stay literal.
_SENTIMENT_DICT_PATH = r'D:\个人资料\数据集\BosonNLP_情感词典\BosonNLP_sentiment_score\BosonNLP_sentiment_score.txt'
_NEGATION_WORDS_PATH = r'D:\个人资料\数据集\情感词典_微博专用\否定词.txt'
_DEGREE_WORDS_PATH = r'D:\个人资料\数据集\情感词典_微博专用\程度级别词语(中文).txt'


def _read_lines(path):
    """Return the lines of a UTF-8 text file without line terminators."""
    # Read-only mode is sufficient; the context manager always closes the file.
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read().splitlines()


def classify_words(word_list, sen_path=_SENTIMENT_DICT_PATH,
                   not_path=_NEGATION_WORDS_PATH,
                   degree_path=_DEGREE_WORDS_PATH):
    """Sort the words of *word_list* into three lexicon categories.

    Args:
        word_list: Iterable of words (e.g. the result of ``seg_word``).
        sen_path: Sentiment lexicon; each useful line is ``word score``
            separated by a single space.
        not_path: Negation word list, one word per line.
        degree_path: Degree adverb lexicon; each useful line is
            ``word,weight``.

    Returns:
        A tuple ``(sen_word, not_word, degree_word)`` of dicts keyed by the
        word itself. Values are the score text from the sentiment lexicon,
        ``-1`` for negation words, and the weight text from the degree
        lexicon. A word that occurs several times appears once.
    """
    # Sentiment lexicon: keep only lines that split into exactly two fields.
    sen_dict = {}
    for line in _read_lines(sen_path):
        parts = line.split(' ')
        if len(parts) == 2:
            sen_dict[parts[0]] = parts[1]

    # Negation words carry no score; a set gives O(1) membership tests.
    not_words = set(_read_lines(not_path))

    # Degree adverbs: skip blank/malformed lines instead of raising IndexError.
    degree_dic = {}
    for line in _read_lines(degree_path):
        parts = line.split(',')
        if len(parts) >= 2:
            degree_dic[parts[0]] = parts[1]

    sen_word = {}
    not_word = {}
    degree_word = {}

    # Precedence: degree adverb, then negation word, then sentiment word.
    for word in word_list:
        if word in degree_dic:
            degree_word[word] = degree_dic[word]
        elif word in not_words:
            not_word[word] = -1
        elif word in sen_dict:
            sen_word[word] = sen_dict[word]

    return sen_word, not_word, degree_word

#Score every sentiment word with its accumulated weight and add the scores up
def score_sentiment(sen_word, not_word, degreen_word, seg_result):
    """Compute the sentiment score of a tokenized sentence.

    Walks *seg_result* in order. Degree adverbs multiply the running weight
    by their value, negation words flip its sign, and each sentiment word
    adds ``weight * score`` to the total and resets the weight to 1.

    Args:
        sen_word: Mapping of sentiment word -> score (number or numeric text).
        not_word: Mapping whose keys are the negation words.
        degreen_word: Mapping of degree adverb -> weight (number or numeric text).
        seg_result: Sequence of words of the sentence.

    Returns:
        The accumulated score; ``0`` when no sentiment word was found.
    """
    weight = 1
    total = 0
    for token in seg_result:
        if token in degreen_word:
            # Degree adverb: scale the weight of the next sentiment word.
            weight *= float(degreen_word[token])
            continue
        if token in not_word:
            # Negation word: invert the polarity.
            weight = weight * (-1)
            continue
        if token in sen_word:
            # Sentiment word: consume the pending weight.
            total += float(weight) * float(sen_word[token])
            weight = 1
    return total

'''def score_sentiment(sen_word, not_word, degree_word, seg_result):
    #
    W = 1
    score =0
    #
    sentiment_index = -1
    sentiment_index_list = list(sen_word.keys())
    #
    for i in range(0, len(seg_result)):
        #
        if i in sen_word.keys():
            score += W * float(sen_word[i])
            sentiment_index += 1
            if sentiment_index < len(sentiment_index_list) - 1:
                for j in range(sentiment_index_list[sentiment_index], sentiment_index_list[sentiment_index + 1]):
                    #
                    if j in not_word.keys():
                        W *= -1
                    elif j in degree_word.keys():
                        W *= float(degree_word[j])
        if sentiment_index < len(sentiment_index_list) - 1:
            i = sentiment_index_list[sentiment_index + 1]
    return score#
'''
#Chain the individual steps together
def sentiment_score(sentence):
    """Return the sentiment score of *sentence*.

    Tokenizes the sentence with ``seg_word``, classifies the tokens with
    ``classify_words`` and passes both results to ``score_sentiment``.
    """
    tokens = seg_word(sentence)
    sentiment_hits, negation_hits, degree_hits = classify_words(tokens)
    return score_sentiment(sentiment_hits, negation_hits, degree_hits, tokens)


#if __name__ == '__main__':
    #score=sentiment_score('我很开心。')

#score=sentiment_score("我很高兴也很开心。")
#print(sentiment_score("我从不开心"))


#Load the Excel workbook
# Raw string: the Windows path contains backslashes that must stay literal.
book = openpyxl.load_workbook(r"D:\软件\python-3.7.0\weiboComments.xlsx")
#Select the worksheet named "sheet"; indexing replaces the deprecated
#Workbook.get_sheet_by_name() and likewise raises KeyError if it is missing
sh = book["sheet"]
'''
print('预测极性')
i = 1
while (i <= sh.max_row):
    s = str(i)
    if sh['A'+s].value !=None:#判断A列单元格是否为空
        #print(str(sh['A'+s].value))#输出A列单元格的值
        #计算每一条评论的得分并输出
        grade = sentiment_score(str(sh['A'+s].value))
        #print(grade)#输出评论的得分

        #If the score is > 0, label it 1; if it is < 0, label it -1
        if grade > 0:
            t = 1
        elif grade == 0:
            t = 0
        else:
            t = -1
        
        #print(t)#输出标记(1或-1)'''
    #else :
        #break
   # i += 1
    
#Check whether the predicted polarity equals the true polarity:
#print 0 for a row whose two labels match and 2 otherwise, then print the totals.
# NOTE(review): the labels are read from columns B and D; which of the two
# holds the prediction is not visible here -- confirm against the workbook.
print('预测极性是否与真实极性相同')
num0 = 0  # number of rows whose labels match
num2 = 0  # number of rows whose labels differ
for j in range(1, sh.max_row + 1):
    s = str(j)
    b_value = sh['B' + s].value
    if b_value is None:
        continue  # rows without a value in column B are not counted
    d_value = sh['D' + s].value
    if d_value is None:
        continue  # rows without a value in column D are not counted
    # The string forms of the two cells are compared.
    m1 = str(b_value)
    m2 = str(d_value)
    if m1 == m2:
        mm = 0
        num0 += 1
    else:
        mm = 2
        num2 += 1
    print(mm)
print("相同的数量为:")
print(num0)
print("不同的数量为:")
print(num2)


#计算准确率召回率            

你可能感兴趣的:(文本情感分类)