Python机器学习——利用sklearn进行情感分析

import jieba
from collections import defaultdict
import os
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer




def readfile(filename):
    """Return the non-blank lines of a UTF-8 text file, stripped of whitespace.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one stripped string per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark; with plain "utf-8" the BOM would stay glued to the first entry
    # (str.strip() does not remove U+FEFF) and that entry would never match.
    with open(filename, 'r', encoding='utf-8-sig') as fh:
        stripped = (line.strip() for line in fh)
        return [line for line in stripped if line != '']

#x=readfile("C:/Users/yyq/Desktop/毕业论文/文档1.txt")
#print(x)

#分词处理
def cut2wd(sentence, stopwds_path="C:/Users/yyq/Desktop/毕业论文/停用词表.txt"):
    """Segment ``sentence`` with jieba and drop every stop word.

    Args:
        sentence: Text to segment.
        stopwds_path: Stop-word file, one word per line, loaded through
            ``readfile``. Defaults to the path the script has always used.

    Returns:
        A list of the tokens produced by ``jieba.cut`` that are not listed
        in the stop-word file, in their original order.
    """
    # A set gives O(1) membership tests; the list returned by readfile would
    # be scanned once per token.
    stopwds = set(readfile(stopwds_path))
    return [w for w in jieba.cut(sentence) if w not in stopwds]


# Tokenize one short sample sentence at import time; the result stays in ``a``
# and printing it is left disabled.
a = cut2wd("我爱北京天安门")
# print(a)


#词频统计
def Count(words):
    """Count how often each token occurs across all of ``words``.

    Every element of ``words`` is passed to ``CountVectorizer`` as one
    document; the per-document counts are then summed per token.

    Args:
        words: A sequence of strings (for example the output of ``cut2wd``).

    Returns:
        A dict mapping each token (as ``str``) to its total count (``int``).
        Keys are the vectorizer's feature names, so they follow its
        tokenization rules (``CountVectorizer`` lowercases by default).
        An empty input yields an empty dict.
    """
    if not words:
        # CountVectorizer raises on an empty corpus; nothing to count anyway.
        return {}
    corpus = words
    # The custom token pattern keeps single-character tokens, which the
    # default pattern (two or more word characters) would discard.
    vectorizer = CountVectorizer(token_pattern="\\b\\w+\\b")
    # With norm=None and use_idf=False the transformer leaves raw term
    # frequencies untouched.
    transformer = TfidfTransformer(norm=None, use_idf=False)
    tf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases.
    names_fn = getattr(vectorizer, "get_feature_names_out", None)
    if names_fn is None:
        names_fn = vectorizer.get_feature_names
    word = names_fn()
    # Sum over the document axis. Updating the dict row by row would leave
    # only the counts of the last document in the result.
    totals = tf.toarray().sum(axis=0)
    return {str(w): int(c) for w, c in zip(word, totals)}

# Demo: run the counter on a small hand-written token list and print the
# resulting mapping.
b = Count(["我", "爱", "天安门", "爱", "明月"])
print(b)

#情感定位
def pos(wddict):
    """Split the words of ``wddict`` into sentiment, negation and degree words.

    Three lexicon files are loaded through ``readfile``:
    ``情感语料库`` (``word value`` separated by a space), ``否定词表`` (one
    word per line) and ``程度副词`` (``word,value`` separated by a comma).
    Lexicon lines that lack the second field are skipped.

    A word found in several lexicons is classified once, with the priority
    degree word > negation word > sentiment word.

    Args:
        wddict: Mapping from word to a value. That value becomes the key of
            the returned dicts.
            NOTE(review): ``score`` looks these keys up by token index, so
            the values are presumably meant to be positions -- confirm.

    Returns:
        A tuple ``(senwd, notwd, degreewd)`` of dicts keyed by
        ``wddict[word]``: ``senwd`` holds the sentiment value string from
        the lexicon, ``notwd`` holds ``-1`` and ``degreewd`` holds the
        degree value string.
    """
    sendict = {}
    for line in readfile("情感语料库"):
        fields = line.split(' ')
        if len(fields) < 2:
            continue  # malformed entry: no value after the word
        sendict[fields[0]] = fields[1]

    # Only membership is needed for negation words, so keep them in a set.
    notwords = set(readfile("否定词表"))

    degreedict = {}
    for line in readfile("程度副词"):
        fields = line.split(',')
        if len(fields) < 2:
            continue  # malformed entry: no value after the word
        degreedict[fields[0]] = fields[1]

    # Factory-less defaultdicts behave like plain dicts; the type is kept so
    # callers receive the same objects as before.
    senwd = defaultdict()
    notwd = defaultdict()
    degreewd = defaultdict()
    for word, value in wddict.items():
        if word in degreedict:
            degreewd[value] = degreedict[word]
        elif word in notwords:
            notwd[value] = -1
        elif word in sendict:
            senwd[value] = sendict[word]
    return senwd, notwd, degreewd

#情感得分计算
def score(senwd, notwd, degreewd, cutrst):
    """Compute a sentiment score for a tokenized sentence.

    The tokens are walked left to right. Each sentiment word adds
    ``w * value`` to the score, where ``w`` is a running weight starting
    at 1. After a sentiment word (except the last one), every negation word
    found up to the next sentiment word flips the sign of ``w`` and every
    degree word multiplies ``w`` by its value. ``w`` is never reset, so
    modifiers accumulate along the sentence; modifiers before the first
    sentiment word or after the last one have no effect.

    Args:
        senwd: Mapping from token position to sentiment value (anything
            ``float()`` accepts).
        notwd: Mapping whose keys are the positions of negation words.
        degreewd: Mapping from token position to degree multiplier
            (anything ``float()`` accepts).
        cutrst: The token list; only its length is used.

    Returns:
        The accumulated score: ``0`` when no sentiment word lies inside the
        sentence, otherwise a ``float``.
    """
    total = 0
    w = 1
    # Ascending positions, so that index ``senloc`` below refers to the
    # sentiment word being visited regardless of dict insertion order.
    sen_positions = sorted(senwd)
    last_index = len(sen_positions) - 1
    senloc = -1  # index into sen_positions of the current sentiment word
    for i in range(len(cutrst)):
        if i not in senwd:
            continue
        senloc += 1
        total += w * float(senwd[i])
        if senloc < last_index:
            # Fold the modifiers between this sentiment word and the next
            # one into the weight used for the following sentiment words.
            for j in range(sen_positions[senloc], sen_positions[senloc + 1]):
                if j in notwd:
                    w *= -1
                elif j in degreewd:
                    w *= float(degreewd[j])
    return total

# End-to-end run on one sample sentence: tokenize, build the word mapping,
# classify the words, then score the sentence.
str1 = "这样的工作很好"
cut = cut2wd(str1)  # was ``sut2wd``, a name that is never defined
wddict = Count(cut)
# NOTE(review): pos()/score() use the values of ``wddict`` as token
# positions, whereas Count() produces occurrence counts -- confirm which
# mapping is intended here.
senwd, notwd, degreewd = pos(wddict)
# score() needs the token list as its fourth argument.
rst = score(senwd, notwd, degreewd, cut)
print(rst)


#批量测试               
# Score every file in the positive-sentiment directory and print one result
# per file.
posdir = "积极情感文件"
allposfile = os.listdir(posdir)
for thisfile in allposfile:
    # os.listdir returns bare file names, so the directory must be joined
    # back on before opening; ``with`` closes the handle after reading.
    with open(os.path.join(posdir, thisfile), "r", encoding="gbk") as fh:
        text = fh.read()
    # Score the text that was just read (previously the content was thrown
    # away and the fixed sample sentence was scored on every iteration).
    cut = cut2wd(text)
    wddict = Count(cut)
    # NOTE(review): pos()/score() use the values of ``wddict`` as token
    # positions, whereas Count() produces occurrence counts -- confirm.
    senwd, notwd, degreewd = pos(wddict)
    rst = score(senwd, notwd, degreewd, cut)
    print(rst)

你可能感兴趣的:(python机器学习,python自然语言处理)