如何做情感分析,以京东评论为例(jieba+sklearn)

1、引言

作为我的自然语言处理系列的第一篇博客,这里简单地给大家演示用 jieba 分词、提取特征,再利用机器学习算法做情感分析的完整过程。照例我不多做说明,请直接看代码,代码中有注释。

2、介绍

我的文本来自上一篇博客爬取的京东评论,都存放在我的 MySQL 库里,所以这里只需从 MySQL 中提取想要的数据即可。

3、代码

import jieba
import pymysql
import collections, itertools
import nltk.classify.util, nltk.metrics
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Connect to the local MySQL instance and pull up to 10 000 review rows
# into `results`.  Passing charset= replaces the original's no-op
# `db.encoding = 'utf-8'` attribute and the redundant `set names utf8`.
# NOTE(review): credentials are hard-coded — move to config/env in real use.
db = pymysql.connect('localhost', 'root', 'xin123456789', 'test', charset='utf8')
try:
    cursor = db.cursor()
    try:
        # row schema assumed: (review_text, label) — verify against table `newjd`
        cursor.execute('SELECT * FROM newjd LIMIT 10000')
        results = cursor.fetchall()
    finally:
        # Always release the cursor and connection, even if the query fails
        # (the original leaked both on error; a SELECT needs no commit()).
        cursor.close()
finally:
    db.close()
# ---- Tokenise every review with jieba and drop stop words ----
# BUG FIX: the original built this stop-word set but never applied it
# (despite the "去停用词" comment), and the name `stopwords` shadowed the
# nltk corpus import at the top of the file; renamed and actually used.
stop_words = {'的', '包括', '等', '是', ' ', ',', '。'}

# Each element: (list_of_tokens, label).  row[0] is the review text and
# row[1] its sentiment label — assumed from the SELECT; verify the schema.
# (Renamed from `all`, which shadowed the builtin; unused `total` removed.)
tokenized = []
for row in results:
    tokens = [w for w in jieba.cut(row[0], cut_all=False) if w not in stop_words]
    tokenized.append((tokens, row[1]))

# Space-join tokens so sklearn's CountVectorizer can consume them.
com = [' '.join(tokens) for tokens, _ in tokenized]
label = [lab for _, lab in tokenized]

# BUG FIX: the original split at index 300000, but only 10 000 rows are
# fetched, which left the test set empty (predict on it would fail).
# Use a proportional 80/20 split instead.
split = int(len(com) * 0.8)
train = com[:split]
trainlabel = label[:split]
test = com[split:]
testlabel = label[split:]
# ---- Bag-of-words features, chi-square feature selection, tf-idf weighting ----
trainvectorizer = CountVectorizer()
# Learn the vocabulary on the training texts and map them to term-frequency
# vectors in one pass (fit + transform).
x_ = trainvectorizer.fit_transform(train)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the words with the highest chi-square score w.r.t. the labels as
# features; k is capped at the vocabulary size so SelectKBest cannot raise.
selector = SelectKBest(chi2, k=min(1000, x_.shape[1]))
X_2 = selector.fit_transform(x_, trainlabel)
print(X_2.shape)

# Map the test split through the SAME fitted vocabulary and selector.
# (The original also built an unused `testvectorizer`; removed.)
test = trainvectorizer.transform(test)
test = selector.transform(test)

# BUG FIX: the original referenced `traintfidf` without ever defining it
# (NameError).  Fit the tf-idf transformer on the training term-frequency
# matrix, then weight both splits with it.
traintfidf = TfidfTransformer().fit(X_2)
newtrain = traintfidf.transform(X_2)
testidf = traintfidf.transform(test)
print(newtrain)

# Data structure note: tfidf[i][j] is the tf-idf weight of term j in doc i.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Train a Multinomial Naive Bayes classifier with light Lidstone smoothing
# on the tf-idf training matrix, then predict the held-out split.
train_labels = np.array(trainlabel)
clf = MultinomialNB(alpha=0.01)
clf.fit(newtrain, train_labels)
pred = clf.predict(testidf)
print(pred)
def calculate_result(actual, pred):
    """Print precision, recall and F1 of *pred* against the *actual* labels."""
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print('predict info:')
    print(m_precision)
    print(m_recall)
    # BUG FIX: the original computed F1 against the global `testlabel`
    # instead of the `actual` argument, so the function lied whenever it
    # was called with any other label set.
    print(metrics.f1_score(actual, pred))

calculate_result(testlabel, pred)

# ---- Linear-kernel SVM on the same tf-idf features, for comparison ----
from sklearn.svm import SVC

banner = '*************************\nSVM\n*************************'
print(banner)

# The SVC default kernel is 'rbf'; a linear kernel suits sparse text features.
svclf = SVC(kernel='linear')
svclf.fit(newtrain, trainlabel)
pred = svclf.predict(testidf)
calculate_result(testlabel, pred)

你可能感兴趣的:(自然语言处理)