In the second part of the earlier post 情感分析方法之nltk情感分析器和SVM分类器(二), I only wrote up the final step, the classifier itself; this post records the four preprocessing steps that come before it.
1. Consolidating the raw corpus
# -*- coding: utf-8 -*-
# Merge the raw positive/negative corpus files into one txt file per class
import os

# Directory holding the corpus folders and the merged result files
path = r"D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/"
folder = [r'neg', r'pos']

# Read the full content of one review file
def getContent(filename):
    # The Chinese files must be opened in binary mode ('rb');
    # read() rather than readline() keeps multi-line reviews whole
    with open(filename, 'rb') as f:
        contents = f.read()
    return contents

# Walk each class folder and append every review to a single txt file
for name in folder:
    result_path = path + name + r'.txt'
    # The merged Chinese file is likewise opened in binary mode ('wb+')
    result_file = open(result_path, 'wb+')
    source_file = path + name
    for picname in os.listdir(source_file):
        picpath = os.path.join(source_file, picname)
        content = getContent(picpath)
        # Trim surrounding whitespace and append '\n' so each review
        # occupies exactly one line in the merged file
        result_file.write(content.strip() + b'\n')
    result_file.close()
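As a quick sanity check (my addition, reusing the path variable from the script above), you can count the merged lines; ChnSentiCorp_htl_ba_2000 ships 1000 reviews per class, so each file should come out to about 1000 lines:

# Sanity check (not part of the original pipeline): count merged reviews.
for name in ['neg', 'pos']:
    with open(path + name + '.txt', 'rb') as f:
        print(name, sum(1 for _ in f))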
2. Word segmentation, text cleaning, and text splitting
# -*- coding: utf-8 -*-
# Process the corpus with jieba: word segmentation, text cleaning, text splitting
import jieba
import re
import codecs

def prepareData(sourcefile, targetfile):
    source = codecs.open(sourcefile, 'rb')
    target = codecs.open(targetfile, 'w', encoding='utf-8')
    for line in source:
        # The raw corpus files are GBK-encoded, so decode each line first
        line = line.decode('gbk', 'ignore')
        line = clearTxt(line)
        seg_line = sent2word(line)
        target.writelines(seg_line + '\n')
    print('well done!')
    source.close()
    target.close()

# Strip special symbols from the text
def clearTxt(line):
    if line != '':
        line = line.strip()
        # Remove English letters and digits
        line = re.sub("[a-zA-Z0-9]", "", line)
        # Remove Chinese and English punctuation
        line = re.sub("[\s+\.\!\/_,$%^*(+\"\';:“”.]+|[+——!,。??、~@#¥%……&*()]+", "", line)
    return line

# Segment one line with jieba into a space-separated string
def sent2word(line):
    segList = jieba.cut(line, cut_all=False)
    segSentence = ''
    for word in segList:
        if word != '\t':
            segSentence += word + ' '
    return segSentence

if __name__ == '__main__':
    sourcefile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/neg.txt'
    targetfile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/neg_cut.txt'
    prepareData(sourcefile, targetfile)
    sourcefile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/pos.txt'
    targetfile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/pos_cut.txt'
    prepareData(sourcefile, targetfile)
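To see what the cleaning and segmentation actually produce, here is a minimal demo (my addition, run in the same session as the functions above); the exact tokens depend on your jieba version and dictionary:

# Minimal demo: push one hotel-review-style sentence through both steps.
sample = '房间很干净,服务也不错!WiFi信号满格。'
cleaned = clearTxt(sample)        # letters, digits and punctuation removed
print(cleaned)
print(sent2word(cleaned))         # space-separated jieba tokens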
3. Removing stopwords
# -*- coding: utf-8 -*-
# Remove stopwords from the segmented text
import codecs

# Load the stopword list, then filter every line of the segmented corpus
def stopword(sourcefile, targetfile, stopkeyfile):
    source = codecs.open(sourcefile, 'rb')
    target = codecs.open(targetfile, 'w', encoding='utf-8')
    # Decode the stopword list (stopWord.txt is assumed to be UTF-8 here) so
    # its entries compare equal to the decoded words below; reading it as raw
    # bytes would never match a str, and no stopword would ever be removed
    stopfile = codecs.open(stopkeyfile, 'r', encoding='utf-8').readlines()
    stopkey = [w.strip() for w in stopfile]
    for line in source:
        line = line.decode('utf-8', 'ignore')
        sentence = delstopword(line, stopkey)
        target.writelines(sentence + '\n')
    print('well done!')
    source.close()
    target.close()

# Drop stopwords from one space-separated line
def delstopword(line, stopkey):
    wordlist = line.split(' ')
    sentence = ''
    for word in wordlist:
        word = word.strip()
        if word not in stopkey:
            if word != '\t':
                sentence += word + ' '
    return sentence.strip()

if __name__ == '__main__':
    sourcefile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\neg_cut.txt'
    targetfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\neg_cut_stopdel.txt'
    stopkeyfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\stopWord.txt'
    stopword(sourcefile, targetfile, stopkeyfile)
    sourcefile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\pos_cut.txt'
    targetfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\pos_cut_stopdel.txt'
    stopword(sourcefile, targetfile, stopkeyfile)
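A quick check of delstopword against a throwaway three-word stopword list (my addition; the real pipeline uses stopWord.txt):

# Quick check: filter a segmented line against a tiny stopword list.
tiny_stopkey = ['的', '了', '也']
print(delstopword('房间 很 干净 的 服务 也 不错 了', tiny_stopkey))
# -> 房间 很 干净 服务 不错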
4. Text vectorization: building feature word vectors
# -*- coding: utf-8 -*-
# Text vectorization: look up a word2vec vector for each feature word
import codecs
import gensim
import numpy as np
import pandas as pd

# Look up the vector of every word in the list, skipping out-of-vocabulary words
def getwordvecs(wordlist, model):
    vecs = []
    for word in wordlist:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word])
        except KeyError:
            continue
    return np.array(vecs, dtype='float')

# Build one vector per line by averaging the word vectors of that line
def buildvecs(filename, model):
    filevec = []
    with codecs.open(filename, 'rb') as contents:
        for line in contents:
            line = line.decode('utf-8', 'ignore')
            wordlist = line.split(' ')
            vecs = getwordvecs(wordlist, model)
            if len(vecs) > 0:
                # Average over the words: one fixed-length vector per review
                filevec.append(vecs.mean(axis=0))
    return filevec

# The word2vec model has already been trained; here it only needs loading
if __name__ == '__main__':
    fdir1 = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/wiki_zh_word2vec-master/'
    fdir2 = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/'
    modelinput = fdir1 + 'wiki.zh.text.vector'
    model = gensim.models.KeyedVectors.load_word2vec_format(modelinput, binary=False)
    posinput = buildvecs(fdir2 + 'pos_cut_stopdel.txt', model)
    neginput = buildvecs(fdir2 + 'neg_cut_stopdel.txt', model)
    # Label positive reviews 1 and negative reviews 0
    Y = np.concatenate((np.ones(len(posinput)), np.zeros(len(neginput))))
    X = np.array(posinput + neginput)
    df_x = pd.DataFrame(X)
    df_y = pd.DataFrame(Y)
    data = pd.concat([df_y, df_x], axis=1)
    data.to_csv(fdir2 + '2000_data.csv')
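For the hand-off to the next step, the saved CSV can be reloaded as below (a sketch, my addition; note that to_csv above also wrote the row index as the first column):

# Sketch: reload the saved features for the classifier step.
# Column 0 of the frame is the 0/1 label; the rest are averaged word vectors.
import pandas as pd
data = pd.read_csv(fdir2 + '2000_data.csv', index_col=0)
y = data.iloc[:, 0].values   # labels
x = data.iloc[:, 1:].values  # feature vectors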
Each review is represented by the average of its word vectors; the word2vec model is not trained here but loaded directly from the pre-trained wiki.zh.text.vector file.
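As a toy illustration of this averaging (my addition, with made-up 3-dimensional vectors; the real model's vectors have a few hundred dimensions):

# Toy illustration: averaging word vectors gives one review vector.
import numpy as np
vecs = np.array([[0.2, -0.1, 0.5],   # e.g. vector for '房间'
                 [0.4,  0.3, 0.1],   # e.g. vector for '干净'
                 [0.0, -0.2, 0.3]])  # e.g. vector for '不错'
print(vecs.mean(axis=0))  # [0.2, 0.0, 0.3] -> the review's feature vector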
The last step is the dimensionality reduction and sentiment classifier described in the earlier post.