Contents
1. Data format conversion
2. Read the comment txt file produced in the previous step, filter it, segment it, remove stopwords, and write the processed data back to a txt file
3. Vectorization with word2vec
4. Computing vector similarity
5. Sentiment polarity labeling
6. Full code
First, use a crawler tool or your own scraping code to write the data into an ***.xlsx file, as shown in the figure.
Since txt files are more convenient for data preprocessing, I first convert the xlsx file into txt files. The code is as follows:
#Read the excel file and store its contents as txt files
def xlsx2txt(xlsx_position,comment_txt_position,score_txt_position):
    #Open the excel file and create a workbook object for the xlsx file
    doc_read=xlrd.open_workbook(xlsx_position,encoding_override='utf8')
    #List the sheets contained in the xlsx file
    doc_read.sheets()
    #Take the first sheet of the xlsx file
    doc_read_sheet=doc_read.sheet_by_index(0)
    #Loop over every row of the sheet
    for row in doc_read_sheet.get_rows():
        #Get the comment text from the first column
        commend_column_value=row[0].value
        if commend_column_value !='J_commentDetail':
            #Remove line breaks inside a single comment
            commend_column_value_filter = re.sub(r'[\n]', '', commend_column_value)
            with open(comment_txt_position,'a+',encoding='utf8') as file:
                file.write(commend_column_value_filter+'\n')
        #Get the score from the second column
        score_column_value=row[1].value
        if score_column_value !='score':
            #Strip the Chinese character '分' from the score
            score_column_value_filter=re.sub(r'[\u4e00-\u9fa5]', '',score_column_value)
            with open(score_txt_position, 'a+', encoding='utf8') as file:
                file.write(score_column_value_filter + '\n')
'''#Alternatively:
    for row in doc_read_sheet.get_rows():
        commend_column_value=row[0].value
        score_column_value=row[1].value
        if commend_column_value !='J_commentDetail':
            commend_column_value_filter = re.sub(r'[\n]', '', commend_column_value)
            with open(comment_txt_position,'a+',encoding='utf8') as file:
                file.write(commend_column_value_filter+'\n')
            score_column_value_filter=re.sub(r'[\u4e00-\u9fa5]', '',score_column_value)
            with open(score_txt_position, 'a+', encoding='utf8') as file:
                file.write(score_column_value_filter + '\n')
'''
The scraped data comes in different formats: some reviewers write one point, hit enter, and then write the next point. That clashes with a later step (when the comments are segmented, all the words of one comment have to stay on a single line), so each comment is first cleaned up with a regular expression:
commend_column_value_filter = re.sub(r'[\n]', '', commend_column_value)
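For example, a multi-line comment is collapsed onto one line like this (the comment text here is only an illustrative placeholder):

raw_comment='质量不错\n物流也很快'
print(re.sub(r'[\n]', '', raw_comment))   #质量不错物流也很快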
A few notes on the preprocessing below:
1. The filtering uses the same re.sub approach as in the previous step, but for a different purpose; never merge the two into one pass, otherwise different comments would all end up displayed as a single comment.
2. Segmentation uses jieba (just install it with pip install jieba). First filter out the platform's default "good review" placeholder sentences, and also add some custom words (emoticons, English words, place names and other proper nouns); see the sketch after this list.
3. For stopword removal you need to download a stopword list yourself; the HIT (Harbin Institute of Technology) stopword list and the Baidu stopword list are both good choices, and of course you can add your own entries.
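As a rough illustration of registering custom words with jieba, here is a minimal sketch; the dictionary path and the words themselves are made-up placeholders:

import jieba
#Hypothetical user dictionary: one custom word per line, with an optional frequency and part-of-speech tag
jieba.load_userdict('E:\\pycharm\\data\\userdict.txt')
#Individual words (emoticons, English terms, place names, etc.) can also be added in code
jieba.add_word('hellokitty')
jieba.add_word('哈尔滨工业大学')
print(' '.join(jieba.cut('在哈尔滨工业大学买的hellokitty水杯很好用')))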
#Read the txt file
def doc_read(comment_txt_position):
    with open(comment_txt_position,'r',encoding='utf8') as file:
        doc=file.read()
    return doc
#Keep only the Chinese text
def doc_filter(doc):
    #re.sub() does the replacement; everything except Chinese characters and newlines is removed
    filter=re.sub(r'[^\u4e00-\u9fa5\n]', '',doc)
    return filter
#Segment with jieba
def doc_cut(filter):
    #Drop the platform's default placeholder comment before segmenting
    new_filter=filter.replace('此用户没有填写评论\n','')
    cut=' '.join(jieba.cut(new_filter))
    return cut
#Remove stopwords
def doc_stopword(cut,stopword_list_position):
    #Load the stopword list, one stopword per line
    stopword_list = [line.strip() for line in open(stopword_list_position, 'r',encoding='utf-8').readlines()]
    stopword=[]
    #cut is a space-separated string, so iterate over the words rather than the characters
    for word in cut.split(' '):
        if word not in stopword_list:
            if word != '\t':
                stopword.append(word)
    stopwords = ' '.join(stopword)
    return stopwords
#Write the preprocessed text to a txt file
def doc_preprocess(stopwords,doc_preprocess_position):
    with open(doc_preprocess_position, 'a+', encoding='utf8') as file:
        file.write(stopwords)
I use word2vec from the gensim package for the vectorization; it can be installed directly with pip install gensim, but watch out for errors caused by version conflicts between numpy, scipy and gensim. It is safest to uninstall numpy and scipy from your Python environment first and then install the packages in the order numpy, scipy, gensim (these are the steps I used with the Anaconda3 distribution; other setups may differ, see my blog post on installing Python and the PyCharm editor). If you do not have those two packages yet, just install them in that order (if the downloads are slow, see my blog post on changing the Python package mirror).
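Before walking through the full function, here is a minimal sketch of the gensim word2vec API on a toy corpus; it assumes gensim 3.x, where the dimensionality parameter is called size (in gensim 4 it was renamed to vector_size):

from gensim.models import word2vec
toy_corpus=[['物流','很快','质量','不错'],
            ['质量','一般','不','推荐']]
toy_model=word2vec.Word2Vec(toy_corpus,size=16,min_count=1,window=5)
#Print the 16-dimensional vector of one word and its nearest neighbours in the toy vector space
print(toy_model.wv['质量'])
print(toy_model.wv.most_similar('质量'))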
#Vectorize with word2vec
def vectoring(doc_preprocess_position,word2vec_model_position,vector_binaryF_position,vector_binary_position,txt_vector_binary_position,vector_size):
    #Build the corpus: one pre-segmented sentence per line
    sentences=word2vec.LineSentence(doc_preprocess_position)
    #Create a temporary file path for the model
    path=get_tmpfile(word2vec_model_position)
    model=word2vec.Word2Vec(sentences,hs=1,min_count=1,window=5,size=vector_size)
    #Save the trained model
    model.save(word2vec_model_position)
    #Save the word vectors, once as text and once in binary format
    model.wv.save_word2vec_format(vector_binaryF_position,binary=False)
    model.wv.save_word2vec_format(vector_binary_position,binary=True)
    ##Dump the binary vectors to a txt file
    #Load the binary word vectors
    model_binary=word2vec.Word2VecKeyedVectors.load_word2vec_format(vector_binary_position,binary=True)
    #vocab is the vocabulary of the loaded vectors
    output = codecs.open(txt_vector_binary_position, 'w', 'utf-8')
    #Convert the bin file to txt, one word per line
    for i in model_binary.vocab:
        #Collect the vector components as a list of strings
        vector = list()
        #j runs over the vector dimensions
        for j in model_binary[i]:
            vector.append(str(j))
        vector_str = ",".join(vector)
        line = i + "\t" + vector_str + "\n"
        output.write(line)
    output.close()
    return model_binary
#Compute similarity
def calculate_most_similar(model_binary, word):
    similar_words = model_binary.most_similar(word)
    print(word)
    for term in similar_words:
        print(term[0], term[1])
As the figure at the top shows, the score column of my scraped data contains the Chinese character '分', which needs to be stripped out, using the same method as above.
Also, the numbers read back from the score txt file come in as strings, so they have to be explicitly cast to float before being classified (my scraped data has far too many positive reviews, and I am already worried about how the later LSTM model will perform ╥╯^╰╥).
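As a small illustration of that conversion, a raw value such as '5分' (an example value, not taken from the actual data) becomes a float like this:

raw='5分'
score=float(re.sub(r'[\u4e00-\u9fa5]', '', raw))
print(score)   #5.0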
#Sentiment polarity labeling
def sentiment_classification(score_txt_position,sentiment_classification_position):
    with open(score_txt_position,'r',encoding='utf8') as score_file:
        #readlines() reads the file line by line
        for i in score_file.readlines():
            score=float(i.strip())
            if score<3.0:
                with open(sentiment_classification_position,'a+',encoding='utf8') as file:
                    file.write('-1'+'\n')
            elif score>3.0:
                with open(sentiment_classification_position, 'a+', encoding='utf8') as file:
                    file.write('1'+'\n')
            else:
                with open(sentiment_classification_position,'a+',encoding='utf8') as file:
                    file.write('0'+'\n')
As for the similarity-calculation code, I did not end up using it later, so it is not called in the main function; anyone who needs it can simply call it themselves: calculate_most_similar(model_binary, word)  #word is the word whose similar terms you want, e.g. '好'
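If you do want to use it, a minimal calling sketch would look like this (relying on the variables defined in the main block of the full code below):

model_binary=vectoring(doc_preprocess_position,word2vec_model_position,vector_binaryF_position,vector_binary_position,txt_vector_binary_position,vector_size)
calculate_most_similar(model_binary,'好')   #prints the words most similar to '好'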
import xlrd
import re
import time
import jieba
import codecs
import logging
from gensim.models import word2vec
from gensim.test.utils import get_tmpfile
#Read the excel file and store its contents as txt files
def xlsx2txt(xlsx_position,comment_txt_position,score_txt_position):
    #Open the excel file and create a workbook object for the xlsx file
    doc_read=xlrd.open_workbook(xlsx_position,encoding_override='utf8')
    #List the sheets contained in the xlsx file
    doc_read.sheets()
    #Take the first sheet of the xlsx file
    doc_read_sheet=doc_read.sheet_by_index(0)
    #Loop over every row of the sheet
    for row in doc_read_sheet.get_rows():
        #Get the comment text from the first column
        commend_column_value=row[0].value
        if commend_column_value !='J_commentDetail':
            #Remove line breaks inside a single comment
            commend_column_value_filter = re.sub(r'[\n]', '', commend_column_value)
            with open(comment_txt_position,'a+',encoding='utf8') as file:
                file.write(commend_column_value_filter+'\n')
        #Get the score from the second column
        score_column_value=row[1].value
        if score_column_value !='score':
            #Strip the Chinese character '分' from the score
            score_column_value_filter=re.sub(r'[\u4e00-\u9fa5]', '',score_column_value)
            with open(score_txt_position, 'a+', encoding='utf8') as file:
                file.write(score_column_value_filter + '\n')
'''#Alternatively:
    for row in doc_read_sheet.get_rows():
        commend_column_value=row[0].value
        score_column_value=row[1].value
        if commend_column_value !='J_commentDetail':
            commend_column_value_filter = re.sub(r'[\n]', '', commend_column_value)
            with open(comment_txt_position,'a+',encoding='utf8') as file:
                file.write(commend_column_value_filter+'\n')
            score_column_value_filter=re.sub(r'[\u4e00-\u9fa5]', '',score_column_value)
            with open(score_txt_position, 'a+', encoding='utf8') as file:
                file.write(score_column_value_filter + '\n')
'''
#Read the txt file
def doc_read(comment_txt_position):
    with open(comment_txt_position,'r',encoding='utf8') as file:
        doc=file.read()
    return doc
#Keep only the Chinese text
def doc_filter(doc):
    #re.sub() does the replacement; everything except Chinese characters and newlines is removed
    filter=re.sub(r'[^\u4e00-\u9fa5\n]', '',doc)
    return filter
#Segment with jieba
def doc_cut(filter):
    #Drop the platform's default placeholder comment before segmenting
    new_filter=filter.replace('此用户没有填写评论\n','')
    cut=' '.join(jieba.cut(new_filter))
    return cut
#Remove stopwords
def doc_stopword(cut,stopword_list_position):
    #Load the stopword list, one stopword per line
    stopword_list = [line.strip() for line in open(stopword_list_position, 'r',encoding='utf-8').readlines()]
    stopword=[]
    #cut is a space-separated string, so iterate over the words rather than the characters
    for word in cut.split(' '):
        if word not in stopword_list:
            if word != '\t':
                stopword.append(word)
    stopwords = ' '.join(stopword)
    return stopwords
#Write the preprocessed text to a txt file
def doc_preprocess(stopwords,doc_preprocess_position):
    with open(doc_preprocess_position, 'a+', encoding='utf8') as file:
        file.write(stopwords)
#Vectorize with word2vec
def vectoring(doc_preprocess_position,word2vec_model_position,vector_binaryF_position,vector_binary_position,txt_vector_binary_position,vector_size):
    #Build the corpus: one pre-segmented sentence per line
    sentences=word2vec.LineSentence(doc_preprocess_position)
    #Create a temporary file path for the model
    path=get_tmpfile(word2vec_model_position)
    model=word2vec.Word2Vec(sentences,hs=1,min_count=1,window=5,size=vector_size)
    #Save the trained model
    model.save(word2vec_model_position)
    #Save the word vectors, once as text and once in binary format
    model.wv.save_word2vec_format(vector_binaryF_position,binary=False)
    model.wv.save_word2vec_format(vector_binary_position,binary=True)
    ##Dump the binary vectors to a txt file
    #Load the binary word vectors
    model_binary=word2vec.Word2VecKeyedVectors.load_word2vec_format(vector_binary_position,binary=True)
    #vocab is the vocabulary of the loaded vectors
    output = codecs.open(txt_vector_binary_position, 'w', 'utf-8')
    #Convert the bin file to txt, one word per line
    for i in model_binary.vocab:
        #Collect the vector components as a list of strings
        vector = list()
        #j runs over the vector dimensions
        for j in model_binary[i]:
            vector.append(str(j))
        vector_str = ",".join(vector)
        line = i + "\t" + vector_str + "\n"
        output.write(line)
    output.close()
    return model_binary
#Compute similarity
def calculate_most_similar(model_binary, word):
    similar_words = model_binary.most_similar(word)
    print(word)
    for term in similar_words:
        print(term[0], term[1])
#Sentiment polarity labeling
def sentiment_classification(score_txt_position,sentiment_classification_position):
    with open(score_txt_position,'r',encoding='utf8') as score_file:
        #readlines() reads the file line by line
        for i in score_file.readlines():
            score=float(i.strip())
            if score<3.0:
                with open(sentiment_classification_position,'a+',encoding='utf8') as file:
                    file.write('-1'+'\n')
            elif score>3.0:
                with open(sentiment_classification_position, 'a+', encoding='utf8') as file:
                    file.write('1'+'\n')
            else:
                with open(sentiment_classification_position,'a+',encoding='utf8') as file:
                    file.write('0'+'\n')
if __name__=='__main__':
    xlsx_position='E:\\pycharm\\data\\***.xls'
    comment_txt_position='E:\\pycharm\\data\\***.txt'
    score_txt_position='E:\\pycharm\\data\\***.txt'
    stopword_list_position='E:\\pycharm\\data\\***.txt'
    doc_preprocess_position='E:\\pycharm\\data\\***.txt'
    word2vec_model_position='E:\\pycharm\\data\\***.model'
    vector_binaryF_position='E:\\pycharm\\data\\***.vector'
    vector_binary_position='E:\\pycharm\\data\\***.vector'
    txt_vector_binary_position='E:\\pycharm\\data\\***.txt'
    sentiment_classification_position = 'E:\\pycharm\\data\\***.txt'
    vector_size=128
    #Print the running log
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    time_start = time.time()
    xlsx2txt(xlsx_position,comment_txt_position,score_txt_position)
    time_end_transaction=time.time()
    print('---------------------------------------------------------------------')
    print('Converting the file format took:',time_end_transaction-time_start,'s')
    doc=doc_read(comment_txt_position)
    filter=doc_filter(doc)
    time_end_filter=time.time()
    print('---------------------------------------------------------------------')
    print('Filtering took:',time_end_filter-time_end_transaction,'s')
    cut=doc_cut(filter)
    time_end_jieba = time.time()
    print('---------------------------------------------------------------------')
    print('Segmentation took:', time_end_jieba-time_end_filter,'s')
    stopwords=doc_stopword(cut,stopword_list_position)
    time_end_stopword = time.time()
    print('---------------------------------------------------------------------')
    print('Removing stopwords took:', time_end_stopword-time_end_jieba,'s')
    doc_preprocess(stopwords,doc_preprocess_position)
    time_end_doc_preprocess=time.time()
    print('---------------------------------------------------------------------')
    print('Writing the preprocessed file took:',time_end_doc_preprocess-time_end_stopword,'s')
    vectoring(doc_preprocess_position, word2vec_model_position, vector_binaryF_position, vector_binary_position, txt_vector_binary_position, vector_size)
    time_end_vectoring=time.time()
    print('---------------------------------------------------------------------')
    print('Vectorization took:', time_end_vectoring - time_end_doc_preprocess, 's')
    sentiment_classification(score_txt_position, sentiment_classification_position)
    time_end_sentiment_classification=time.time()
    print('---------------------------------------------------------------------')
    print('Sentiment labeling took:', time_end_sentiment_classification - time_end_vectoring, 's')
    time_end=time.time()
    print('---------------------------------------------------------------------')
    print('Total:',time_end-time_start,'s')