2018 Teddy Cup Data Mining Competition, Problem C

I. Problem statement
Improving the reputation of tourist destinations such as scenic spots and hotels is a task that local tourism authorities and tourism businesses care deeply about, since it bears on stabilising the customer base, gaining competitive advantage, and attracting visitors to come and spend. Visitor satisfaction is closely tied to destination reputation: the higher the satisfaction, the better the reputation. Understanding what drives visitor satisfaction, and using that knowledge to raise satisfaction and ultimately reputation, therefore not only keeps the customer base stable but also pays off in the long run for scientific oversight, optimal resource allocation, and continued market development in the tourism industry.
Question 1: impression analysis of scenic spots and hotels
From the review texts in Attachment 1, compute the TOP20 hot words for each destination in the format of Table 1 of the problem statement, and save them as the file “印象词云表.xls”.

Solution process:
1. Extract the reviews of each destination from the Excel summaries into one text file per destination

import pandas as pd

# Write each destination's reviews (one review per line) into its own text file.
# name_col is the index column ('酒店名称' or '景区名称');
# prefix is 'H' for hotels, 'A' for scenic spots.
def xls_txt(inputfile, name_col, prefix):
    data = pd.read_excel(inputfile, index_col=name_col)
    for i in range(51, 61):
        index = f'{prefix}{i:02d}'   # e.g. 'H51'
        data1 = data.loc[index]      # all rows for this destination
        data1 = data1[['评论内容']]
        outputfile = '../test/1/' + index + '.txt'
        data1.to_csv(outputfile, index=False, header=False, encoding='utf-8')

inputfile1 = '../test/酒店评论(测试数据).xlsx'  # hotel review summary file
inputfile2 = '../test/景区评论(测试数据).xlsx'  # scenic-spot review summary file
xls_txt(inputfile1, '酒店名称', 'H')  # hotel data
xls_txt(inputfile2, '景区名称', 'A')  # scenic-spot data

2. Converting a CSV review file to a plain-text file

(Note: this block operates on an unrelated sample dataset, the Midea ('美的') appliance review summary, and appears to be a leftover template; the pattern is the same as above: filter the rows you want and dump one column to a text file.)

#-*- coding: utf-8 -*-
import pandas as pd

inputfile = '../data/huizong.csv'    # review summary file
outputfile = '../data/meidi_jd.txt'  # where the extracted reviews are saved
data = pd.read_csv(inputfile, encoding='utf-8')
data = data[['评论']][data['品牌'] == '美的']  # keep only the '美的' rows
data.to_csv(outputfile, index=False, header=False, encoding='utf-8')

3. Mechanical sentence compression; short sentences are dropped as invalid. (Judging by the paths, this step reads ../test/2, which the de-duplication step in section 4 below produces, so de-duplication actually runs first.)
import pandas as pd

# Mechanical compression: collapse a run of consecutively repeated fragments
# (e.g. '好好好好' -> '好') down to a single occurrence.
def func(st):
    for i in range(1, int(len(st) / 2) + 1):   # fragment length
        for j in range(len(st)):               # fragment start position
            if st[j:j + i] == st[j + i:j + 2 * i]:
                k = j + i
                while st[k:k + i] == st[k + i:k + 2 * i] and k < len(st):
                    k = k + i
                st = st[:j] + st[k:]           # keep one copy of the run
    return st.strip()


# Read a review file line by line, compress repeats, and drop short lines.
def run(inputfile, outputfile):
    f = open(inputfile, encoding='utf-8')
    filelist = []
    while True:
        line = f.readline()
        if line:
            # compress repeated fragments
            data = func(line)
            # drop short sentences (four characters or fewer count as invalid)
            if len(data) <= 4:
                continue
            else:
                filelist.append(data)
        else:
            break
    f.close()
    filelist2 = pd.DataFrame(filelist)
    filelist2.to_csv(outputfile, index=False, header=False, encoding='utf-8')
# Drivers (the commented-out block is the corresponding run over the training data)
'''for i in range(1, 51):
    index = f'A{i:02d}'
    inputfile = '../train/jingqu/jingqu_cl/' + index + '.txt'          # path before processing
    outputfile = '../train/jingqu/jingqu_cl/cl_pr/' + index + '.txt'   # path after processing
    run(inputfile, outputfile)'''
for prefix in ('A', 'H'):
    for i in range(51, 61):
        index = f'{prefix}{i:02d}'
        inputfile = '../test/2/' + index + '.txt'
        outputfile = '../test/3/' + index + '.txt'   # output file
        run(inputfile, outputfile)
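
As a quick sanity check, here is what the compression function does to two artificial examples (hand-traced against the algorithm above):

# Runs of repeated fragments collapse to a single occurrence.
print(func('好好好好棒棒棒'))    # -> '好棒'
print(func('位置很好位置很好'))  # -> '位置很好'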

4. Remove exactly duplicated reviews (treated as invalid text) and strip noise characters

import pandas as pd

# De-duplicate the reviews and strip noise characters.
def clean_same(inputfile, outputfile):
    # Drop exactly duplicated reviews.
    reviews = pd.read_csv(inputfile, encoding='utf-8', header=None)
    l1 = len(reviews)
    reviews = reviews.drop_duplicates()
    l2 = len(reviews)
    # print('Removed %s duplicate reviews.' % (l1 - l2))
    # All reviews are about hotels and scenic spots, so the words 景区/酒店 carry
    # no information; also remove letters, digits, and common punctuation.
    # regex=True is required on pandas >= 2.0; the hyphen inside the character
    # class is escaped so that '+-=' is not read as a character range.
    reviews_cut = pd.DataFrame(reviews[0].str.replace(r'.*?\d+?\t ', ' ', regex=True))
    content = pd.DataFrame(reviews_cut[0].str.replace('[0-9a-zA-Z]|景区|酒店', '', regex=True))
    content = pd.DataFrame(content[0].str.replace(r'[,!!:_.+\-=——,$%^。?、~@#¥%……&*《》<>「」{}【】()/]', '', regex=True))
    content.to_csv(outputfile, index=False, header=False, encoding='utf-8')


for prefix in ('A', 'H'):
    for i in range(51, 61):
        index = f'{prefix}{i:02d}'
        inputfile = '../test/1/' + index + '.txt'
        outputfile = '../test/2/' + index + '.txt'   # output file
        clean_same(inputfile, outputfile)
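
For reference, a minimal example of what the cleanup patterns above remove (letters, digits, the words 景区/酒店, and common punctuation):

import pandas as pd
s = pd.Series(['这个景区风景很美,性价比高!A区2号门'])
s = s.str.replace('[0-9a-zA-Z]|景区|酒店', '', regex=True)
s = s.str.replace(r'[,!!:_.+\-=——,$%^。?、~@#¥%……&*《》<>「」{}【】()/]', '', regex=True)
print(s[0])  # -> '这个风景很美性价比高区号门'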

5. Segment the text with jieba, remove stop words (including a hand-curated stop-word list), and keep the reviews that contain nouns

#coding:utf-8
import pandas as pd
import numpy as np
import jieba.posseg as psg

def word(inputfile, outputfile):
    reviews = pd.read_csv(inputfile, encoding='utf-8', names=['评论']).astype(str)
    content = reviews['评论']
    worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # simple segmentation helper
    seg_word = content.apply(worker)

    # Flatten into a data frame: one row per word, carrying the id of the review
    # it came from and (later) its position within that review.
    n_word = seg_word.apply(lambda x: len(x))  # number of words per review

    n_content = [[x + 1] * y for x, y in zip(list(seg_word.index), list(n_word))]
    index_content = sum(n_content, [])  # review id of each word

    seg_word = sum(seg_word, [])
    word = [x[0] for x in seg_word]

    nature = [x[1] for x in seg_word]  # part-of-speech flag

    result = pd.DataFrame({"index_content": index_content,
                           "word": word,
                           "nature": nature})
    # Drop punctuation ('x' is jieba's flag for non-words).
    result = result[result['nature'] != 'x']

    # Drop stop words.
    stop_path = open('../stop/stopword2.txt', 'r', encoding='UTF-8')
    stop = stop_path.readlines()
    stop = [x.replace('\n', '') for x in stop]
    word = list(set(word) - set(stop))
    result = result[result['word'].isin(word)]

    # Position of each word within its review.
    n_word = list(result.groupby(by=['index_content'])['index_content'].count())
    index_word = [list(np.arange(0, y)) for y in n_word]
    index_word = sum(index_word, [])
    result['index_word'] = index_word

    # Keep only reviews that contain at least one noun-like word
    # (a part-of-speech flag containing 'n').
    ind = result[['n' in x for x in result['nature']]]['index_content'].unique()
    result = result[[x in ind for x in result['index_content']]]

    # Write out the result.
    result.to_csv(outputfile, index=False, encoding='utf-8')


for prefix in ('A', 'H'):
    for i in range(51, 61):
        index = f'{prefix}{i:02d}'
        inputfile = '../test/3/' + index + '.txt'
        outputfile = '../test/5/' + index + '.csv'   # output file
        word(inputfile, outputfile)
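
For reference, jieba.posseg yields (word, flag) pairs, and the noun filter above keeps reviews whose flags contain 'n'; exact flags can vary with the jieba version and dictionary:

import jieba.posseg as psg
print([(w.word, w.flag) for w in psg.cut('酒店位置很好,服务热情')])
# e.g. [('酒店', 'n'), ('位置', 'n'), ('很', 'd'), ('好', 'a'), (',', 'x'), ('服务', 'vn'), ('热情', 'a')]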

6. Print the high-frequency words in order of frequency

# -*- coding: utf-8 -*-
import pandas as pd
from collections import Counter

def word(inputfile, outputfile):
    data = pd.read_csv(inputfile)
    words = data['word']
    counter = Counter(words)

    # Keep the 2000 most frequent words (the deliverable only needs the top 20).
    count = counter.most_common(2000)

    name = ['comment', 'count']
    data = pd.DataFrame(columns=name, data=count)
    cols = ['number'] + list(data.columns)
    data.index += 1
    data['number'] = data.index
    data2 = data[cols]
    data2.to_excel(outputfile)

'''for i in range(1, 51):
    index = f'A{i:02d}'
    inputfile = '../data/G2_jingquWord/' + index + 'word.csv'
    outputfile = '../train/jingqu/jingqu_cipin/' + index + '.csv'   # output file
    word(inputfile, outputfile)'''
'''for i in range(1, 51):
    index = f'H{i:02d}'
    inputfile = '../data/H2_jiudianWord/' + index + 'word.csv'
    outputfile = '../train/jiudian/jiudian_cipin/' + index + '.csv' # output file
    word(inputfile, outputfile)'''
inputfile = '../test/5_A/all_A.csv'
outputfile = '../test/5_A/A_cipin.xlsx'   # output file
word(inputfile, outputfile)

inputfile = '../test/5_H/all_H.csv'
outputfile = '../test/5_H/H_cipin.xlsx'   # output file
word(inputfile, outputfile)
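
The deliverable for Question 1 is only the TOP20 list saved as “印象词云表.xls”, while the script above writes the top 2000. A minimal sketch of that final step, assuming the merged frequency file all_A.csv above is the input (the exact column layout of 赛题表1 is not reproduced here):

import pandas as pd
from collections import Counter

data = pd.read_csv('../test/5_A/all_A.csv')
top20 = Counter(data['word']).most_common(20)
table = pd.DataFrame(top20, columns=['热门词', '频数'])
table.index += 1
# Writing .xls needs the xlwt engine on older pandas; newer pandas only writes .xlsx.
table.to_excel('印象词云表.xls', index_label='序号')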

Question 2: Using the review texts in Attachment 1 and the scores in Attachment 2, build a reasonable mathematical model and algorithm that rates each scenic spot and hotel on five aspects (service 服务, location 位置, facilities 设施, hygiene 卫生, and value for money 性价比) on a 5-point scale, and evaluate the model by mean squared error (MSE).
1. Classify each sentence as neutral, positive, or negative through the Baidu sentiment API.

# -*- coding: utf-8 -*-
import json
import requests
import pandas as pd
import time

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


def get_sentiment_result(text):
    """
    Call the Baidu sentiment-analysis API for one piece of text.
    :param text: input text
    :return response: parsed JSON response
    """
    if text == '':
        return ''
    # First obtain an OAuth token.
    url = "https://aip.baidubce.com/oauth/2.0/token"
    client_id = '<your_api_key>'         # use your own Baidu AIP credentials here
    client_secret = '<your_secret_key>'
    params = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret
    }
    headers = {'Content-Type': 'application/json; charset=UTF-8'}
    response = requests.post(url=url, params=params, headers=headers).json()
    access_token = response['access_token']

    # Sentiment-analysis endpoint.
    url = 'https://aip.baidubce.com/rpc/2.0/nlp/v1/sentiment_classify'

    # Call the endpoint with the token (note: a fresh token is requested on
    # every call here; see the caching sketch further below).
    params = {
        'access_token': access_token
    }
    payload = json.dumps({
        'text': text
    })
    headers = {'Content-Type': 'application/json; charset=UTF-8'}
    response = requests.post(url=url, params=params, data=payload, headers=headers).json()
    return response

def baidu_emotion(inputfile, outputfile):
    # Read the reviews to be analysed.
    text = pd.read_csv(inputfile, encoding='utf-8', names=['评论内容']).astype(str)
    review = text['评论内容']
    length = len(review)
    # Result lists.
    sentiment = ['blank'] * length      # polarity: 0 negative, 1 neutral, 2 positive
    negative_prob = ['blank'] * length  # probability of the negative class, in [0,1]
    positive_pro = ['blank'] * length   # probability of the positive class, in [0,1]
    confidence = ['blank'] * length
    time_start = time.time()  # timing
    i = 0
    for content in review:
        if content:
            result = {}
            op = True  # keep retrying until a valid result comes back
            while op:
                maxTryNum = 50  # maximum number of attempts, to ride out rate limits (tunable)
                for tries in range(maxTryNum):
                    try:
                        result = get_sentiment_result(content)
                        break
                    except:
                        if tries < (maxTryNum - 1):
                            continue
                        else:
                            print('All %d attempts failed!' % maxTryNum)
                            break
                # A successful response carries 3 keys and a failed one only 2,
                # so the key count is used as the stop condition.
                if len(result) == 3:
                    op = False
                else:
                    op = True

            # Handle the two response shapes.
            if 'items' in list(result.keys()):
                result1 = result.get('items')
                item = result1[0]
                sentiment[i] = item['sentiment']
                positive_pro[i] = item['positive_prob']
                negative_prob[i] = item['negative_prob']
                confidence[i] = item['confidence']

            elif 'error_code' in list(result.keys()):
                sentiment[i] = -1
                negative_prob[i] = -1
                positive_pro[i] = -1
                confidence[i] = -1
            # Progress output.
            print('Review %d of %d analysed' % (i + 1, length))
            i = i + 1
    time_end = time.time()
    print('Total analysis time:', time_end - time_start)
    print(sentiment)
    text['评论内容'] = review
    text['情感倾向'] = sentiment
    text['positive_prob'] = positive_pro
    text['negative_prob'] = negative_prob
    text['置信度'] = confidence

    # Save.
    text.to_csv(outputfile, index=None, encoding="utf_8_sig")

# Scenic-spot reviews (the range starts at 54, presumably resuming an interrupted run)
for i in range(54, 61):
    index = f'A{i:02d}'
    inputfile = '../test/3/' + index + '.txt'
    outputfile = '../test/4/' + index + '.csv'   # output file
    baidu_emotion(inputfile, outputfile)
    print('File ' + str(index) + ' done')
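
One easy improvement: get_sentiment_result above requests a fresh OAuth token for every review, even though Baidu AIP tokens are long-lived. A hedged sketch of a drop-in cache, reusing the client_id and client_secret from the same module:

_token_cache = {}

def get_access_token():
    # Fetch the token once and reuse it on subsequent calls.
    if 'token' not in _token_cache:
        params = {
            'grant_type': 'client_credentials',
            'client_id': client_id,
            'client_secret': client_secret,
        }
        resp = requests.post('https://aip.baidubce.com/oauth/2.0/token', params=params).json()
        _token_cache['token'] = resp['access_token']
    return _token_cache['token']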

2. Correct the sentiment-word polarities and draw the word clouds (the word-cloud part is left commented out below). This follows a textbook recipe (the 代码12-6 to 12-8 labels below), shown here for the training file A01.

# -*- coding: utf-8 -*-

# Code 12-6: match the sentiment words

import pandas as pd
import numpy as np
word = pd.read_csv("../train/jingqu/jingqu_stop2/A01_word.csv")

# Load the positive/negative evaluation and emotion word lists.
pos_comment = pd.read_csv("../train/express_data/正面评价词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_comment = pd.read_csv("../train/express_data/负面评价词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
pos_emotion = pd.read_csv("../train/express_data/正面情感词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_emotion = pd.read_csv("../train/express_data/负面情感词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')

# Merge the emotion words with the evaluation words.
positive = set(pos_comment.iloc[:, 0]) | set(pos_emotion.iloc[:, 0])
negative = set(neg_comment.iloc[:, 0]) | set(neg_emotion.iloc[:, 0])
intersection = positive & negative  # words present in both polarity lists
positive = list(positive - intersection)
negative = list(negative - intersection)
positive = pd.DataFrame({"word": positive,
                         "weight": [1] * len(positive)})
negative = pd.DataFrame({"word": negative,
                         "weight": [-1] * len(negative)})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
posneg = pd.concat([positive, negative])

# Merge the segmentation result with the polarity table to locate sentiment words.
data_posneg = posneg.merge(word, left_on='word', right_on='word',
                           how='right')
data_posneg = data_posneg.sort_values(by=['index_content', 'index_word'])



# Code 12-7: correct the sentiment polarity

# Flip a word's sentiment value when it is preceded by a negation word
# (a double negation cancels out).
# Load the negation word list.
notdict = pd.read_csv("../train/data/not.csv")
notwords = set(notdict['term'])  # membership tests on a set, not on the Series index

# Handle negation modifiers.
data_posneg['amend_weight'] = data_posneg['weight']  # sentiment value after correction
data_posneg['id'] = np.arange(0, len(data_posneg))
only_inclination = data_posneg.dropna()  # keep only words carrying a sentiment value
only_inclination.index = np.arange(0, len(only_inclination))
index = only_inclination['id']

for i in np.arange(0, len(only_inclination)):
    # Extract the review containing the i-th sentiment word.
    review = data_posneg[data_posneg['index_content'] ==
                         only_inclination['index_content'][i]]
    review.index = np.arange(0, len(review))
    affective = only_inclination['index_word'][i]  # position of the word in the review
    if affective == 1:
        # One preceding word: flip the sign if it is a negation.
        ne = sum([w in notwords for w in [review['word'][affective - 1]]])
        if ne == 1:
            data_posneg.loc[index[i], 'amend_weight'] = -data_posneg.loc[index[i], 'weight']
    elif affective > 1:
        # Two preceding words: exactly one negation flips the sign.
        ne = sum([w in notwords for w in review['word'][[affective - 1,
                                                         affective - 2]]])
        if ne == 1:
            data_posneg.loc[index[i], 'amend_weight'] = -data_posneg.loc[index[i], 'weight']

# Refresh the sentiment-only view after the correction.
only_inclination = data_posneg.dropna()

# Sentiment value of each review.
emotional_value = only_inclination.groupby(['index_content'],
                                           as_index=False)['amend_weight'].sum()
print(emotional_value)
# Drop reviews whose sentiment value is 0.
emotional_value = emotional_value[emotional_value['amend_weight'] != 0]



# Code 12-8: inspect the sentiment-analysis result

# Reviews with a positive sentiment value get type (a_type) '2', negative ones '0'.
emotional_value['a_type'] = ''
emotional_value.loc[emotional_value['amend_weight'] > 0, 'a_type'] = '2'
emotional_value.loc[emotional_value['amend_weight'] < 0, 'a_type'] = '0'

# Evaluate against labels (note: this needs a labelled 'content_type' column,
# which only the textbook's training data provides).
result = emotional_value.merge(word,
                               left_on='index_content',
                               right_on='index_content',
                               how='left')

result = result[['index_content', 'content_type', 'a_type']].drop_duplicates()
confusion_matrix = pd.crosstab(result['content_type'], result['a_type'],
                               margins=True)  # cross-tabulation
(confusion_matrix.iat[0, 0] + confusion_matrix.iat[1, 1]) / confusion_matrix.iat[2, 2]  # accuracy

# Split the words into positive and negative reviews.
ind_pos = list(emotional_value[emotional_value['a_type'] == '2']['index_content'])
ind_neg = list(emotional_value[emotional_value['a_type'] == '0']['index_content'])
posdata = word[[i in ind_pos for i in word['index_content']]]
negdata = word[[i in ind_neg for i in word['index_content']]]

# Draw the word clouds.
'''import matplotlib.pyplot as plt
from wordcloud import WordCloud
# word cloud of positive sentiment words
freq_pos = posdata.groupby(by=['word'])['word'].count()
freq_pos = freq_pos.sort_values(ascending=False)
backgroud_Image = plt.imread('../train/data/pl.jpg')
wordcloud = WordCloud(font_path="STZHONGS.ttf",
                      max_words=100,
                      background_color='white',
                      mask=backgroud_Image)
pos_wordcloud = wordcloud.fit_words(freq_pos)
plt.imshow(pos_wordcloud)
plt.axis('off')
plt.show()
# word cloud of negative sentiment words
freq_neg = negdata.groupby(by=['word'])['word'].count()
freq_neg = freq_neg.sort_values(ascending=False)
neg_wordcloud = wordcloud.fit_words(freq_neg)
plt.imshow(neg_wordcloud)
plt.axis('off')
plt.show()
'''
# Write out the result, one review per line.
posdata.to_csv("../train/jingqu/Emotional_correction/A01_posdata.csv", index=False, encoding='utf-8')
negdata.to_csv("../train/jingqu/Emotional_correction/A01_negdata.csv", index=False, encoding='utf-8')

3. Split the negative, positive, and neutral sentiment results into the five aspects by hand. (Only part of the code is shown.)

# -*- coding: utf-8 -*-
import pandas as pd

# For each manually built aspect word list (jd位置.xlsx etc., from step 3),
# count how many of the destination's reviews mention each aspect word.
def score(inputfiles, index):
    data1 = pd.read_csv(inputfiles)
    contents = data1['评论内容']

    aspects = ['位置', '服务', '设施', '卫生', '性价比']
    for aspect in aspects:
        data = pd.read_excel('../test/jiudian/jd' + aspect + '.xlsx')
        words = data[aspect]
        counts = ['blank'] * len(words)   # stays 'blank' if the word never appears
        i = 0
        for w in words:
            count1 = 0
            for content in contents:
                if str(w) in str(content):   # str() guards against NaN rows
                    count1 = count1 + 1
                    counts[i] = count1
            i = i + 1
        data['次数'] = counts
        # (the old encoding= argument was dropped from to_excel in newer pandas)
        data.to_excel('../test/jiudian/' + aspect + '/' + index + '.xlsx')

for i in range(51, 61):
    index = f'H{i:02d}'
    inputfile = '../test/4/' + index + '.csv'
    score(inputfile, index)

4. Compute the scores. (Partial code.)

# -*- coding: utf-8 -*-
import pandas as pd

# Turn the raw counts into positive/negative/neutral ratios per aspect word.
def ratio(inputfiles, outputfiles):
    data = pd.read_excel(inputfiles)
    data['消极'] = data['消极'].replace('blank', '0')
    data['积极'] = data['积极'].replace('blank', '0')
    data['中等'] = data['中等'].replace('blank', '0')

    data['积极_r'] = data['积极'].astype(float) / data['次数']
    data['消极_r'] = data['消极'].astype(float) / data['次数']
    data['中等_r'] = data['中等'].astype(float) / data['次数']
    data.to_excel(outputfiles)

aspects = ['服务', '位置', '设施', '卫生', '性价比']
for aspect in aspects:
    for i in range(51, 61):
        index = f'H{i:02d}'
        inputfiles = '../test/jingqu_6_radio/' + aspect + '/' + index + '.xlsx'
        outputfiles = '../test/jingqu_6_radio/' + aspect + '/' + index + '.xlsx'
        print(inputfiles)
        ratio(inputfiles, outputfiles)
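
Question 2 also asks for an MSE evaluation against the official scores in Attachment 2, which the code above stops short of. A minimal sketch of that comparison; the file names and column layout here are placeholders, not the actual attachment format:

import pandas as pd

pred = pd.read_excel('predicted_scores.xlsx')    # hypothetical model output, one row per destination
true = pd.read_excel('attachment2_scores.xlsx')  # hypothetical export of the 附件2 scores
for aspect in ['服务', '位置', '设施', '卫生', '性价比']:
    mse = ((pred[aspect] - true[aspect]) ** 2).mean()
    print(aspect, 'MSE:', round(mse, 4))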

Question 3: For various reasons, online reviews often contain irrelevant content, simple copy-and-edit duplicates, and posts with no real substance, which keeps travellers from getting useful information out of them and creates challenges for the platforms' operations. From a text-analysis perspective, build a reasonable model to assess the validity of the reviews in Attachment 1.
Approach: the model here is built mainly on text similarity.

# -*- coding: utf-8 -*-
import jieba
import numpy as np
import re
import pandas as pd


def get_word_vector(s1, s2):
    """
    :param s1: sentence 1
    :param s2: sentence 2
    :return: the term-frequency vectors of the two sentences
    """
    # Segment both sentences.
    cut1 = jieba.cut(s1)
    cut2 = jieba.cut(s2)
    list_word1 = (','.join(cut1)).split(',')
    list_word2 = (','.join(cut2)).split(',')

    # Vocabulary: the union of the two word lists.
    key_word = list(set(list_word1 + list_word2))
    # Zero-filled vectors, one slot per vocabulary word.
    word_vector1 = np.zeros(len(key_word))
    word_vector2 = np.zeros(len(key_word))

    # Term frequencies: count each vocabulary word in each sentence.
    for i in range(len(key_word)):
        for j in range(len(list_word1)):
            if key_word[i] == list_word1[j]:
                word_vector1[i] += 1
        for k in range(len(list_word2)):
            if key_word[i] == list_word2[k]:
                word_vector2[i] += 1

    return word_vector1, word_vector2


def cos_dist(vec1, vec2):
    """
    :param vec1: vector 1
    :param vec2: vector 2
    :return: the cosine similarity of the two vectors
    """
    dist1 = float(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
    return dist1


def filter_html(html):
    """
    :param html: html
    :return: the text with all HTML tags stripped
    """
    # helper kept from the original script; not called below
    dr = re.compile(r'<[^>]+>', re.S)
    dd = dr.sub('', html).strip()
    return dd


def inputfile(inputfile):
    # Compare every pair of reviews; mark the later one of any near-duplicate
    # pair (cosine similarity >= 0.8) as invalid.
    contents = pd.read_csv(inputfile)
    contents = contents['评论内容']
    count = 0
    for index in range(len(contents)):
        s1 = str(contents[index])
        if s1 == '':
            continue
        for index2 in range(index + 1, len(contents)):
            s2 = str(contents[index2])
            if s2 == '':
                continue
            vec1, vec2 = get_word_vector(s1, s2)
            dist1 = cos_dist(vec1, vec2)
            if dist1 >= 0.8:
                contents[index2] = ''   # flag the later review as invalid
                count = count + 1
                print(index, s1)
                print(index2, s2)
    print(inputfile + ':' + str(count))

# The original second loop repeated the hotel prefix with a lowercase 'h',
# presumably a typo for the scenic-spot prefix 'A', so both prefixes are covered here.
for prefix in ('H', 'A'):
    for i in range(51, 61):
        index = f'{prefix}{i:02d}'
        inputfile1 = '../test/4/' + index + '.csv'
        inputfile(inputfile1)
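
A quick check of the similarity measure on two near-duplicate sentences (exact segmentation depends on the jieba version, but the pair shares all tokens except the final exclamation mark):

v1, v2 = get_word_vector('房间干净,服务很好', '房间干净,服务很好!')
print(cos_dist(v1, v2))  # close to 1, above the 0.8 threshold, so the pair is flagged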



Question 4: Build an LDA topic model

import re
import itertools
import numpy as np
import matplotlib.pyplot as plt
from gensim import corpora, models
# posdata / negdata are the positive and negative word tables produced by the
# sentiment-correction step in Question 2 (A01_posdata.csv / A01_negdata.csv).

# Build the dictionaries.
pos_dict = corpora.Dictionary([[i] for i in posdata['word']])  # positive
neg_dict = corpora.Dictionary([[i] for i in negdata['word']])  # negative

# Build the corpora (one single-word document per row, as in the textbook recipe).
pos_corpus = [pos_dict.doc2bow(j) for j in [[i] for i in posdata['word']]]  # positive
neg_corpus = [neg_dict.doc2bow(j) for j in [[i] for i in negdata['word']]]  # negative


# Cosine similarity of two plain vectors.
def cos(vector1, vector2):
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    else:
        return dot_product / ((normA * normB) ** 0.5)

# Search for the best topic number via the average cosine similarity between topics.
def topic_search(x_corpus, x_dict):

    # Initialise (a single topic is maximally self-similar).
    mean_similarity = []
    mean_similarity.append(1)

    # For each candidate topic number, train an LDA model and measure how
    # similar its topics are to one another (lower means better separation).
    for i in np.arange(2, 11):
        lda = models.LdaModel(x_corpus, num_topics=i, id2word=x_dict)  # train LDA
        term = lda.show_topics(num_words=50)

        # Extract the topic words.
        top_word = []
        for k in np.arange(i):
            top_word.append([''.join(re.findall('"(.*)"', w))
                             for w in term[k][1].split('+')])  # all words of topic k

        # Build the term-frequency vectors.
        word = sum(top_word, [])  # all words
        unique_word = set(word)   # de-duplicated vocabulary

        # One row per topic, one column per vocabulary word.
        mat = []
        for j in np.arange(i):
            top_w = top_word[j]
            mat.append(tuple([top_w.count(k) for k in unique_word]))

        p = list(itertools.permutations(list(np.arange(i)), 2))
        len_p = len(p)
        top_similarity = [0]
        for w in np.arange(len_p):
            vector1 = mat[p[w][0]]
            vector2 = mat[p[w][1]]
            # treat degenerate (zero) vectors as similarity 0
            top_similarity.append(cos(vector1, vector2) or 0)

        # Average cosine similarity between topics.
        mean_similarity.append(sum(top_similarity) / len_p)
    return mean_similarity
            
# Average cosine similarity for each candidate topic number.
pos_k = topic_search(pos_corpus, pos_dict)
neg_k = topic_search(neg_corpus, neg_dict)

# Plot the curves.
from matplotlib.font_manager import FontProperties
font = FontProperties(size=14)

# Make Chinese labels render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(10, 8))
ax_1 = fig.add_subplot(211)
ax_1.plot(pos_k)
ax_1.set_xlabel('正面评论LDA主题数寻优', fontproperties=font)

ax_2 = fig.add_subplot(212)
ax_2.plot(neg_k)
ax_2.set_xlabel('负面评论LDA主题数寻优', fontproperties=font)
#plt.show()  # show the topic-number search result

# LDA topic analysis with the chosen topic numbers.
pos_topic = models.LdaModel(pos_corpus, num_topics=6, id2word=pos_dict)  # num_topics: number of topics
neg_topic = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)
print('LDA topics of the positive reviews')
print(pos_topic.print_topics(num_words=5))  # num_words: words shown per topic
print('LDA topics of the negative reviews')
print(neg_topic.print_topics(num_words=5))
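
The topic-number search above uses the textbook's average-cosine heuristic. For comparison, gensim also ships a coherence measure that can serve the same purpose; a sketch using the positive corpus built above:

from gensim.models import CoherenceModel

lda = models.LdaModel(pos_corpus, num_topics=6, id2word=pos_dict)
cm = CoherenceModel(model=lda, corpus=pos_corpus, dictionary=pos_dict, coherence='u_mass')
print(cm.get_coherence())  # u_mass coherence: higher (less negative) is better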

Full code and data, packaged for download:
Download link: https://download.csdn.net/download/qq_44700741/85504402
