这里尝试使用三种方法提取关键词:我希望通过关键词提取出评论对应的属性;为了修正评论属性,还会人为补充相应的特征词典;同义词的问题暂且不考虑。
直接贴出代码
"""Keyword extraction, method 1: raw word-frequency count.

Reads JD product reviews from a JSON dump, segments the concatenated
review text with jieba, drops punctuation and stopwords, and prints the
20 most frequent remaining words.
"""
import json
from collections import Counter

import jieba
import jieba.analyse
from zhon.hanzi import punctuation

# Load the Chinese stopword list. A context manager guarantees the file
# handle is closed, and the encoding is pinned instead of relying on the
# platform's locale default (which breaks on non-UTF-8 systems).
# NOTE(review): assumes the stopword file is UTF-8 — confirm against the data.
with open('data/chinesestopword.txt', 'rt', encoding='utf-8') as stopword_file:
    stopwords = {line.strip() for line in stopword_file}

with open('data/oppo_r15.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pull the review text and the star rating out of each record.
review = [item['content'] for item in data]
score = [item['score'] for item in data]  # kept for later sentiment work

text = ''.join(review)  # one big string; jieba segments it in a single pass

# Extra tokens to drop beyond the stopword file (full-width '?' and space).
addword = {'?', ' '}
# Union of everything to exclude; set membership is O(1) per token.
# NOTE(review): the original tested `token in punctuation` on a string,
# which is a substring test — for the single-character punctuation tokens
# jieba emits, per-character set membership is equivalent.
excluded = set(punctuation) | addword | stopwords

# Segment and filter in one pass instead of three list rebuilds.
words_list = [w for w in jieba.cut(text) if w not in excluded]

a = Counter(words_list)
print(a.most_common(20))
让我们看一下具体的结果
('手机', 638), ('不错', 541), ('喜欢', 285), ('买', 255), ('京东', 156), ('挺', 145), ('感觉', 139), ('速度', 139)
效果很一般:这款手机主打拍照功能,但高频词中却没有出现"拍照"。
这边我们都是使用jieba的库来完成我们相应的任务,至于tf-idf,可以参考我的博客
"""Keyword extraction, method 2: TF-IDF ranking via jieba.analyse.extract_tags."""
import json

import jieba
import jieba.analyse

# Pin the encoding and close the handle deterministically via `with`.
with open('data/oppo_r15.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pull the review text and the star rating out of each record.
review = [item['content'] for item in data]
score = [item['score'] for item in data]  # kept for later sentiment work

text = ''.join(review)

# extract_tags ranks words by TF-IDF against jieba's built-in IDF corpus
# and its own stopword handling, so no manual stopword filtering is needed
# here (the original loaded a stopword list it never used — dropped).
keywords = jieba.analyse.extract_tags(text, topK=20)
print(keywords)
这边看一下结果
'手机', '不错', '喜欢', '京东', '拍照', '非常', '手感', '快递', '外观', '好看', 'OPPO', '感觉', '速度', '屏幕', '很漂亮', '收到'
这里看起来靠谱很多了
这里我尝试使用 jieba 和 snownlp 的库函数来完成相应的关键词提取
"""Keyword extraction, method 3: TextRank via jieba.analyse.textrank."""
import json

import jieba
import jieba.analyse

# Pin the encoding and close the handle deterministically via `with`.
with open('data/oppo_r15.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pull the review text and the star rating out of each record.
review = [item['content'] for item in data]
score = [item['score'] for item in data]  # kept for later sentiment work

text = ''.join(review)

# TextRank ranks words by random walks over a co-occurrence graph.
# Candidates are restricted to nouns / place names ('ns', 'n'), matching
# the original experiment. (The stopword list the original loaded was
# never used here — dropped.)
keywords_textrank = jieba.analyse.textrank(
    text, topK=30, withWeight=False, allowPOS=('ns', 'n')
)
print(keywords_textrank)
'手机', '京东', '外观', '速度', '感觉', '评价', '很漂亮', '有点', '手感', '屏幕', '物流', '用户', '效果', '内容'
不提了,结果太糟糕了
# Keyword extraction, method 4: SnowNLP's built-in keyword extractor.
from snownlp import SnowNLP

# `text` is the concatenated review string built earlier in the post.
analyzer = SnowNLP(text)
print(analyzer.keywords(20))