# -*- coding: UTF-8 -*-
import numpy as np
import pickle as pk
import json
def save_json(file, res):
    """
    Save a dict to a local JSON file (the file is created automatically).
    :param file: JSON file name
    :param res: data to save
    :return:
    """
    with open(file, 'w', encoding='utf-8') as wr:
        json.dump(res, wr, ensure_ascii=False, indent=2)
def load_json(file):
    """
    Read a local JSON file and return the resulting dict.
    :param file: JSON file name
    :return:
    """
    with open(file, 'r', encoding='utf-8') as reader:
        res = json.load(reader)
    return res
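# Example round trip for the JSON helpers (hypothetical file name, for illustration only):
# save_json('meta.json', {'source': 'xiaohongshu'})
# meta = load_json('meta.json')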
def save_cache(file, ob):
    # Pickle an arbitrary Python object to disk.
    with open(file, 'wb') as wr:
        pk.dump(ob, wr)
def load_cache(file):
    # Load a pickled object from disk.
    with open(file, 'rb') as reader:
        ob = pk.load(reader)
    return ob
def read_vectors(path, topn):
    """Read the top n word vectors from a text embedding file (topn=0 reads all)."""
    lines_num, dim = 0, 0
    vectors = {}
    iw = []  # index -> word
    wi = {}  # word -> index
    with open(path, encoding='utf-8', errors='ignore') as f:
        first_line = True
        for line in f:
            if first_line:
                # The first line is a header: "<vocab_size> <dim>".
                first_line = False
                dim = int(line.rstrip().split()[1])
                continue
            lines_num += 1
            tokens = line.rstrip().split(' ')
            vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    for i, w in enumerate(iw):
        wi[w] = i
    return vectors, iw, wi, dim
# Pretrained Chinese word vectors (sgns.weibo.word, trained on Weibo text); keep the top 200k entries.
path = 'sgns.weibo.word'
topn = 200000
vectors, iw, wi, dim = read_vectors(path, topn)
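# Sanity check: report vocabulary size and embedding dimensionality after loading.
print(f'loaded {len(iw)} word vectors of dimension {dim}')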
# Sample note text: an ad for prescription glasses for myopia/astigmatism.
u = '专业配近视散光度数,定制高度数散光近视! 专柜品质,配整套近视散光100️!'
import jieba
def get_sentence_embedding(text):
    # Tokenize with jieba, look up each word's vector, and mean-pool over the words.
    words = jieba.lcut(text)
    res = [vectors[w] for w in words if w in vectors]
    if not res:
        # No word in the text is covered by the vocabulary: fall back to a zero vector.
        return np.zeros(dim)
    return np.mean(res, axis=0)
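# Demo: embed the sample ad text above; the result is a dim-dimensional vector.
print(get_sentence_embedding(u).shape)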
import pandas as pd
# Notes crawled from Xiaohongshu; the '笔记描述' column holds each note's text.
df = pd.read_excel('小红书博主_魅力坊眼镜店_.xlsx')
raw_corpus = df['笔记描述'].tolist()
import os
if not os.path.exists('embeding.pk'):
    # First run: embed every note and cache the results to disk.
    resd = {}
    data = []
    for row in raw_corpus:
        res = get_sentence_embedding(row)
        resd[row] = res
        data.append(res)
    print('finish')
    save_cache('embeding.pk', resd)
else:
    # Subsequent runs: reuse the cached embeddings.
    resd = load_cache('embeding.pk')
    data = [resd[row] for row in raw_corpus]
    print('finish')
from sklearn.cluster import KMeans
# Cluster the note embeddings into 4 groups.
X = np.array(data)
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
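# Inspect how many notes fall into each cluster.
print(np.bincount(kmeans.labels_))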
# Export the result: each note paired with its cluster label.
pds = list(zip(raw_corpus, kmeans.labels_))
pd.DataFrame(pds, columns=['文本', '标签']).to_excel('魅力分析.xlsx', index=False)
First, find a Xiaohongshu scraping service on Taobao, then crawl notes by keyword or by blogger name. The crawl produces an xlsx file.
Dependencies: numpy, pandas, scikit-learn, jieba, openpyxl (needed by pandas for xlsx files).
KMeans performs unsupervised clustering, but its input must be vector features. I therefore use open-source pretrained word vectors to convert each text into vectors, averaging the vectors of the words in each text to get a single sentence embedding.
Finally, the KMeans results are exported: each note is assigned a cluster label.
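For a quick qualitative check of the clusters, here is a minimal sketch (assuming the '魅力分析.xlsx' file exported above) that reads the result back and prints a few notes per label:

import pandas as pd
out = pd.read_excel('魅力分析.xlsx')
for label, group in out.groupby('标签'):
    print(label, group['文本'].head(3).tolist())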