Topic Analysis of Xiaohongshu Notes

# -*- coding: UTF-8 -*-
import json
import pickle as pk

import numpy as np


def save_json(file, res):
    """
    Save a dict to a local JSON file (the file is created if it does not exist).
    :param file: JSON file name
    :param res: data to save
    :return:
    """
    with open(file, 'w', encoding='utf-8') as wr:
        json.dump(res, wr, ensure_ascii=False, indent=2)


def load_json(file):
    """
    Load a local JSON file into a dict.
    :param file: JSON file name
    :return: the parsed object
    """
    with open(file, 'r', encoding='utf-8') as reader:
        res = json.load(reader)
    return res


def save_cache(file, ob):
    """Pickle an object to a local cache file."""
    with open(file, 'wb') as wr:
        pk.dump(ob, wr)


def load_cache(file):
    """Load a pickled object from a local cache file."""
    with open(file, 'rb') as reader:
        ob = pk.load(reader)
    return ob

def read_vectors(path, topn):
    """
    Read the top `topn` word vectors from a word2vec-style text file
    (first line: "<vocab_size> <dim>", then one "<word> <v1> ... <vdim>" per line).
    topn == 0 means read all vectors.
    Returns (word -> vector dict, index -> word list, word -> index dict, dim).
    """
    lines_num, dim = 0, 0
    vectors = {}
    iw = []
    wi = {}
    with open(path, encoding='utf-8', errors='ignore') as f:
        first_line = True
        for line in f:
            if first_line:
                # The header line only carries the vocabulary size and the dimension.
                first_line = False
                dim = int(line.rstrip().split()[1])
                continue
            lines_num += 1
            tokens = line.rstrip().split(' ')
            vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    for i, w in enumerate(iw):
        wi[w] = i
    return vectors, iw, wi, dim

# 'sgns.weibo.word' is a pretrained Chinese word-vector file (SGNS vectors trained on
# a Weibo corpus, distributed by the open-source Chinese-Word-Vectors project);
# only the top 200,000 words are loaded here.
path = 'sgns.weibo.word'
topn = 200000
vectors, iw, wi, dim = read_vectors(path, topn)
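
# Optional sanity check (illustrative): confirm how many vectors were loaded
# and their dimensionality before going further.
print(len(iw), 'words loaded, dim =', dim)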

# A sample note description, used for a quick sanity check below.
u = '专业配近视散光度数,定制高度数散光近视! 专柜品质,配整套近视散光100️!'

import jieba


def get_sentence_embedding(text):
    """Embed a sentence as the mean of the word vectors of its jieba tokens."""
    words = jieba.lcut(text)
    res = []
    for w in words:
        # Skip out-of-vocabulary words.
        if w not in vectors:
            continue
        res.append(vectors[w])
    if not res:
        # No token was found in the vocabulary; fall back to a zero vector.
        return np.zeros(dim)
    return np.mean(res, axis=0)
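
# Optional sanity check (illustrative): the sentence embedding should be a 1-D
# vector of length `dim` (e.g. 300 for the pretrained Weibo vectors).
print(get_sentence_embedding(u).shape)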

import pandas as pd
import os

# Load the crawled notes; the '笔记描述' column holds each note's description text.
df = pd.read_excel('小红书博主_魅力坊眼镜店_.xlsx')
raw_corpus = [v for v in df['笔记描述'].values]

# Cache the sentence embeddings so repeated runs do not recompute them.
if not os.path.exists('embeding.pk'):
    resd = {}
    data = []
    for row in raw_corpus:
        res = get_sentence_embedding(row)
        resd[row] = res
        data.append(res)
    print('finish')
    save_cache('embeding.pk', resd)
else:
    resd = load_cache('embeding.pk')
    data = [resd[row] for row in raw_corpus]
    print('finish')

from sklearn.cluster import KMeans

# Stack the sentence embeddings into an (n_notes, dim) matrix and cluster into 4 topics.
X = np.array(data)
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)

# Export one (note text, cluster label) row per note.
pds = []
for text, label in zip(raw_corpus, kmeans.labels_):
    pds.append((text, label))
pd.DataFrame(pds, columns=['文本', '标签']).to_excel('魅力分析.xlsx')

First, find a shop on Taobao that offers Xiaohongshu scraping, and have the notes crawled either by keyword or by blogger name. The output of the crawl is an xlsx file.
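
Before feeding the file into the script, it can help to check that the crawled xlsx actually has the note-description column the code expects. This is a minimal sketch: the file name and the '笔记描述' column follow the script above, and dropping empty rows is only an optional extra step.

import pandas as pd

df = pd.read_excel('小红书博主_魅力坊眼镜店_.xlsx')
print(df.columns.tolist())             # should include '笔记描述'
print(len(df), 'notes in the crawl result')

# Optional: drop notes whose description is missing before computing embeddings.
df = df.dropna(subset=['笔记描述'])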

Python libraries

sklearn

pandas

jieba

numpy

Basic idea

KMeans performs unsupervised clustering, but its input has to be numeric feature vectors. So I use open-source pretrained word vectors to turn each note into a vector: the note text is segmented into words, and the vectors of those words are averaged into a single sentence vector.
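
To make this concrete, here is a minimal, self-contained sketch with made-up 2-dimensional "word vectors" (illustration only; the real script uses the pretrained Weibo vectors and jieba segmentation):

import numpy as np
from sklearn.cluster import KMeans

# Toy 2-D "word vectors" (made up purely for illustration).
toy_vectors = {
    '眼镜': np.array([1.0, 0.1]),
    '散光': np.array([0.9, 0.2]),
    '美食': np.array([0.1, 1.0]),
    '火锅': np.array([0.2, 0.9]),
}

# A sentence vector is the mean of the vectors of its words.
sentences = [['眼镜', '散光'], ['美食', '火锅']]
X = np.array([np.mean([toy_vectors[w] for w in s], axis=0) for s in sentences])

labels = KMeans(n_clusters=2, random_state=0).fit(X).labels_
print(labels)  # the two sentences end up in different clusters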

Finally, the KMeans result is exported; each note gets one cluster label.
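
To see what the clusters look like, one option (a small sketch assuming the '魅力分析.xlsx' file with its '文本' and '标签' columns produced above) is to count notes per label and print a few examples from each cluster:

import pandas as pd

res = pd.read_excel('魅力分析.xlsx')
print(res['标签'].value_counts())       # how many notes fall into each cluster

# Show a couple of example notes per cluster to name the topics manually.
for label, group in res.groupby('标签'):
    print('=== cluster', label, '===')
    print(group['文本'].head(2).tolist())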
