Data processing for crawled Weibo posts: jieba word segmentation and stopword removal, TF-IDF keyword extraction, and LDA topic modeling.

The main steps: extract one column from a CSV file, segment it with jieba, remove stopwords, extract keywords with TF-IDF, and write the processed data back to a CSV file so it is ready for the steps that follow.

# -*- coding:utf-8 -*-
import jieba
import numpy as np
import pandas as pd
fileee = r"D:\PycharmProjects2020\qingganfenlei\data\weibo1.csv"  # raw string so the backslashes in the Windows path are not treated as escapes
# Read the 'name1' and 'txt' columns (if the CSV also has a label column such as 'biaoqian', add it here as well: it is needed for the classifier further down)
df = pd.read_csv(fileee, usecols=['name1', 'txt'], encoding='gbk')
# print(df)
df = df.dropna(axis=0, subset=["txt"])   # drop rows whose 'txt' value is missing; dropna returns a new DataFrame, so the result must be assigned back
# print(df.head(3))
# print(df.shape)

# Run jieba segmentation on the 'txt' column and collect each segmented line in content_S
txt = df.txt.values.tolist()
name = df.name1.values.tolist()
# print(txt[1])
content_S = []
for line in txt:
    current_segment = jieba.lcut(line)  # jieba.lcut returns the segmentation directly as a list of words
    if len(current_segment) > 1 and current_segment != ['\r\n']:  # skip rows that are empty or contain only a newline
        content_S.append(current_segment)
# print(content_S[0])
# Note: any row filtered out here makes content_S shorter than txt and name, which would break the combined DataFrame built below
# Put content_S into a one-column DataFrame
df_content = pd.DataFrame({'content_S': content_S})

# print(df_content.head())
# Load the stopword list
stopwords = pd.read_csv("data/结巴stopwords.txt", index_col=False, sep="\t", quoting=3, names=['stopword'], encoding='utf-8')
# print(stopwords.head(3))
# Define the stopword-removal function
def drop_stopwords(contents, stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:   # for a large corpus, converting stopwords to a set makes this lookup much faster
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words

# Turn contents and stopwords into plain lists, one entry per row
contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
# Get back the cleaned sentences and the flat list of all remaining words
contents_clean, all_words = drop_stopwords(contents, stopwords)
# Put the cleaned sentences into a DataFrame together with the original text and names
df_content = pd.DataFrame({'contents_clean': contents_clean, 'txt': txt, 'name': name})
print(df_content.head())
# To save extra columns, add them to pd.DataFrame({'contents_clean': contents_clean, 'txt': txt, ...}); each one must first be pulled out as a list, e.g. txt = df.txt.values.tolist()
df_content.to_csv(r'D:\PycharmProjects2020\qingganfenlei\data\weibo3.csv', columns=['name', 'txt', 'contents_clean'], mode='a', index=False, encoding='gbk')  # index=False: don't save the row index; columns: which columns to write; mode='a': append to the file
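Note that lists written with to_csv are stored as their string form ("['word1', 'word2']"), so they need to be parsed when read back. A minimal sketch of how weibo3.csv could be loaded again (ast.literal_eval is my own suggestion here; the original script does not show this step):

import ast
import pandas as pd

# Read the processed file back; this assumes a single header row was written
df_back = pd.read_csv(r'D:\PycharmProjects2020\qingganfenlei\data\weibo3.csv', encoding='gbk')
# ast.literal_eval turns the string "['w1', 'w2']" back into a real Python list
df_back['contents_clean'] = df_back['contents_clean'].apply(ast.literal_eval)
print(df_back['contents_clean'].head())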

# # Put the cleaned word list into its own DataFrame
# df_all_words = pd.DataFrame({'all_words': all_words})
# print(df_all_words.head())



# # TF-IDF: keyword extraction
# import jieba.analyse
# index = 0  # which row to inspect
# print(df['txt'][index])
# content_S_str = "".join(content_S[index])  # join this row's segmented words back into one string
# # print(content_S_str)
# print("  ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))

LDA: topic model


#   LDA topic model. Input format: a list of lists (the corpus must already be segmented into words)
from gensim import corpora, models, similarities
import gensim
# Build the word-to-id mapping, i.e. the dictionary behind the bag-of-words representation
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)  # like K-means, the number of topics is chosen by hand
# Result for topic number 1
# print(lda.print_topic(1, topn=5))


# for topic in lda.print_topics(num_topics=20, num_words=5):
#     print(topic[1])
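A short sketch of how the trained model could be inspected, both across topics and per document (print_topics and get_document_topics are standard gensim LdaModel methods; the indices are only examples):

# Top 5 words of each of the 20 topics
for topic_id, topic_words in lda.print_topics(num_topics=20, num_words=5):
    print(topic_id, topic_words)

# Topic distribution of the first document: a list of (topic_id, probability) pairs
print(lda.get_document_topics(corpus[0]))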
# Build the training frame from the cleaned words plus the label column of the original data
# ('biaoqian' is only available if it was included in usecols when reading weibo1.csv above)
df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df['biaoqian']})
# print(df_train.tail())
# print(df_train.label.unique())


# Map the text labels to numbers (note: the crawled data itself has no label column, so this part only works once labels have been added)
label_mapping = {"好": 3, "坏": 4}
df_train['label'] = df_train['label'].map(label_mapping)
# print(df_train.head())
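Any label value outside this mapping becomes NaN after .map(), and MultinomialNB below cannot fit on NaN targets. A small defensive step (my addition, not in the original script) would drop those rows first:

# Drop rows whose label did not map to 3 or 4; otherwise fitting the classifier fails
df_train = df_train.dropna(subset=['label'])
df_train['label'] = df_train['label'].astype(int)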


# Split into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values, random_state=1)
# print(x_train[0][1])

# Join each list of words back into one space-separated string; ' '.join(x_train[line_index]) does the list-to-string conversion
words = []
for line_index in range(len(x_train)):
    try:
        # x_train[line_index][word_index] = str(x_train[line_index][word_index])
        words.append(' '.join(x_train[line_index]))
    except:
        print (line_index)

# print(words[0])  # each entry now reads like one whole document
# print (len(words))

# Build bag-of-words count vectors with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
vec.fit(words)
# Import the naive Bayes classifier and fit it on vec.transform(words) (the features) and y_train (the labels)
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)
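A quick sanity check of what the vectorizer learned (illustrative only, not part of the original script): one row per training document, at most 4000 feature columns.

X_train_vec = vec.transform(words)
print(X_train_vec.shape)       # (number of training documents, number of features, capped at 4000)
print(len(vec.vocabulary_))    # size of the learned vocabulary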

# Model evaluation
from sklearn.metrics import classification_report
# Apply the same list-to-string conversion to the test set
test_words = []
for line_index in range(len(x_test)):
    try:
        # x_test[line_index][word_index] = str(x_test[line_index][word_index])
        test_words.append(' '.join(x_test[line_index]))
    except:
        print(line_index)
# print(test_words[0])
print(classifier.score(vec.transform(test_words), y_test))  # accuracy on the test set
# print(vec.transform(words))
predict=classifier.predict(vec.transform(test_words))
print(predict)
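classification_report is imported above but never used; a small follow-up (my addition) that prints per-class precision, recall and F1 for these predictions:

# Per-class precision, recall and F1 of the naive Bayes predictions
print(classification_report(y_test, predict))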







# # Build TF-IDF weighted vectors instead of raw counts; the resulting accuracy tends to be a bit higher
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(analyzer='word', max_features=4000,  lowercase = False)
# vectorizer.fit(words)
#
# from sklearn.naive_bayes import MultinomialNB
# classifier = MultinomialNB()
# classifier.fit(vectorizer.transform(words), y_train)
#
# print(classifier.score(vectorizer.transform(test_words), y_test))
