# Supervised learning algorithm — Naive Bayes (text sentiment classification demo)

import pandas as pd
# CountVectorizer仅对出现的词频进行统计,转化为稀疏矩阵
from sklearn.feature_extraction.text import CountVectorizer
#  TfidfVectorizer对词频进行统计的情况下,进行加权计算
# 计算Tf:词出现在文本中的次数,与文本中总词数的比
# 计算idf:出现词的文本数量与总文本数量的比值
# 其值为Tf*idf
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the labelled review data (GBK-encoded CSV).
data = pd.read_csv("data.csv", encoding="gbk")
print(data.columns)
# NOTE(review): the content column name really ends with a space — keep it.
data_x = data["内容 "]
data_y = data["评价"]
# Build the stopword set: one word per line, stripped of whitespace.
# A set gives O(1) membership tests during segmentation below.
with open("stopwords.txt", "r", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f}
# Tokenize every document with jieba and drop stopwords.
# Surviving tokens are re-joined with spaces: CountVectorizer and
# TfidfVectorizer split documents on word boundaries, so concatenating
# tokens without any delimiter (the original `fill += seg` bug) would
# collapse each document into a single unusable "word".
import jieba

data_x_current = []  # cleaned, space-delimited documents
for doc in data_x:  # each raw review string
    tokens = [tok for tok in jieba.cut(doc) if tok not in stopwords]
    data_x_current.append(" ".join(tokens))
# print(data_x_current)

# Turn the cleaned documents into numeric feature matrices.
# CountVectorizer produces raw term counts as a sparse matrix —
# conceptually similar to dummy encoding, e.g. pandas.get_dummies(detail["dishes_name"]).
vector = CountVectorizer()
X = vector.fit_transform(data_x_current)  # sparse document-term count matrix
# print(X)
# print(X.toarray())  # dense array form of the counts
# print(vector.get_feature_names())
# TfidfVectorizer re-weights the term counts by tf-idf.
# NOTE(review): the name `re` shadows the stdlib `re` module — rename if regexes are ever needed.
re = TfidfVectorizer()
re_s = re.fit_transform(data_x_current)  # sparse tf-idf feature matrix
# print(re_s)
# Encode the target labels numerically: truthy for "好评" (positive
# review), falsy otherwise.
# The original computed y twice — a list comprehension
# ([1 if i == "好评" else 0 for i in data_y]) immediately overwritten by
# the dummy-encoding below. Only the version that actually took effect
# is kept; both produce equivalent 0/1 labels for a two-class target.
y = pd.get_dummies(data_y)["好评"]
print(y)
from sklearn.naive_bayes import MultinomialNB

# --- Model 1: multinomial Naive Bayes on the raw term counts ---
mult_bayes = MultinomialNB()
# First 10 rows serve as the training set.
mult_bayes.fit(X[:10], y[:10])
# Remaining rows serve as the test set (fixed typo: `resule` -> `result`).
result = mult_bayes.predict(X[10:])
print(result)
# Held-out accuracy (was computed but never displayed in the original).
score = mult_bayes.score(X[10:], y[10:])
print(score)

# --- Model 2: multinomial Naive Bayes on the tf-idf features ---
mult_bayes2 = MultinomialNB()
mult_bayes2.fit(re_s[:10], y[:10])
result2 = mult_bayes2.predict(re_s[10:])
print(result2)
score2 = mult_bayes2.score(re_s[10:], y[10:])
print(score2)

# (blog footer) You may also be interested in: algorithms