朴素贝叶斯分类留言标题

```python

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import recall_score

import pandas as pd

import jieba

def cut(text):
    """Segment a title with jieba and return the words as one string.

    Segmentation uses jieba's full mode (``cut_all=True``). The original
    author's example: '我爱北京天安门' yields the words
    '我', '爱', '北京', '天安', '天安门' (exact tokens depend on jieba's
    dictionary).

    :param text: the title to segment
    :return: a single string containing the segmented words separated by
        single spaces (not a list), ready to feed to a text vectorizer
    """
    words = jieba.cut(text, cut_all=True)
    return ' '.join(words)

def message_classification():
    """Train and evaluate a Naive Bayes classifier on message titles.

    Reads the '标题' column from '广告.csv' and '考研.csv' (paths relative to
    the current working directory), labels each title with '广告' or '考研'
    according to the file it came from, segments the titles with ``cut``,
    and then:

    1. splits the data 80/20 into train and test sets (``random_state=33``),
    2. converts the segmented titles to TF-IDF features,
    3. fits ``MultinomialNB`` wrapped in ``GridSearchCV`` with an empty
       parameter grid and ``cv=3`` (i.e. 3-fold cross-validation of the
       default model),
    4. prints the test accuracy, the cross-validation score and the
       weighted recall.

    :return: None; results are printed to stdout
    """
    # Load the local datasets and build the data / target lists together so
    # that they always stay aligned.
    labelled_files = (('广告', '广告.csv'), ('考研', '考研.csv'))
    data = []
    target = []
    for label, csv_path in labelled_files:
        # Empty cells are read by pandas as NaN (a float), which jieba cannot
        # segment; drop them and make sure every remaining title is a str.
        titles = pd.read_csv(csv_path)['标题'].dropna().astype(str).to_list()
        data.extend(cut(title) for title in titles)
        target.extend([label] * len(titles))

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=33)

    # Feature engineering: TF-IDF text extraction. The vocabulary is learned
    # from the training split only and then re-used for the test split.
    # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens
    # of two or more characters, so single-character words produced by
    # ``cut`` are ignored - confirm this is intended.
    transfer = TfidfVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # Naive Bayes estimator with 3-fold cross-validation. The parameter grid
    # is empty, so GridSearchCV only cross-validates the default model.
    estimator = GridSearchCV(MultinomialNB(), param_grid={}, cv=3)
    estimator.fit(x_train, y_train)

    # Model evaluation on the held-out test split.
    y_predict = estimator.predict(x_test)

    # Accuracy on the test set.
    print("准确率: ", estimator.score(x_test, y_test))

    # Mean cross-validated score obtained on the training split.
    print("综合值: ", estimator.best_score_)

    # NOTE(review): recall with average='weighted' is mathematically equal to
    # accuracy, so this line repeats the first metric; consider
    # average='macro' or per-class recall if a distinct number is wanted.
    print("召回率: ", recall_score(y_test, y_predict, average='weighted'))

# Run the full train/evaluate pipeline only when executed as a script.
if __name__ == "__main__":
    message_classification()

```

    Building prefix dict from the default dictionary ...

    Loading model from cache C:\Users\LOVEWE~1\AppData\Local\Temp\jieba.cache

    Loading model cost 1.345 seconds.

    Prefix dict has been built succesfully.

    准确率:  0.8650519031141869

    综合值:  0.8403288619645175

    召回率:  0.8650519031141869

你可能感兴趣的:(朴素贝叶斯分类留言标题)