Python实现朴素贝叶斯算法 --- 过滤垃圾邮件

# -*- coding:utf-8 -*-
import numpy as np
import random
import re
__author__ = 'yangxin'
"""
过滤垃圾邮件
"""


class FilterSpam(object):

    # 分词操作(匹配任意的非单词字符)
    def text_parse(self, big_str):
        """Split raw text into lowercase word tokens longer than two characters.

        The text is split on runs of non-word characters; the resulting
        pieces are lowercased and every piece of length two or less
        (including empty strings) is discarded.

        Args:
            big_str: The text to tokenize.

        Returns:
            A list of lowercased tokens in their order of appearance. The
            list is empty when no piece is longer than two characters.
        """
        token_list = re.split(r'\W+', big_str)
        # re.split always yields at least one element (an empty string for
        # empty input), so an emptiness check on token_list can never fire;
        # the length filter below already drops those empty strings.
        return [tok.lower() for tok in token_list if len(tok) > 2]

    # 为单词列表去重
    def create_vocab_list(self, data_set):
        """Collect the distinct words that occur across all documents.

        Args:
            data_set: Iterable of documents, each an iterable of hashable
                words.

        Returns:
            A list containing every distinct word exactly once. The order
            is the iteration order of a set and is therefore unspecified.
        """
        vocab_set = set()
        for item in data_set:
            # Update in place instead of rebuilding the whole set with `|`
            # on every document, which re-copies the vocabulary each time.
            vocab_set.update(item)
        return list(vocab_set)

    # 标记列表中单词是否出现在输入的数据集中
    def set_of_words_to_vec(self, vocab_list, input_set):
        """Build a presence/absence vector of vocabulary words for a document.

        Args:
            vocab_list: Sequence of hashable vocabulary words; position i of
                the result corresponds to vocab_list[i].
            input_set: Iterable of hashable words taken from the document.

        Returns:
            A list of len(vocab_list) ints holding 1 at the position of each
            vocabulary word that occurs in input_set and 0 elsewhere. When a
            word is repeated in vocab_list only its first position is
            marked. Words not in the vocabulary are ignored.
        """
        # Index the vocabulary once so each lookup is O(1) instead of two
        # linear scans of the list per word. setdefault keeps the first
        # position of a repeated word, matching what list.index returns.
        first_index = {}
        for position, vocab_word in enumerate(vocab_list):
            first_index.setdefault(vocab_word, position)

        result = [0] * len(vocab_list)
        for word in input_set:
            position = first_index.get(word)
            if position is not None:
                # The word occurs in the document: mark its slot with 1.
                result[position] = 1
        return result

    # 训练朴素贝叶斯算法分类器
    def train_naive_bayes(self, train_mat, train_category):
      

你可能感兴趣的:(Python,Machine,Learning)