Naive Bayes Algorithm: A Python Implementation

The listing below implements a naive Bayes document classifier: getwords extracts a document's words as features, the classifier base class accumulates feature and category counts during training, and the naivebayes subclass scores a document against each category as P(document | category) * P(category), returning the best category only when it beats the others by a configurable threshold.

# -*- coding: utf-8 -*-
# A naive Bayes document classifier (docclass)
# Usage:
#   >>> import docclass
#   >>> c1 = docclass.naivebayes(docclass.getwords)
#   >>> docclass.sampletrain(c1)
#   >>> c1.classify('quick rabbit', default='unknown')
import re


# Build a small labeled training set: two categories, 'good' and 'bad'
def sampletrain(c1):
    c1.train('Nobody owns the water.', 'good')
    c1.train('the quick rabbit jumps fences', 'good')
    c1.train('buy pharmaceuticals now', 'bad')
    c1.train('make quick money at the online casino', 'bad')
    c1.train('the quick brown fox jumps', 'good')

# Extract features from a document: the set of words it contains
def getwords(doc):
    splitter = re.compile(r'\W+')
    # Split on runs of non-alphanumeric characters, keeping
    # lowercased words between 3 and 19 characters long
    words = [s.lower() for s in splitter.split(doc)
             if len(s) > 2 and len(s) < 20]
    # Return only the unique set of words
    return dict([(w, 1) for w in words])
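# Example (output shown for Python 3, where dicts keep insertion order):
#   >>> getwords('the quick Brown fox jumps')
#   {'the': 1, 'quick': 1, 'brown': 1, 'fox': 1, 'jumps': 1}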

# Generic classifier: accumulates feature and category counts from training
class classifier:
    def __init__(self, getfeatures):
        # Counts of feature/category combinations: {feature: {category: count}}
        self.fc = {}
        # Counts of documents in each category: {category: count}
        self.cc = {}
        # Function used to extract features from an item (here, getwords)
        self.getfeatures = getfeatures
        # Per-category decision thresholds used by classify
        self.thresholds = {}

    # Require cat's probability to beat the runner-up by a factor of t
    def setthreshold(self, cat, t):
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    # Increase the count of a feature/category pair
    def incf(self, f, cat):
        self.fc.setdefault(f, {})
        self.fc[f].setdefault(cat, 0)
        self.fc[f][cat] += 1

    # Increase the count of a category
    def incc(self, cat):
        self.cc.setdefault(cat, 0)
        self.cc[cat] += 1

    # Number of times a feature has appeared in a category
    def fcount(self, f, cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0

    # Number of items in a category
    def catcount(self, cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0.0

    # Total number of items seen during training
    def totalcount(self):
        return sum(self.cc.values())

    # All known categories
    def categories(self):
        return self.cc.keys()

    # Train on an item: count each of its features under the given category
    def train(self, item, cat):
        features = self.getfeatures(item)
        for f in features:
            self.incf(f, cat)
        self.incc(cat)

    # P(feature | category): fraction of items in cat containing the feature
    def fprob(self, f, cat):
        if self.catcount(cat) == 0:
            return 0
        return self.fcount(f, cat) / self.catcount(cat)

    # Smoothed P(feature | category): blend the measured probability with
    # an assumed prior ap (weighted as if it had been seen `weight` times),
    # so features with few observations don't yield extreme probabilities
    def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
        # Current (unsmoothed) estimate
        basicprob = prf(f, cat)
        # Total number of times this feature appears across all categories
        totals = sum([self.fcount(f, c) for c in self.categories()])
        # Weighted average of the assumed and the measured probability
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp
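    # Worked example (after sampletrain): 'money' never appears in a 'good'
    # document, so fprob('money', 'good') is 0.0, but it appears once
    # overall, so weightedprob gives (1*0.5 + 1*0.0)/(1+1) = 0.25 instead
    # of ruling the category out entirely.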

    # Return the most probable category for an item, falling back to
    # default when the winner doesn't beat every rival by its threshold.
    # (self.prob is supplied by the subclass, e.g. naivebayes below.)
    def classify(self, item, default=None):
        probs = {}
        # Find the category with the highest probability
        maxprob = 0.0
        best = None
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        if best is None:
            return default
        # Make sure the winner exceeds threshold * every other probability
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best
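
    # A quick sanity check of the counting methods, using the five
    # sampletrain documents above (values are easy to verify by hand):
    #   >>> c1 = classifier(getwords)
    #   >>> sampletrain(c1)
    #   >>> c1.fcount('quick', 'good')   # 2 of the 3 'good' docs contain it
    #   2.0
    #   >>> c1.fprob('quick', 'good')
    #   0.6666666666666666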
            
class naivebayes(classifier):

    # P(document | category): the "naive" independence assumption lets us
    # multiply the individual (smoothed) feature probabilities together
    def docprob(self, item, cat):
        features = self.getfeatures(item)
        p = 1
        for f in features:
            p *= self.weightedprob(f, cat, self.fprob)
        return p

    # Unnormalized P(category | document) = P(document | category) * P(category),
    # by Bayes' rule; the shared denominator P(document) is dropped since it
    # doesn't change the ranking of categories
    def prob(self, item, cat):
        catprob = self.catcount(cat) / self.totalcount()
        docprob = self.docprob(item, cat)
        return docprob * catprob
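
Finally, a minimal end-to-end run. This sketch assumes the listing above is saved as a module and executed directly; the expected outputs, shown in the comments, follow from the five sampletrain sentences and can be checked by hand:

if __name__ == '__main__':
    c1 = naivebayes(getwords)
    sampletrain(c1)
    # 'quick rabbit' shares features with the 'good' training documents
    print(c1.classify('quick rabbit', default='unknown'))  # -> good
    print(c1.classify('quick money', default='unknown'))   # -> bad
    # Raising the threshold for 'bad' demands stronger evidence, so the
    # same borderline document now falls back to the default
    c1.setthreshold('bad', 3.0)
    print(c1.classify('quick money', default='unknown'))   # -> unknown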