朴素贝叶斯分类

给定训练样本
('Nobody owns water.','good');

('the quick rabbit jumps fences','good');

('buy pharmaceuticals','bad');

('make quick money at the online casino','bad');

('the quick brown fox jumps','good');

如何判断一个新的样本'quick money'是good还是bad呢,最常用的办法就是朴素贝叶斯分类

朴素贝叶斯分类的步骤大致如下:
1.根据样本集判断每个词属于各个分类的可能性。
  也就是计算一个词的文档频度df
2.对待分类文本中的每一个词,计算相应的df,利用贝叶斯公式,把所有的df相乘,结果在乘以p(目录)的值,就算出了当前文本属于一个分类的概率
import re
import math

def getwords(doc):
	spliter=re.compile('\\W*')
	words=[s.lower() for s in spliter.split(doc) if len(s)>2 and len(s)<20]
	return dict([(w,1) for w in words]);

def sampletrain(cl):
	cl.train('Nobody owns water.','good');
	cl.train('the quick rabbit jumps fences','good');
	cl.train('buy pharmaceuticals','bad');
	cl.train('make quick money at the online casino','bad');
	cl.train('the quick brown fox jumps','good');
	
class classifier:
	def __init__(self,getfeatures,filename=None):
		self.fc={}
		self.cc={}
		self.getfeatures=getfeatures
		self.thresholds={}
		
	def setthreshold(self,cat,t):
		self.thresholds[cat]=t
		
	def getthreshold(self,cat):
		if cat not in self.thresholds: return 1.0
		return self.thresholds[cat]
		
	def classify(self,item,default=None):
		probs={}
		max=0.0
		for cat in self.categories():
			probs[cat]=self.prob(item,cat)
			if probs[cat]>max:
				max=probs[cat]
				best=cat
		
		for cat in probs:
			if cat==best:continue
			if probs[cat]*self.getthreshold(best)>probs[best]:return default
			
		return best
		
	def incf(self,f,cat):
		self.fc.setdefault(f,{})
		self.fc[f].setdefault(cat,0)
		self.fc[f][cat]+=1
		
	def incc(self,cat):
		self.cc.setdefault(cat,0)
		self.cc[cat]+=1
	
	def fcount(self,f,cat):
		if f in self.fc and cat in self.fc[f]:
			return float(self.fc[f][cat])
		return 0.0
		
	def catcount(self,cat):
		if cat in self.cc:
			return float(self.cc[cat])
		return 0.0
		
	def totalcount(self):
		return sum(self.cc.values())
		
	def categories(self):
		return self.cc.keys()
	
	def train(self,item,cat):
		features=self.getfeatures(item)
		for f in features:
			self.incf(f,cat)	
		self.incc(cat)
		
	def fprob(self,f,cat):
		if self.catcount(cat)==0:return 0
		return self.fcount(f,cat)/self.catcount(cat)
		
	def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
		basicprob=prf(f,cat)
		totals=sum([self.fcount(f,c) for c in self.categories()])
		bp=((weight*ap)+(totals*basicprob))/(weight+totals)
		return bp
		
class nativebayes(classifier):
	def docprob(self,item,cat):
		features=self.getfeatures(item)
		p=1
		for f in features:p*=self.weightedprob(f,cat,self.fprob)
		return p
		
	def prob(self,item,cat):
		catprob=self.catcount(cat)/self.totalcount()
		docprob=self.docprob(item,cat)
		return docprob*catprob
		
		
		

你可能感兴趣的:(C++,c,F#,C#)