写在之前
本书涉及的源程序和数据都可以在以下网站中找到:http://guidetodatamining.com/
这本书理论比较简单,书中错误较少,动手锻炼较多,如果每个代码都自己写出来,收获不少。总结:适合入门。
欢迎转载,转载请注明出处,如有问题欢迎指正。
合集地址:https://www.zybuluo.com/hainingwyx/note/559139
概率及朴素贝叶斯
特点:分类并给出概率。
先验概率:P(h)
后验概率/条件概率:P(h|d)
# 训练
class Classifier:
    """Naive Bayes classifier for categorical attributes.

    Training counts how often each class and each (class, column, value)
    combination occurs and converts the counts into probabilities:

    * ``self.prior[category]`` -- P(h): the fraction of all training
      instances that are labelled ``category``.
    * ``self.conditional[category][col][value]`` -- P(D|h): the fraction
      of the instances of ``category`` whose attribute in column ``col``
      (attribute columns are numbered from 1) equals ``value``.
    """

    # training
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """Build the classifier from the bucket files.

        The files named ``<bucketPrefix>-01`` through ``<bucketPrefix>-10``
        are read, except the one numbered ``testBucketNumber``.

        ``dataFormat`` is a tab-separated string that names the role of
        each column of the data files: ``num`` columns (converted to
        float) and ``attr`` columns become attributes, the ``class``
        column holds the label, and any other column (e.g. ``comment``)
        is ignored.  For the iHealth data the format is four ``attr``
        columns followed by ``class``.

        Blank lines are skipped; fields beyond the end of the format are
        ignored.

        Raises:
            ValueError: if a non-blank line provides no ``class`` column.
        """
        total = 0
        classes = {}    # category -> number of training instances
        counts = {}     # category -> column -> value -> occurrences
        self.format = dataFormat.strip().split('\t')
        self.prior = {}
        self.conditional = {}
        # for each of the buckets numbered 1 through 10:
        for bucket in range(1, 11):
            # skip the bucket that is held out for testing
            if bucket == testBucketNumber:
                continue
            filename = "%s-%02i" % (bucketPrefix, bucket)
            # the context manager closes the file even if reading fails
            with open(filename) as f:
                lines = f.readlines()
            for line in lines:
                stripped = line.strip()
                if not stripped:
                    # a blank line carries no instance
                    continue
                fields = stripped.split('\t')
                vector = []
                category = None
                for kind, value in zip(self.format, fields):
                    if kind == 'num':
                        vector.append(float(value))
                    elif kind == 'attr':
                        vector.append(value)
                    elif kind == 'class':
                        category = value
                if category is None:
                    # never count an instance under a stale label
                    raise ValueError(
                        "no 'class' column in line %r of %s" % (line, filename))
                # now process this instance
                total += 1
                classes[category] = classes.get(category, 0) + 1
                columnCounts = counts.setdefault(category, {})
                # now process each attribute of the instance
                for col, columnValue in enumerate(vector, start=1):
                    valueCounts = columnCounts.setdefault(col, {})
                    valueCounts[columnValue] = valueCounts.get(columnValue, 0) + 1
        # ok done counting. now compute probabilities
        # first prior probabilities p(h)
        for category, count in classes.items():
            self.prior[category] = count / total
        # now compute conditional probabilities p(D|h)
        for category, columns in counts.items():
            self.conditional[category] = {
                col: {attrValue: count / classes[category]
                      for attrValue, count in valueCounts.items()}
                for col, valueCounts in columns.items()
            }
        # raw counts; not read by this class, kept as a public attribute
        self.tmp = counts

    # classification
    def classify(self, itemVector):
        """Return the class we think itemVector is in.

        ``itemVector`` lists the attribute values in column order.  Each
        class is scored as its prior multiplied by the conditional
        probability of every attribute value; a value never seen with a
        class contributes a factor of 0.  The class of the largest
        ``(probability, category)`` tuple is returned, so ties on
        probability go to the greater category.
        """
        results = []
        for category, prior in self.prior.items():
            prob = prior
            for col, attrValue in enumerate(itemVector, start=1):
                # unseen value for this category -> factor 0
                prob = prob * self.conditional[category][col].get(attrValue, 0)
            results.append((prob, category))
        # return the category with the highest probability
        return max(results)[1]
# Demo: train on the iHealth buckets with bucket 10 held out, then
# classify a single instance and print the predicted class.
c = Classifier(
    "iHealth/i",
    10,
    "attr\tattr\tattr\tattr\tclass",
)
prediction = c.classify(['health', 'moderate', 'moderate', 'yes'])
print(prediction)
问题:当存在某个概率为0时,直接主导整个贝叶斯的计算过程,即使其他的独立事件的条件概率接近于1。此外,基于样本集估计出来概率往往是真实概率的偏低估计。
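改进:将
$$P(x \mid y) = \frac{n_c}{n}$$
修改为
$$P(x \mid y) = \frac{n_c + m p}{n + m}$$
其中 $n$ 是 y 事件总数,$n_c$ 是 y 中 x 事件总数,$m$ 是等效样本容量,通常的确定方法是:$m$ 取该属性可选取值的个数;$p$ 是该属性取值概率的先验估计,通常假设均匀分布,即 $p = 1/m$。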
当处理的数据是连续的时候,有两种解决办法。一是离散化,构建类别;一是假设概率分布服从高斯分布,然后计算概率。
样本标准差:
$$s = \sqrt{\frac{\sum_{i=1}^{n} (x_i - \bar{x})^2}{n - 1}}$$
对于样本集而言,与总体标准差的计算公式(除以 n)相比,样本标准差的计算公式(除以 n-1)是总体标准差的更优估计。
# pdf计算实现
def pdf(mean, ssd, x):
    """Probability Density Function computing P(x|y).

    Evaluates the Gaussian density at x, where mean and ssd are the mean
    and the sample standard deviation of all the items in y.

    Raises ZeroDivisionError when ssd is 0.
    """
    # exponent of the Gaussian: -(x - mean)^2 / (2 * ssd^2)
    ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
    return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * ePart
# 连续数据的训练
class Classifier:
    """Naive Bayes classifier for categorical and numeric attributes.

    Categorical (``attr``) columns are modelled with value frequencies;
    numeric (``num``) columns are modelled per class with a Gaussian
    whose parameters are the mean and the sample standard deviation.
    The two kinds of column are numbered independently, each from 1.

    Attributes set by training:

    * ``self.prior[category]`` -- P(h) for each class.
    * ``self.conditional[category][col][value]`` -- P(D|h) for each
      categorical column.
    * ``self.means[category][col]`` / ``self.ssd[category][col]`` --
      mean and sample standard deviation of each numeric column.
    * ``self.totals[category][col]`` -- sum of each numeric column.
    """

    # training on continuous data
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """Build the classifier from the bucket files.

        The files named ``<bucketPrefix>-01`` through ``<bucketPrefix>-10``
        are read, except the one numbered ``testBucketNumber``.

        ``dataFormat`` is a tab-separated string that names the role of
        each column of the data files: ``num`` columns are numeric
        attributes (converted to float), ``attr`` columns are categorical
        attributes, the ``class`` column holds the label, and any other
        column (e.g. ``comment``) is ignored.  For the iHealth data the
        format is four ``attr`` columns followed by ``class``.

        Blank lines are skipped; fields beyond the end of the format are
        ignored.

        Raises:
            ValueError: if a non-blank line provides no ``class`` column.
            ZeroDivisionError: if a class that has numeric attributes
                occurs only once (the sample standard deviation divides
                by the class count minus one).
        """
        total = 0
        classes = {}        # category -> number of training instances
        # counts used for attributes that are not numeric
        counts = {}         # category -> column -> value -> occurrences
        # totals used for attributes that are numeric; together with the
        # raw values they give the mean and sample standard deviation
        # for each attribute - class pair
        totals = {}         # category -> column -> sum of values
        numericValues = {}  # category -> column -> list of values
        self.format = dataFormat.strip().split('\t')
        self.prior = {}
        self.conditional = {}
        # for each of the buckets numbered 1 through 10:
        for bucket in range(1, 11):
            # skip the bucket that is held out for testing
            if bucket == testBucketNumber:
                continue
            filename = "%s-%02i" % (bucketPrefix, bucket)
            # the context manager closes the file even if reading fails
            with open(filename) as f:
                lines = f.readlines()
            for line in lines:
                stripped = line.strip()
                if not stripped:
                    # a blank line carries no instance
                    continue
                fields = stripped.split('\t')
                vector = []
                nums = []
                category = None
                for kind, value in zip(self.format, fields):
                    if kind == 'num':
                        nums.append(float(value))
                    elif kind == 'attr':
                        vector.append(value)
                    elif kind == 'class':
                        category = value
                if category is None:
                    # never count an instance under a stale label
                    raise ValueError(
                        "no 'class' column in line %r of %s" % (line, filename))
                # now process this instance
                total += 1
                classes[category] = classes.get(category, 0) + 1
                columnCounts = counts.setdefault(category, {})
                columnTotals = totals.setdefault(category, {})
                columnValues = numericValues.setdefault(category, {})
                # now process each non-numeric attribute of the instance
                for col, columnValue in enumerate(vector, start=1):
                    valueCounts = columnCounts.setdefault(col, {})
                    valueCounts[columnValue] = valueCounts.get(columnValue, 0) + 1
                # process numeric attributes
                for col, columnValue in enumerate(nums, start=1):
                    columnTotals[col] = columnTotals.get(col, 0) + columnValue
                    columnValues.setdefault(col, []).append(columnValue)
        #
        # ok done counting. now compute probabilities
        #
        # first prior probabilities p(h)
        #
        for category, count in classes.items():
            self.prior[category] = count / total
        #
        # now compute conditional probabilities p(D|h)
        #
        for category, columns in counts.items():
            self.conditional[category] = {
                col: {attrValue: count / classes[category]
                      for attrValue, count in valueCounts.items()}
                for col, valueCounts in columns.items()
            }
        # raw counts; not read by this class, kept as a public attribute
        self.tmp = counts
        #
        # now compute mean and sample standard deviation
        #
        self.means = {}
        self.totals = totals
        for category, columns in totals.items():
            self.means[category] = {
                col: cTotal / classes[category]
                for col, cTotal in columns.items()
            }
        # standard deviation
        self.ssd = {}
        for category, columns in numericValues.items():
            self.ssd[category] = {}
            for col, values in columns.items():
                theMean = self.means[category][col]
                sumOfSquareDifferences = sum(
                    (value - theMean) ** 2 for value in values)
                # n - 1 in the denominator: sample standard deviation
                self.ssd[category][col] = math.sqrt(
                    sumOfSquareDifferences / (classes[category] - 1))

    # classification with continuous data
    def classify(self, itemVector, numVector):
        """Return the class we think the item is in.

        ``itemVector`` lists the categorical attribute values and
        ``numVector`` the numeric attribute values, each in column
        order.  Each class is scored as its prior multiplied by the
        conditional probability of every categorical value (0 for a
        value never seen with the class) and by the Gaussian density of
        every numeric value.  The class of the largest
        ``(probability, category)`` tuple is returned, so ties on
        probability go to the greater category.

        Raises ZeroDivisionError when a numeric column has a sample
        standard deviation of 0 for some class.
        """
        results = []
        sqrt2pi = math.sqrt(2 * math.pi)
        for category, prior in self.prior.items():
            prob = prior
            for col, attrValue in enumerate(itemVector, start=1):
                # unseen value for this category -> factor 0
                prob = prob * self.conditional[category][col].get(attrValue, 0)
            for col, x in enumerate(numVector, start=1):
                mean = self.means[category][col]
                ssd = self.ssd[category][col]
                # Gaussian density of x under this class's distribution
                ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
                prob = prob * ((1.0 / (sqrt2pi * ssd)) * ePart)
            results.append((prob, category))
        # return the category with the highest probability
        return max(results)[1]
贝叶斯和kNN的比较
- 贝叶斯优点:实现简单,和其他方法相比需要的训练数据更少
- 贝叶斯缺点:不能学习到特征之间的相互作用。
- kNN优点:实现简单,不用考虑数据特定的结构;训练集很大的时候是一个合理的选择。
- kNN缺点:需要大量的内存来存储训练集。
许多真实数据挖掘问题中,很多属性不是独立的。有时候可以假设独立。之所以称朴素贝叶斯是因为尽管知道不成立仍然假设属性之间是独立的。