给定一个训练集training-data.txt,构建朴素贝叶斯分类器,并对如下测试用例作出推断:
X1 = (age <=30, Income = medium, Student = yes, Credit_rating = Fair)
X2 = (31 <= age <= 40, Income = high, Student = no, Credit_rating = Fair)
X3 = (age > 40, Income = medium, Student = no, Credit_rating = Fair)
training-data.txt:
age income student credit_rating buys_computer
<=30 high no fair no
<=30 high no excellent no
31…40 high no fair yes
>40 medium no fair yes
>40 low yes fair yes
>40 low yes excellent no
31…40 low yes excellent yes
<=30 medium no fair no
<=30 low yes fair yes
>40 medium yes fair yes
<=30 medium yes excellent yes
31…40 medium no excellent yes
31…40 high yes fair yes
>40 medium no excellent no
解题思路:首先读入训练数据,使用朴素贝叶斯分类来预测未知元组的类标号。利用列表统计待测元组的每个属性取值在各个类别(yes/no)下出现的次数,由此可以得出每个类别的P(x|c)P(c),取该值较大的类别作为未知元组的类标号。
以下是python代码实现过程:
# Load the training table. The context manager guarantees the file handle is
# closed even if parsing fails part-way through.
with open('training-data.txt', 'r') as train_file:
    # split() accepts tabs or spaces between columns; blank lines are skipped
    # so that a trailing newline is not later counted as a "no" sample.
    train_list = [line.split() for line in train_file if line.strip()]
# Drop the header row (age income student credit_rating buys_computer).
del train_list[0]
for row in train_list:
    # The middle age bucket is written "31<ellipsis>40" in the data file and
    # the ellipsis bytes depend on the file encoding, so match on the ASCII
    # ends instead of one specific byte sequence.
    if row[0].startswith('31') and row[0].endswith('40'):
        row[0] = '31_40'
print(train_list)
# Class frequencies: the last column of every row is the buys_computer label.
yes_count = 0
no_count = 0
for row in train_list:
    if row[-1] == 'yes':
        yes_count += 1
    else:
        no_count += 1
# Prior probabilities P(yes) and P(no), rounded to three decimals.
p_yes = round(yes_count / float(len(train_list)), 3)
p_no = round(no_count / float(len(train_list)), 3)
def _class_score(match_counts, class_total, sample_total):
    """Return P(X|C) * P(C) for one class, or 0.0 when the class is empty."""
    if class_total == 0:
        # No sample of this class was seen, so it cannot be predicted.
        return 0.0
    product = 1
    for count in match_counts:
        product *= count
    likelihood = product / float(class_total ** len(match_counts))
    prior = class_total / float(sample_total)
    return likelihood * prior


def bayes(age_data, income_data, isstudent, cr_data, data=None):
    """Classify one sample with naive Bayes and return the verdict string.

    Args:
        age_data: age bucket of the sample, e.g. '<=30', '31_40' or '>40'.
        income_data: income level of the sample.
        isstudent: student flag of the sample ('yes' / 'no').
        cr_data: credit rating of the sample.
        data: optional list of training rows; each row holds the attribute
            values followed by the class label ('yes' or anything else,
            which is treated as 'no'). Defaults to the module-level
            ``train_list``.

    Returns:
        "it belongs to this class:yes" when the 'yes' score is strictly
        larger than the 'no' score, otherwise "it belongs to this class:no".

    Raises:
        ValueError: if the training data is empty.

    The two scores P(X|C)P(C) are printed rounded to four decimals, but the
    decision is taken on the unrounded values so that small scores are not
    flattened to zero before they are compared.
    """
    rows = train_list if data is None else data
    if not rows:
        raise ValueError('training data is empty')
    x = [age_data, income_data, isstudent, cr_data]
    yes_total = 0
    no_total = 0
    # x_yes[j] / x_no[j]: rows of that class whose j-th attribute equals x[j].
    x_yes = [0] * len(x)
    x_no = [0] * len(x)
    for row in rows:
        if row[-1] == 'yes':
            yes_total += 1
            hits = x_yes
        else:
            no_total += 1
            hits = x_no
        # row[:-1] excludes the class label stored in the last column.
        for j, (value, wanted) in enumerate(zip(row[:-1], x)):
            if value == wanted:
                hits[j] += 1
    score_yes = _class_score(x_yes, yes_total, len(rows))
    score_no = _class_score(x_no, no_total, len(rows))
    # %-formatting keeps the output identical on Python 2 and Python 3.
    print('P(X|buy_computer=yes)P(buy_computer=yes)= %s' % round(score_yes, 4))
    print('P(X|buy_computer=no)P(buy_computer=no)= %s' % round(score_no, 4))
    if score_yes > score_no:
        return "it belongs to this class:yes"
    return "it belongs to this class:no"
# Classify the three test samples X1, X2 and X3 from the problem statement.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(bayes('<=30', 'medium', 'yes', 'fair'))
print(bayes('31_40', 'high', 'no', 'fair'))
print(bayes('>40', 'medium', 'no', 'fair'))