题目描述
汽车评价数据集
共1728个数据,每个数据特征为6维,分为4类,类别标记为unacc,acc,good,vgood
四个类别标记分别表示汽车性价比等级(由低到高)
unacc:1210个
acc:384个
good:69个
vgood:65个
6个特征分别为:(6个属性)
1、buying (取值:vhigh、high、med、low) 表示购买价格
2、maint (取值:vhigh、high、med、low) 表示维修价格
3、door (取值:2、3、4、5more) 车门数量
4、Persons (取值:2、4、more) 可容纳人数
5、Lug_boot (取值:small、med、big) 行李箱大小
6、Safety (取值:low、med、high) 安全系数
链接:http://archive.ics.uci.edu/ml/datasets/Car+Evaluation
实验完成要求:
1.仔细阅读并了解实验数据集;
2.使用任何一种熟悉的计算机语言(比如 C,Java或者MATLAB)实现朴素贝叶斯算法;
3.利用朴素贝叶斯算法在训练数据上学习分类器,训练数据的大小分别设置为:前100个数据,前200个数据,前500个数据,前700个数据,前1000个数据,前1350个数据;
4.利用测试数据对学习的分类器进行性能评估;
5.演示实验,提交代码,统计分析实验结果并上交实验报告;
开始做题
想要实现贝叶斯分类器,可以分为两个部分,一个是训练,另一部分是检验。
训练即将贝叶斯公式用代码语言描述,具体的贝叶斯公式这里就不赘述了
更艰难的是数据的分类,需要大量的重复性代码
其实绕来绕去就是一个统计+计算判断
代码实现
运行结果
import csv
import random
#import pandas
#数据导入及分成两份
def loadcsv(name):
    """Read the CSV file at ``name`` and return its rows.

    Args:
        name: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty row; each entry is the list of
        string fields produced by ``csv.reader``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The csv module asks for newline='' so that it can handle line endings
    # itself; the ``with`` block closes the file even if parsing fails.
    with open(name, 'r', newline='') as f:
        # Blank lines parse as empty rows; drop them so that callers can
        # index the columns of every row safely.
        return [row for row in csv.reader(f) if row]
def randDivision(dataset, trainSize):
    """Randomly split ``dataset`` into a training part and the remainder.

    Rows are drawn one at a time, without replacement, from a shallow copy
    of ``dataset``; the list passed in is left untouched.

    Args:
        dataset: Sequence of rows to split.
        trainSize: Number of rows to move into the training part. If it
            exceeds ``len(dataset)`` every row ends up in the training part.

    Returns:
        A two-element list ``[train, rest]`` where ``train`` holds the drawn
        rows (in draw order) and ``rest`` holds the rows that were not drawn
        (in their original relative order).
    """
    copy = list(dataset)
    train = []
    # Stop as soon as the pool is exhausted: random.randrange(0) would raise
    # ValueError if we kept drawing from an empty list.
    while len(train) < trainSize and copy:
        index = random.randrange(len(copy))
        train.append(copy.pop(index))
    return [train, copy]
#初始化一些数据
# Five independent count tables with identical layout: one row per attribute
# column (0-5), one float slot per value that attribute can take
# (4, 4, 4, 3, 3, 3 slots respectively).  data1 is filled for every row;
# the other four are filled per class label.
data1, dataunacc, dataacc, datagood, dataVgood = (
    [[0.0] * width for width in (4, 4, 4, 3, 3, 3)] for _ in range(5)
)
# Number of rows seen per class, in the order unacc, acc, good, vgood.
datavip = [0, 0, 0, 0]
#统计函数,将具体个数的多少进行统计
def stat(dataset):
    """Tally attribute-value counts for every row of ``dataset``.

    Each row is read as six attribute values (columns 0-5) followed by a
    class label (column 6).  The six values are always counted into
    ``data1``.  When the label is one of 'unacc', 'acc', 'good' or 'vgood'
    the matching slot of ``datavip`` is incremented and the values are also
    counted into that class's table; rows with any other label contribute
    to ``data1`` only.

    Args:
        dataset: Iterable of rows, each indexable at positions 0-6.

    Raises:
        IndexError: If a row has fewer than seven fields.
    """
    # Class label -> (slot in datavip, count table for that class).
    per_class = {
        'unacc': (0, dataunacc),
        'acc': (1, dataacc),
        'good': (2, datagood),
        'vgood': (3, dataVgood),
    }
    # One counting helper per attribute column, in column order.
    adders = (addcount, addmaint, adddoor, addperson, addlug, addsafty)

    for row in dataset:
        # Read the label first so that a short row fails before any table
        # has been touched.
        vip = row[6]
        targets = [data1]
        if vip in per_class:
            slot, table = per_class[vip]
            datavip[slot] += 1
            targets.append(table)
        for table in targets:
            for column, add in enumerate(adders):
                add(row[column], table)
## Helper functions called by stat() above
def addcount(count, data):
    """Increment the row-0 counter of ``data`` that matches ``count``.

    Args:
        count: Value of attribute column 0; one of 'vhigh', 'high', 'med',
            'low'.  Any other value is ignored.
        data: Count table; ``data[0]`` must have at least four slots.
    """
    slot = {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3}.get(count)
    if slot is not None:
        data[0][slot] += 1
def addmaint(maint, data):
    """Increment the row-1 counter of ``data`` that matches ``maint``.

    Args:
        maint: Value of attribute column 1; one of 'vhigh', 'high', 'med',
            'low'.  Any other value is ignored.
        data: Count table; ``data[1]`` must have at least four slots.
    """
    slot = {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3}.get(maint)
    if slot is not None:
        data[1][slot] += 1
def adddoor(door, data):
    """Increment the row-2 counter of ``data`` that matches ``door``.

    Args:
        door: Value of attribute column 2 as a string; one of '2', '3',
            '4', '5more'.  Any other value is ignored.
        data: Count table; ``data[2]`` must have at least four slots.
    """
    slot = {'2': 0, '3': 1, '4': 2, '5more': 3}.get(door)
    if slot is not None:
        data[2][slot] += 1
def addperson(persons, data):
    """Increment the row-3 counter of ``data`` that matches ``persons``.

    Args:
        persons: Value of attribute column 3 as a string; one of '2', '4',
            'more'.  Any other value is ignored.
        data: Count table; ``data[3]`` must have at least three slots.
    """
    slot = {'2': 0, '4': 1, 'more': 2}.get(persons)
    if slot is not None:
        data[3][slot] += 1
def addlug(lug, data):
    """Increment the row-4 counter of ``data`` that matches ``lug``.

    Args:
        lug: Value of attribute column 4; one of 'small', 'med', 'big'.
            Any other value is ignored.
        data: Count table; ``data[4]`` must have at least three slots.
    """
    slot = {'small': 0, 'med': 1, 'big': 2}.get(lug)
    if slot is not None:
        data[4][slot] += 1
def addsafty(safty, data):
    """Increment the row-5 counter of ``data`` that matches ``safty``.

    Args:
        safty: Value of attribute column 5; one of 'low', 'med', 'high'.
            Any other value is ignored.
        data: Count table; ``data[5]`` must have at least three slots.
    """
    slot = {'low': 0, 'med': 1, 'high': 2}.get(safty)
    if slot is not None:
        data[5][slot] += 1
##将具体的个数转化为概率
def getP(num, data):
    """Divide every entry of ``data`` by ``num``, in place.

    Used to turn a table of counts into a table of relative frequencies.

    Args:
        num: Divisor.  When it is 0 the table is left unchanged instead of
            raising ZeroDivisionError.
        data: List of lists of numbers; each inner list is updated in place.
    """
    if num == 0:
        return
    for row in data:
        # Slice assignment keeps the same inner list objects, so other
        # references to them see the new values.
        row[:] = [value / num for value in row]
## The functions below return probabilities; during testing they act as table lookups
def getR0(data, t):
    """Return the row-0 entry of ``data`` that corresponds to ``t``.

    Args:
        data: Table whose row 0 has one slot per value of attribute
            column 0, in the order 'vhigh', 'high', 'med', 'low'.
        t: The attribute value to look up.

    Returns:
        The stored entry, or 0 when ``t`` is not one of the four known
        values (the same fallback ``getR1`` uses, so an unknown value
        zeroes the product instead of yielding None).
    """
    slot = {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3}.get(t)
    if slot is None:
        return 0
    return data[0][slot]
def getR1(data, maint):
    """Return the row-1 entry of ``data`` that corresponds to ``maint``.

    Args:
        data: Table whose row 1 has one slot per value of attribute
            column 1, in the order 'vhigh', 'high', 'med', 'low'.
        maint: The attribute value to look up.

    Returns:
        The stored entry, or 0 when ``maint`` is not one of the four known
        values.
    """
    slot = {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3}.get(maint)
    if slot is None:
        return 0
    return data[1][slot]
def getR2(data, door):
    """Return the row-2 entry of ``data`` that corresponds to ``door``.

    Args:
        data: Table whose row 2 has one slot per value of attribute
            column 2, in the order '2', '3', '4', '5more'.
        door: The attribute value to look up, as a string.

    Returns:
        The stored entry, or 0 when ``door`` is not one of the four known
        values (the same fallback ``getR1`` uses, so an unknown value
        zeroes the product instead of yielding None).
    """
    slot = {'2': 0, '3': 1, '4': 2, '5more': 3}.get(door)
    if slot is None:
        return 0
    return data[2][slot]
def getR3(data, persons):
    """Return the row-3 entry of ``data`` that corresponds to ``persons``.

    Args:
        data: Table whose row 3 has one slot per value of attribute
            column 3, in the order '2', '4', 'more'.
        persons: The attribute value to look up, as a string.

    Returns:
        The stored entry, or 0 when ``persons`` is not one of the three
        known values (the same fallback ``getR1`` uses, so an unknown value
        zeroes the product instead of yielding None).
    """
    slot = {'2': 0, '4': 1, 'more': 2}.get(persons)
    if slot is None:
        return 0
    return data[3][slot]
def getR4(data, lug):
    """Return the row-4 entry of ``data`` that corresponds to ``lug``.

    Args:
        data: Table whose row 4 has one slot per value of attribute
            column 4, in the order 'small', 'med', 'big'.
        lug: The attribute value to look up.

    Returns:
        The stored entry, or 0 when ``lug`` is not one of the three known
        values (the same fallback ``getR1`` uses, so an unknown value
        zeroes the product instead of yielding None).
    """
    slot = {'small': 0, 'med': 1, 'big': 2}.get(lug)
    if slot is None:
        return 0
    return data[4][slot]
def getR5(data, safty):
    """Return the row-5 entry of ``data`` that corresponds to ``safty``.

    Args:
        data: Table whose row 5 has one slot per value of attribute
            column 5, in the order 'low', 'med', 'high'.
        safty: The attribute value to look up.

    Returns:
        The stored entry, or 0 when ``safty`` is not one of the three known
        values (the same fallback ``getR1`` uses, so an unknown value
        zeroes the product instead of yielding None).
    """
    slot = {'low': 0, 'med': 1, 'high': 2}.get(safty)
    if slot is None:
        return 0
    return data[5][slot]
def getR6(num):
    """Map a class index to its label string.

    Args:
        num: Class index; 0, 1, 2 or 3.

    Returns:
        "unacc", "acc", "good" or "vgood" respectively, or None for any
        other index.
    """
    labels = {0: "unacc", 1: "acc", 2: "good", 3: "vgood"}
    return labels.get(num)
# Running tally of classification outcomes, updated by test():
#   rate[0] counts rows whose predicted class differs from their label,
#   rate[1] counts rows whose predicted class matches it.
rate = [0,0]
##检验函数
def test(testset):
    """Classify every row of ``testset`` and tally the outcome in ``rate``.

    For each of the four classes a score is formed by multiplying that
    class's entry in ``datavip`` by the six per-attribute values looked up
    in that class's table.  The class with the largest score (as chosen by
    ``getbig``) is the prediction; ``rate[1]`` is incremented when its label
    equals column 6 of the row and ``rate[0]`` otherwise.

    Args:
        testset: Iterable of rows, each indexable at positions 0-6.
    """
    # Per-class tables in the same order as the slots of datavip.
    tables = (dataunacc, dataacc, datagood, dataVgood)
    # One lookup helper per attribute column, in column order.
    lookups = (getR0, getR1, getR2, getR3, getR4, getR5)

    for line in testset:
        scores = []
        for prior, table in zip(datavip, tables):
            score = prior
            for column, lookup in enumerate(lookups):
                score = score * lookup(table, line[column])
            scores.append(score)
        k = getbig(*scores)
        if line[6] != getR6(k):
            rate[0] = rate[0] + 1
        else:
            rate[1] = rate[1] + 1
## Return the index of the largest of the four numbers
def getbig(r1,r2,r3,r4):
    """Return the position (0-3) of the largest of the four arguments.

    When several arguments share the maximum, the position of the first
    one is returned.

    Args:
        r1: Value at position 0.
        r2: Value at position 1.
        r3: Value at position 2.
        r4: Value at position 3.

    Returns:
        An int in the range 0-3.
    """
    values = (r1, r2, r3, r4)
    # tuple.index returns the first match, which gives the tie-breaking
    # order described above.
    return values.index(max(values))
##总函数,调用上面的所有函数
def training(size):
    """Train on a random subset of ``dataset`` and report test accuracy.

    Splits the module-level ``dataset`` with ``randDivision``, counts the
    training rows with ``stat``, converts the per-class counts to relative
    frequencies with ``getP``, classifies the remaining rows with ``test``
    and prints the fraction that were classified correctly.

    Args:
        size: Number of rows to use for training.
    """
    # Start every run from clean tallies.  The tables, datavip and rate are
    # module-level and are updated in place, so without this reset a second
    # call would add new counts on top of the previous run's probabilities
    # and would report an accuracy accumulated over all runs so far.
    for table in (data1, dataunacc, dataacc, datagood, dataVgood):
        for row in table:
            row[:] = [0.0] * len(row)
    datavip[:] = [0, 0, 0, 0]
    rate[:] = [0, 0]

    trainSet, testSet = randDivision(dataset, size)
    stat(trainSet)
    for class_total, table in zip(datavip, (dataunacc, dataacc, datagood, dataVgood)):
        # A class with no training rows has an all-zero table already;
        # skipping it avoids dividing by zero.
        if class_total:
            getP(class_total, table)
    test(testSet)
    checked = rate[0] + rate[1]
    # No rows left to test (size >= len(dataset)): report 0.0 rather than
    # dividing by zero.
    Rate = rate[1] / checked if checked else 0.0
    print("当训练数据大小为{0}个时,剩余数据的检测正确率为{1}".format(size, Rate))
if __name__=="__main__":
    import os

    # Same relative location as before ("..\..\data\car.csv" on Windows),
    # built with os.path.join so that it has no invalid backslash escapes
    # and also resolves on platforms whose separator is "/".
    name = os.path.join("..", "..", "data", "car.csv")
    dataset = loadcsv(name)
    # Training-set sizes listed in the assignment statement at the top of
    # this file; the 1000-row run was missing.
    for size in (100, 200, 500, 700, 1000, 1350):
        training(size)