贝叶斯分类器python,实验一 贝叶斯分类器的python实现

题目描述

汽车评价数据集

共1728个数据,每个数据特征为6维,分为4类,类别标记为unacc、acc、good、vgood

四个类别标记分别表示汽车性价比等级(由低到高)

unacc:1210个

acc:384个

good:69个

vgood:65个

6个特征分别为:(6个属性)

1、buying (取值:vhigh、high、med、low) 表示购买价格

2、maint (取值:vhigh、high、med、low) 表示维修价格

3、door (取值:2、3、4、5more) 车门数量

4、Persons (取值:2、4、more) 可容纳人数

5、Lug_boot (取值:small、med、big) 行李箱大小

6、Safety (取值:low、med、high) 安全系数

链接:http://archive.ics.uci.edu/ml/datasets/Car+Evaluation

实验完成要求:

1.仔细阅读并了解实验数据集;

2.使用任何一种熟悉的计算机语言(比如 C,Java或者MATLAB)实现朴素贝叶斯算法;

3.利用朴素贝叶斯算法在训练数据上学习分类器,训练数据的大小分别设置为:前100个数据,前200个数据,前500个数据,前700个数据,前1000个数据,前1350个数据;

4.利用测试数据对学习的分类器进行性能评估;

5.演示实验,提交代码,统计分析实验结果并上交实验报告;

开始做题

想要实现贝叶斯分类器,可以分为两个部分,一个是训练,另一部分是检验。

训练即将贝叶斯公式用代码语言描述,具体的贝叶斯公式这里就不赘述了

更艰难的是数据的分类,需要大量的重复性代码

其实绕来绕去就是一个统计+计算判断

代码实现

运行结果

import csv

import random

#import pandas

# Data loading and train/test splitting
def loadcsv(name):
    """Read the CSV file *name* and return its rows as a list of lists.

    Each row is a list of strings, one per comma-separated field.
    Completely blank lines (which ``csv.reader`` yields as ``[]``) are
    dropped so that later code can index into every row.
    """
    # ``with`` guarantees the handle is closed; ``newline=''`` is what the
    # csv module expects so it can handle line endings itself.
    with open(name, 'r', newline='') as handle:
        return [row for row in csv.reader(handle) if row]

def randDivision(dataset , trainSize):
    """Randomly split *dataset* into a training part and a remainder.

    ``trainSize`` rows are drawn without replacement into the training
    list; the rows that were not drawn keep their original relative order.
    *dataset* itself is not modified.

    Returns a two-element list ``[train, rest]``.
    """
    remaining = list(dataset)
    train = []
    # Never ask for more rows than exist: random.randrange(0) would raise.
    target = min(trainSize, len(remaining))
    while len(train) < target:
        index = random.randrange(len(remaining))
        train.append(remaining.pop(index))
    return [train, remaining]

# Initialise the shared tables.
# Every table has one row per feature column (0..5); the row widths
# 4, 4, 4, 3, 3, 3 are the number of distinct values of each feature.
# data1 covers all rows; the other four cover one class label each.
data1, dataunacc, dataacc, datagood, dataVgood = (
    [[0.0] * width for width in (4, 4, 4, 3, 3, 3)] for _ in range(5)
)

# Per-class sample counts, in the order unacc, acc, good, vgood.
datavip = [0] * 4

# Counting pass: tally how many times each feature value occurs
def _tally_row(row, table):
    """Add one observation of *row*'s six feature values to *table*."""
    adders = (addcount, addmaint, adddoor, addperson, addlug, addsafty)
    # zip stops after the six feature columns, so the label is not touched.
    for add, value in zip(adders, row):
        add(value, table)


def stat(dataset):
    """Accumulate feature-value counts from *dataset* into the global tables.

    Each row is expected to hold six feature values followed by the class
    label in ``row[6]``.  Every row is counted in ``data1``; rows whose
    label is one of ``unacc``/``acc``/``good``/``vgood`` are also counted
    in the matching per-class table and in ``datavip``.  Rows with fewer
    than seven fields are skipped.
    """
    # label -> (index into datavip, per-class table)
    class_tables = {
        'unacc': (0, dataunacc),
        'acc': (1, dataacc),
        'good': (2, datagood),
        'vgood': (3, dataVgood),
    }
    for row in dataset:
        if len(row) < 7:
            continue
        _tally_row(row, data1)
        entry = class_tables.get(row[6])
        if entry is None:
            # Unknown label: counted in data1 only, as before.
            continue
        slot, table = entry
        datavip[slot] += 1
        _tally_row(row, table)

# Helpers used by stat(): one counter function per feature column
def addcount(count, data):
    """Increment the buying-price counter in ``data[0]`` that matches *count*.

    Columns are ordered ``vhigh, high, med, low``.  Any other value is
    ignored and *data* is left unchanged.
    """
    levels = ('vhigh', 'high', 'med', 'low')
    if count in levels:
        data[0][levels.index(count)] += 1

def addmaint(maint, data):
    """Increment the maintenance-price counter in ``data[1]`` for *maint*.

    Columns are ordered ``vhigh, high, med, low``.  Any other value is
    ignored and *data* is left unchanged.
    """
    levels = ('vhigh', 'high', 'med', 'low')
    if maint in levels:
        data[1][levels.index(maint)] += 1

def adddoor(door, data):
    """Increment the door-count counter in ``data[2]`` that matches *door*.

    Columns are ordered ``'2', '3', '4', '5more'`` (values are strings).
    Any other value is ignored and *data* is left unchanged.
    """
    options = ('2', '3', '4', '5more')
    if door in options:
        data[2][options.index(door)] += 1

def addperson(persons, data):
    """Increment the seating-capacity counter in ``data[3]`` for *persons*.

    Columns are ordered ``'2', '4', 'more'`` (values are strings).  Any
    other value is ignored and *data* is left unchanged.
    """
    options = ('2', '4', 'more')
    if persons in options:
        data[3][options.index(persons)] += 1

def addlug(lug, data):
    """Increment the luggage-boot counter in ``data[4]`` that matches *lug*.

    Columns are ordered ``small, med, big``.  Any other value is ignored
    and *data* is left unchanged.
    """
    sizes = ('small', 'med', 'big')
    if lug in sizes:
        data[4][sizes.index(lug)] += 1

def addsafty(safty, data):
    """Increment the safety counter in ``data[5]`` that matches *safty*.

    Columns are ordered ``low, med, high``.  Any other value is ignored
    and *data* is left unchanged.
    """
    levels = ('low', 'med', 'high')
    if safty in levels:
        data[5][levels.index(safty)] += 1

# Convert raw counts into probabilities
def getP(num, data):
    """Divide every entry of the nested list *data* by *num*, in place.

    When *num* is 0 the table is left untouched instead of raising
    ``ZeroDivisionError``; this happens for a class that received no
    training samples.
    """
    if num == 0:
        return
    for row in data:
        for col, value in enumerate(row):
            row[col] = value / num

# Lookup helpers: return the stored value for a feature value (used by test())
def getR0(data, t):
    """Return the ``data[0]`` entry for buying-price value *t*.

    Columns are ordered ``vhigh, high, med, low``.  Returns 0 for any
    other value, matching ``getR1``, so the caller's product stays numeric.
    """
    levels = ('vhigh', 'high', 'med', 'low')
    if t in levels:
        return data[0][levels.index(t)]
    return 0

def getR1(data, maint):
    """Return the ``data[1]`` entry for maintenance-price value *maint*.

    Columns are ordered ``vhigh, high, med, low``.  Returns 0 for any
    other value.
    """
    levels = ('vhigh', 'high', 'med', 'low')
    if maint in levels:
        return data[1][levels.index(maint)]
    return 0

def getR2(data, door):
    """Return the ``data[2]`` entry for door-count value *door*.

    Columns are ordered ``'2', '3', '4', '5more'`` (values are strings).
    Returns 0 for any other value, matching ``getR1``.
    """
    options = ('2', '3', '4', '5more')
    if door in options:
        return data[2][options.index(door)]
    return 0

def getR3(data, persons):
    """Return the ``data[3]`` entry for seating-capacity value *persons*.

    Columns are ordered ``'2', '4', 'more'`` (values are strings).
    Returns 0 for any other value, matching ``getR1``.
    """
    options = ('2', '4', 'more')
    if persons in options:
        return data[3][options.index(persons)]
    return 0

def getR4(data, lug):
    """Return the ``data[4]`` entry for luggage-boot value *lug*.

    Columns are ordered ``small, med, big``.  Returns 0 for any other
    value, matching ``getR1``.
    """
    sizes = ('small', 'med', 'big')
    if lug in sizes:
        return data[4][sizes.index(lug)]
    return 0

def getR5(data, safty):
    """Return the ``data[5]`` entry for safety value *safty*.

    Columns are ordered ``low, med, high``.  Returns 0 for any other
    value, matching ``getR1``.
    """
    levels = ('low', 'med', 'high')
    if safty in levels:
        return data[5][levels.index(safty)]
    return 0

def getR6(num):
    """Map a class index (0..3) to its label string.

    0 -> ``unacc``, 1 -> ``acc``, 2 -> ``good``, 3 -> ``vgood``.
    Returns ``None`` for any other value.
    """
    labels = {0: "unacc", 1: "acc", 2: "good", 3: "vgood"}
    return labels.get(num)

# Evaluation tally: rate[0] counts wrong predictions, rate[1] correct ones.
rate = [0] * 2

# Evaluation pass
def test(testset):
    """Classify every row of *testset* and tally the outcome in ``rate``.

    For each of the four classes the score is the class sample count
    (``datavip``) multiplied by the six per-feature values looked up in
    that class's table.  The class with the highest score is the
    prediction; ``rate[1]`` is incremented when it equals ``line[6]`` and
    ``rate[0]`` otherwise.  Rows with fewer than seven fields are skipped.
    """
    class_tables = (dataunacc, dataacc, datagood, dataVgood)
    lookups = (getR0, getR1, getR2, getR3, getR4, getR5)
    for line in testset:
        if len(line) < 7:
            continue
        scores = []
        for class_count, table in zip(datavip, class_tables):
            score = class_count
            # zip stops after the six feature columns; the label is unused here.
            for lookup, value in zip(lookups, line):
                factor = lookup(table, value)
                # A lookup may yield None for an unrecognised value; count
                # that as 0 so the product stays numeric.
                score *= 0 if factor is None else factor
            scores.append(score)
        k = getbig(*scores)
        if line[6] != getR6(k):
            rate[0] += 1
        else:
            rate[1] += 1

# Return the index of the largest of four numbers
def getbig(r1, r2, r3, r4):
    """Return the position (0..3) of the largest argument.

    When several arguments tie for the maximum, the lowest position wins.
    """
    values = (r1, r2, r3, r4)
    return values.index(max(values))

# Driver: split, count, normalise, evaluate and report
def training(size):
    """Train on *size* randomly chosen rows and report accuracy on the rest.

    Reads the module-level ``dataset``.  All shared tables and tallies are
    cleared first so that consecutive calls do not contaminate each other,
    then the accuracy on the remaining rows is printed.
    """
    class_tables = (dataunacc, dataacc, datagood, dataVgood)

    # Reset in place: the tables are module globals shared between calls,
    # and after a previous run they hold probabilities, not counts.
    for table in (data1,) + class_tables:
        for row in table:
            row[:] = [0.0] * len(row)
    datavip[:] = [0] * len(datavip)
    rate[:] = [0] * len(rate)

    trainSet, testSet = randDivision(dataset, size)
    stat(trainSet)
    for class_count, table in zip(datavip, class_tables):
        # A class with no training samples has an all-zero table already;
        # skip it rather than divide by zero.
        if class_count:
            getP(class_count, table)
    test(testSet)

    total = rate[0] + rate[1]
    # An empty test set gives no evidence; report 0.0 instead of crashing.
    Rate = rate[1] / total if total else 0.0
    print("当训练数据大小为{0}个时,剩余数据的检测正确率为{1}".format(size, Rate))

if __name__=="__main__":
    # Raw string: same path as before, but the backslashes are no longer
    # parsed as (invalid) escape sequences.
    name = r"..\..\data\car.csv"
    dataset = loadcsv(name)
    # Training-set sizes required by the assignment (1000 was missing).
    for size in (100, 200, 500, 700, 1000, 1350):
        training(size)

你可能感兴趣的:(贝叶斯分类器python)