前段时间学习python3的基础知识,现在做一些数据挖掘方面的小练习。
今天要做的是贝叶斯分类器,数据来源为老师的weather.arff。首先要读取这个文件,要用到string的方法(http://write.blog.csdn.net/postlist)、文件读取方法(http://write.blog.csdn.net/postlist)和arff文件格式(http://blog.sina.com.cn/s/blog_9d40b61301012xci.html),程序如下:
# -*- coding: utf-8 -*-
import re
import sys


def readArff(fileName):
    """Read the data rows of an ARFF file.

    Lines starting with '@' (header declarations) or '%' (comments) and
    blank lines are skipped. Every remaining line is split on commas into
    a list of string fields.

    Each parsed row is printed as it is read, and the full list of rows is
    printed once at the end.

    Args:
        fileName: Path of the ARFF file to read.

    Returns:
        A list of rows, each row being a list of field strings.
    """
    data = []
    # ``with`` closes the handle even if parsing raises part-way through.
    with open(fileName, 'r') as arffFile:
        for line in arffFile:
            row = line.strip()
            # Skip blank lines, '@' header lines and '%' comment lines.
            if not row or row.startswith(('@', '%')):
                continue
            # Strip each field so "sunny, 85" yields '85', not ' 85'.
            k = [value.strip() for value in row.split(',')]
            data.append(k)
            print(k)
    print(data)
    return data


if __name__ == '__main__':
    # An optional command-line argument overrides the default location.
    if len(sys.argv) > 1:
        fileName = sys.argv[1]
    else:
        fileName = r'C:\Users\Administrator\Desktop\exepirenment\classifill\data\weather.arff'
    readArff(fileName)
输出结果如下:
['sunny', '85', '85', 'FALSE', 'no'] ['sunny', '80', '90', 'TRUE', 'no'] ['overcast', '83', '86', 'FALSE', 'yes'] ['rainy', '70', '96', 'FALSE', 'yes'] ['rainy', '68', '80', 'FALSE', 'yes'] ['rainy', '65', '70', 'TRUE', 'no'] ['overcast', '64', '65', 'TRUE', 'yes'] ['sunny', '72', '95', 'FALSE', 'no'] ['sunny', '69', '70', 'FALSE', 'yes'] ['rainy', '75', '80', 'FALSE', 'yes'] ['sunny', '75', '70', 'TRUE', 'yes'] ['overcast', '72', '90', 'TRUE', 'yes'] ['overcast', '81', '75', 'FALSE', 'yes'] ['rainy', '71', '91', 'TRUE', 'no']
# -*- coding: utf-8 -*-
import re
import sys
import bisect

data = []  # module-level data set, filled by readArff() and read by bayesion()

# Bin boundaries for the numeric columns at index 1 and 2 of each row
# (presumably temperature and humidity of weather.arff -- confirm against
# the file's @attribute header). bisect.bisect maps a value to 0, 1 or 2,
# which selects 'L', 'M' or 'H' from _BIN_LABELS.
_BREAKPOINTS_COL1 = (70, 80)
_BREAKPOINTS_COL2 = (80, 90)
_BIN_LABELS = 'LMH'


def readArff(fileName):
    """Append the data rows of an ARFF file to the module-level ``data``.

    Lines starting with '@' or '%' and blank lines are skipped; every other
    line is split on commas into a list of string fields.

    Args:
        fileName: Path of the ARFF file to read.

    Returns:
        The module-level ``data`` list (the same object that was extended).
    """
    # ``with`` closes the handle even if parsing raises part-way through.
    with open(fileName, 'r') as arffFile:
        for line in arffFile:
            row = line.strip()
            if not row or row.startswith(('@', '%')):
                continue
            # ``data`` is mutated in place, so no ``global`` is required.
            data.append([value.strip() for value in row.split(',')])
    return data


def _class_score(rows, testData, total):
    """Return prior * product of per-attribute frequencies for one class."""
    if not rows:
        # An empty class has prior 0; returning early avoids dividing by
        # len(rows) == 0 below.
        return 0.0
    score = len(rows) / total
    for i, value in enumerate(testData):
        count = sum(1 for row in rows if row[i] == value)  # matching rows
        score *= count / len(rows)  # accumulate the joint probability
    return score


def bayesion(testData):
    """Classify ``testData`` with a naive Bayes rule over the global ``data``.

    Rows of ``data`` whose last field is 'yes' form class 1, all other rows
    form class 2. For each class the score is the class prior multiplied by
    the relative frequency of every ``testData`` value at the same column
    index. No smoothing is applied, so a value never seen in a class drives
    that class's score to zero.

    The verdict is printed and also returned.

    Args:
        testData: List of attribute values, already binned the same way as
            ``data``, without the class label.

    Returns:
        'yes' if class 1 scores strictly higher, otherwise 'no'.

    Raises:
        ValueError: If the global ``data`` is empty.
    """
    if not data:
        raise ValueError("no training data loaded; call readArff() first")

    class1 = [item for item in data if item[-1] == 'yes']
    class2 = [item for item in data if item[-1] != 'yes']

    class1Probability = _class_score(class1, testData, len(data))
    class2Probability = _class_score(class2, testData, len(data))

    # Compare the two scores to pick the class; ties go to "No".
    if class1Probability > class2Probability:
        print("The result is : Yes")
        return 'yes'
    print("The result is : No")
    return 'no'


# Data preprocessing: bin every row of the data set in place.
# ``data`` is a list of rows: [row1, row2, ...].
def dataPreprocessing1(data):
    """Replace the numeric fields at index 1 and 2 of every row with bins.

    Args:
        data: List of rows; each row is modified in place.
    """
    for item in data:
        dataPreprocessing2(item)


# Data preprocessing: bin a single row in place.
# ``data`` is one row: ['..', '..', ...].
def dataPreprocessing2(data):
    """Replace the numeric fields at index 1 and 2 of one row with bins.

    The fields are parsed with int() and replaced by 'L', 'M' or 'H'
    according to the breakpoints (70, 80) for index 1 and (80, 90) for
    index 2. A value equal to a breakpoint falls into the higher bin.

    Args:
        data: A single row (list of strings); modified in place.
    """
    # The fields are strings, so convert to int before bisecting.
    i = bisect.bisect(_BREAKPOINTS_COL1, int(data[1]))
    data[1] = _BIN_LABELS[i]
    j = bisect.bisect(_BREAKPOINTS_COL2, int(data[2]))
    data[2] = _BIN_LABELS[j]


if __name__ == '__main__':
    # An optional command-line argument overrides the default location.
    if len(sys.argv) > 1:
        fileName = sys.argv[1]
    else:
        fileName = r'C:\Users\Administrator\Desktop\exepirenment\classifill\data\weather.arff'
    readArff(fileName)
    dataPreprocessing1(data)
    testData = ['overcast', '72', '80', 'TRUE']
    dataPreprocessing2(testData)
    bayesion(testData)
The result is : Yes