import random

import numpy as np


def readNominalData(paraFilename):
    """Read a CSV file of nominal (categorical) values.

    :param paraFilename: path to a CSV file whose first row is the header.
    :return: (featureNames, data) where featureNames is the list of column
        names and data is a list of instances (each a list of strings);
        the last column of each instance is the class label.
    """
    resultData = []
    # 'with' guarantees the file handle is closed even if reading fails.
    with open(paraFilename) as tempFile:
        tempLine = tempFile.readline().replace('\n', '')
        resultNames = tempLine.split(',')
        tempLine = tempFile.readline().replace('\n', '')
        while tempLine != '':
            resultData.append(tempLine.split(','))
            tempLine = tempFile.readline().replace('\n', '')
    return resultNames, resultData


def obtainFeaturesValues(paraDataset):
    """Collect the distinct values of every column (conditions + class).

    :param paraDataset: list of instances; the last column is the class.
    :return: list of lists; element j holds the distinct values of column j.
    """
    resultMatrix = []
    for i in range(len(paraDataset[0])):
        featureValues = [example[i] for example in paraDataset]
        # set() removes duplicates; the (arbitrary) order is consistent with
        # the index mappings later built from this same matrix.
        resultMatrix.append(list(set(featureValues)))
    return resultMatrix


def calculateClassCounts(paraData, paraValuesMatrix):
    """Count how many instances belong to each class.

    :param paraValuesMatrix: kept for interface compatibility; unused.
    :return: dict mapping class value -> instance count.
        (The original wrapped the dict in np.array(), which yields a
        useless 0-d object array; the dict itself is returned instead.)
    """
    classCount = {}
    for instance in paraData:
        tempClass = instance[-1]
        classCount[tempClass] = classCount.get(tempClass, 0) + 1
    return classCount


def calculateClassDistributionLaplacian(paraData, paraValuesMatrix):
    """Compute the Laplacian-smoothed class prior P(class).

    :return: list aligned with paraValuesMatrix[-1]; entry k is
        (count_k + 1) / (numInstances + numClasses).
    """
    tempNumInstances = len(paraData)
    tempNumClasses = len(paraValuesMatrix[-1])
    classCount = {}
    for instance in paraData:
        tempClass = instance[-1]
        classCount[tempClass] = classCount.get(tempClass, 0) + 1
    # .get(value, 0) tolerates a class value that never occurs in paraData.
    resultClassDistribution = [
        (classCount.get(tempValue, 0) + 1.0) / (tempNumInstances + tempNumClasses)
        for tempValue in paraValuesMatrix[-1]]
    return resultClassDistribution


def calculateMappings(paraValuesMatrix):
    """Map each feature value to its index within its column's value list.

    :return: list of dicts; element j maps value -> index for column j.
    """
    resultMappings = []
    for currentValues in paraValuesMatrix:
        resultMappings.append({value: j for j, value in enumerate(currentValues)})
    return resultMappings


def calculateConditionalDistributionsLaplacian(paraData, paraValuesMatrix, paraMappings):
    """Compute the Laplacian-smoothed conditionals P(value_j | class).

    :return: cubic list indexed [class][condition][value index].

    Bug fix: the Laplace denominator must add the number of DISTINCT VALUES
    of feature j (so the distribution over that feature sums to 1), not the
    number of classes as the original code did.
    """
    tempNumInstances = len(paraData)
    tempNumConditions = len(paraData[0]) - 1
    tempNumClasses = len(paraValuesMatrix[-1])

    # Step 1. Allocate one count matrix and one probability matrix per class.
    tempCountCubic = []
    resultDistributionsLaplacianCubic = []
    for i in range(tempNumClasses):
        tempCountCubic.append(
            [[0.0] * len(paraValuesMatrix[j]) for j in range(tempNumConditions)])
        resultDistributionsLaplacianCubic.append(
            [[0.0] * len(paraValuesMatrix[j]) for j in range(tempNumConditions)])

    # Step 2. Count (class, feature, value) co-occurrences and class sizes
    # in a single scan (the original scanned the dataset twice).
    tempClassCounts = [0] * tempNumClasses
    for i in range(tempNumInstances):
        tempIntClass = paraMappings[tempNumConditions][paraData[i][-1]]
        tempClassCounts[tempIntClass] += 1
        for j in range(tempNumConditions):
            tempIntValue = paraMappings[j][paraData[i][j]]
            tempCountCubic[tempIntClass][j][tempIntValue] += 1

    # Step 3. Laplacian correction: (count + 1) / (classCount + |values_j|).
    for i in range(tempNumClasses):
        for j in range(tempNumConditions):
            tempNumValues = len(paraValuesMatrix[j])
            for k in range(tempNumValues):
                resultDistributionsLaplacianCubic[i][j][k] = (
                    (tempCountCubic[i][j][k] + 1)
                    / (tempClassCounts[i] + tempNumValues))
    return resultDistributionsLaplacianCubic


def nbClassify(paraTestData, paraValuesMatrix, paraClassValues, paraMappings,
               paraClassDistribution, paraDistributionCubic):
    """Classify every test instance and return the accuracy.

    :param paraClassValues: kept for interface compatibility; unused.
    :return: fraction of correctly classified instances (0.0 for empty data).
    """
    if not paraTestData:
        return 0.0  # avoid ZeroDivisionError on an empty test split
    tempCorrect = 0.0
    tempNumInstances = len(paraTestData)
    tempNumConditions = len(paraTestData[0]) - 1
    tempNumClasses = len(paraValuesMatrix[-1])
    for featureVector in paraTestData:
        # .get(..., -1): a class label unseen in training can never match.
        tempActualLabel = paraMappings[tempNumConditions].get(featureVector[-1], -1)
        tempBiggest = -np.inf
        tempBest = -1
        for i in range(tempNumClasses):
            # Work in log space to avoid underflow of the probability product.
            tempPseudoProbability = np.log(paraClassDistribution[i])
            for j in range(tempNumConditions):
                tempIntValue = paraMappings[j].get(featureVector[j])
                if tempIntValue is None:
                    continue  # skip feature values unseen during training
                tempPseudoProbability += np.log(
                    paraDistributionCubic[i][j][tempIntValue])
            if tempBiggest < tempPseudoProbability:
                tempBiggest = tempPseudoProbability
                tempBest = i
        if tempBest == tempActualLabel:
            tempCorrect += 1
    return tempCorrect / tempNumInstances


def mfNBTest(paraFilename):
    """Train and evaluate the NB classifier on a CSV file (random 80/20 split)."""
    featureNames, dataset = readNominalData(paraFilename)
    train_dataset = []
    test_dataset = []
    for instance in dataset:
        # 'else' (not 'elif x > 0.8') ensures no instance is silently dropped.
        if random.random() < 0.8:
            train_dataset.append(instance)
        else:
            test_dataset.append(instance)
    print("featureNames = ", featureNames)
    valuesMatrix = obtainFeaturesValues(train_dataset)
    tempMappings = calculateMappings(valuesMatrix)
    classValues = calculateClassCounts(train_dataset, valuesMatrix)
    classDistribution = calculateClassDistributionLaplacian(train_dataset, valuesMatrix)
    print("classDistribution = ", classDistribution)
    conditionalDistributions = calculateConditionalDistributionsLaplacian(
        train_dataset, valuesMatrix, tempMappings)
    tempAccuracy = nbClassify(test_dataset, valuesMatrix, classValues,
                              tempMappings, classDistribution,
                              conditionalDistributions)
    print("The accuracy of NB classifier is {}".format(tempAccuracy))


def main():
    mfNBTest('./mushroom.csv')


# Guarded so importing this module does not trigger a training run.
if __name__ == "__main__":
    main()