Naive Bayes
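
Below is a complete Python implementation of a Naive Bayes classifier for nominal data: it reads a CSV file, randomly splits it into training and test sets (roughly 80/20), estimates the class priors and the conditional probabilities with Laplacian smoothing, and classifies the test instances by comparing log-probabilities.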

import random
import numpy as np


# Read the data from a file, return the header and the data (nominal values)
def readNominalData(paraFilename):
    resultData = []
    tempFile = open(paraFilename)
    tempLine = tempFile.readline().replace('\n', '')

    resultNames = tempLine.split(',')

    # print("resultNames ", resultNames)
    # print("resultNames[0] = ", resultNames[0])

    tempLine = tempFile.readline().replace('\n', '')
    while tempLine != '':
        resultData.append(tempLine.split(','))
        tempLine = tempFile.readline().replace('\n', '')

    # print(resultData)

    tempFile.close()
    return resultNames, resultData
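
# The loader above expects a plain CSV layout: one header line of feature names (with the
# class label as the last column), followed by one nominal record per line.
# A hypothetical example, assuming the weather.csv columns and 'P'/'N' class values
# hinted at by the commented-out lines further down:
#
#     Outlook,Temperature,Humidity,Windy,Play
#     Sunny,Hot,High,FALSE,N
#     Overcast,Hot,High,FALSE,P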


# Obtain all values of all features (including the decision) as a matrix
def obtainFeaturesValues(paraDataset):
    resultMatrix = []
    for i in range(len(paraDataset[0])):
        featureValues = [example[i] for example in paraDataset]
        uniqueValues = set(featureValues)
        # print("uniqueValues = ", uniqueValues)
        currentValues = [tempValue for tempValue in uniqueValues]
        # print("currentValues = ", currentValues)
        resultMatrix.append(currentValues)
    # print("The values matrix is: ", resultMatrix)
    return resultMatrix


# Count the number of training instances belonging to each class
def calculateClassCounts(paraData, paraValuesMatrix):
    classCount = {}
    tempNumInstances = len(paraData)

    for i in range(tempNumInstances):
        tempClass = paraData[i][-1]
        if tempClass not in classCount.keys():
            classCount[tempClass] = 0
        classCount[tempClass] += 1

    # Return the counts in the same order as the class values in paraValuesMatrix[-1]
    resultCounts = np.array([classCount[tempValue] for tempValue in paraValuesMatrix[-1]])
    return resultCounts


# Calculate the Laplace-smoothed class (prior) distribution
def calculateClassDistributionLaplacian(paraData, paraValuesMatrix):
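    # P(c) = (count(c) + 1) / (numInstances + numClasses)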
    classCount = {}
    tempNumInstances = len(paraData)
    tempNumClasses = len(paraValuesMatrix[-1])

    for i in range(tempNumInstances):
        tempClass = paraData[i][-1]
        if tempClass not in classCount.keys():
            classCount[tempClass] = 0
        classCount[tempClass] += 1

    resultClassDistribution = []
    for tempValue in paraValuesMatrix[-1]:
        resultClassDistribution.append((classCount[tempValue] + 1.0) / (tempNumInstances + tempNumClasses))

    print("tempNumClasses", tempNumClasses)

    return resultClassDistribution


# Map each nominal value of each feature (and of the class) to its integer index
def calculateMappings(paraValuesMatrix):
    resultMappings = []
    for i in range(len(paraValuesMatrix)):
        tempMapping = {}
        for j in range(len(paraValuesMatrix[i])):
            tempMapping[paraValuesMatrix[i][j]] = j
        resultMappings.append(tempMapping)
    # print("tempMappings", resultMappings)
    return resultMappings


# Calculate the Laplace-smoothed conditional (feature) distributions
def calculateConditionalDistributionsLaplacian(paraData, paraValuesMatrix, paraMappings):
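    # P(feature_j = v | c) = (count(c, j, v) + 1) / (count(c) + numValues_j)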
    tempNumInstances = len(paraData)
    tempNumConditions = len(paraData[0]) - 1
    tempNumClasses = len(paraValuesMatrix[-1])

    # Step 1. Allocate space
    tempCountCubic = []
    resultDistributionsLaplacianCubic = []
    for i in range(tempNumClasses):
        tempMatrix = []
        tempMatrix2 = []
        # Over all conditions
        for j in range(tempNumConditions):
            # Over all values
            tempNumValues = len(paraValuesMatrix[j])
            tempArray = [0.0] * tempNumValues
            tempArray2 = [0.0] * tempNumValues
            tempMatrix.append(tempArray)
            tempMatrix2.append(tempArray2)
        tempCountCubic.append(tempMatrix)
        resultDistributionsLaplacianCubic.append(tempMatrix2)

    # Step 2. Scan the dataset
    for i in range(tempNumInstances):
        tempClass = paraData[i][-1]
        # print("tempClass = ", tempClass)
        tempIntClass = paraMappings[tempNumConditions][tempClass]
        for j in range(tempNumConditions):
            tempValue = paraData[i][j]
            tempIntValue = paraMappings[j][tempValue]
            tempCountCubic[tempIntClass][j][tempIntValue] += 1

    # Step 3. Calculate the real probability with Laplacian
    tempClassCounts = [0] * tempNumClasses
    for i in range(tempNumInstances):
        tempValue = paraData[i][-1]
        tempIntValue = paraMappings[tempNumConditions][tempValue]
        tempClassCounts[tempIntValue] += 1

    for i in range(tempNumClasses):
        # Over all conditions
        for j in range(tempNumConditions):
            for k in range(len(tempCountCubic[i][j])):
                # Laplacian smoothing: the denominator adds the number of values of
                # feature j, not the number of classes
                resultDistributionsLaplacianCubic[i][j][k] = (tempCountCubic[i][j][k] + 1) / (
                            tempClassCounts[i] + len(paraValuesMatrix[j]))

    # print("tempCountCubic", tempCountCubic)
    # print("resultDistributionsLaplacianCubic", resultDistributionsLaplacianCubic)

    return resultDistributionsLaplacianCubic


# Classification: classify the test instances and return the accuracy
def nbClassify(paraTestData, paraValuesMatrix, paraClassValues, paraMappings, paraClassDistribution,
               paraDistributionCubic):
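    # For each instance, predict argmax_c [log P(c) + sum_j log P(x_j | c)];
    # summing logs avoids floating-point underflow from multiplying many small probabilities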
    tempCorrect = 0.0
    tempNumInstances = len(paraTestData)
    tempNumConditions = len(paraTestData[0]) - 1
    tempNumClasses = len(paraValuesMatrix[-1])

    # print("paraMapping", paraMappings)

    # All instances
    for featureVector in paraTestData:
        # print("featureVector[-1]", featureVector[-1])
        tempActualLabel = paraMappings[tempNumConditions][featureVector[-1]]
        # print("tempActualLabel ", tempActualLabel)

        tempBiggest = -np.inf
        tempBest = -1
        for i in range(tempNumClasses):
            tempPseudoProbability = np.log(paraClassDistribution[i])
            for j in range(tempNumConditions):
                tempValue = featureVector[j]
                # Skip feature values that never appeared in the training set
                if tempValue not in paraMappings[j]:
                    continue
                tempIntValue = paraMappings[j][tempValue]
                tempPseudoProbability += np.log(paraDistributionCubic[i][j][tempIntValue])

            if tempBiggest < tempPseudoProbability:
                tempBiggest = tempPseudoProbability
                tempBest = i

        # Is the prediction correct?
        # print("tempBest = {} and tempActualLabel = {}".format(tempBest, tempActualLabel))
        if tempBest == tempActualLabel:
            tempCorrect += 1

    return tempCorrect / tempNumInstances


def mfNBTest(paraFilename):
    # Step 1. Load the dataset
    featureNames, dataset = readNominalData(paraFilename)


    train_dataset = []
    test_dataset = []
    # train_dataset.append(featureNames)
    # test_dataset.append(featureNames)

    # Randomly split the data: roughly 80% for training, 20% for testing
    for i in range(len(dataset)):
        x = random.random()
        if x < 0.8:
            train_dataset.append(dataset[i])
        else:
            test_dataset.append(dataset[i])

    # classValues = ['P', 'N']

    print("featureNames = ", featureNames)
    # print("dataset = ", dataset)

    valuesMatrix = obtainFeaturesValues(train_dataset)
    tempMappings = calculateMappings(valuesMatrix)
    classValues = calculateClassCounts(train_dataset, valuesMatrix)

    classDistribution = calculateClassDistributionLaplacian(train_dataset, valuesMatrix)
    print("classDistribution = ", classDistribution)

    conditionalDistributions = calculateConditionalDistributionsLaplacian(train_dataset, valuesMatrix, tempMappings)
    # print("conditionalDistributions = ", conditionalDistributions)

    # print("valuesMatrix[0][1] = ", valuesMatrix[0][1])

    # featureName = ['Outlook', 'Temperature', 'Humidity', 'Windy']
    # print("Before classification, feature names = ", featureNames)
    tempAccuracy = nbClassify(test_dataset, valuesMatrix, classValues, tempMappings, classDistribution,
                              conditionalDistributions)
    print("The accuracy of NB classifier is {}".format(tempAccuracy))


def main():
    # sklearnNBTest()
    # mfNBTest('weather.csv')
    mfNBTest('./mushroom.csv')
    # readData("weather.csv")


if __name__ == '__main__':
    main()
