学习朴素贝叶斯算法与交叉验证C实现

C语言程序设计课之朴素贝叶斯算法


第一次接触数据挖掘,觉得好有意思啊


测试了泰坦尼克号的数据,只取了年龄和性别这两个预测属性。

测试准确率78%

还是很棒棒的




算法核心:


朴素贝叶斯算法:

贝叶斯公式:

$$P(B \mid A) = \frac{P(A \mid B)\,P(B)}{P(A)}$$

这个公式提供给了我们一个方法

当我们知道在 B 发生的条件下 A 发生的概率 P(A|B) 时,就可以推算出在 A 发生的条件下 B 发生的概率 P(B|A)。

就拿这个泰坦尼克号的训练集来说。

可以算出在死亡下发生年龄和性别的概率和在生存下发生年龄和性别的概率。

之后根据贝叶斯公式可以算出在给定年龄和性别的条件下死亡或者生存的概率

根据样本的情况算出其死亡和生存的值

其较大的就是预测结果


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of folds used by the cross-validation loop in corssTest(). */
#define K 10
/* Number of rows read by getData() and scanned by the counting loops. */
#define DATASIZE 800
/*
 * Data table, one row per passenger. As used by the rest of the file:
 *   column 0 - label compared against the prediction (0 = dead, 1 = survived)
 *   column 1 - sex
 *   column 2 - age bucket produced by changeAge()
 * The array has room for 1000 rows; only the first DATASIZE are filled.
 */
int people[1000][3];

/*
 * Debug helper: print the first n rows of the people table.
 * Each row goes on its own line, with every one of the three columns
 * followed by a single space.
 */
void showme(int n)
{
    int row = 0;

    while (row < n)
    {
        const int *cols = people[row];

        printf("%d %d %d \n", cols[0], cols[1], cols[2]);
        row++;
    }
}

/* Defined further down in this file; declared here so the call in getData()
 * has a prototype in scope (implicit declarations are invalid since C99). */
int changeAge(int x);

/*
 * Load DATASIZE rows of three whitespace-separated integers from the data
 * file into the global people table.
 *
 * After a row is read, its third column (age) is post-processed: a value of
 * 0 is replaced by 30 (presumably 0 marks a missing age in the input file --
 * confirm against the data set) and the result is mapped to an age bucket by
 * changeAge().
 *
 * Returns 0 on success, 1 if the file cannot be opened or if any value
 * cannot be parsed (short or malformed file). On a parse failure the table
 * may be partially filled.
 */
int getData(void)
{
    FILE *fp = fopen("C:\\Users\\XueChuanyu\\Desktop\\test.txt", "r");

    if (fp == NULL)
    {
        printf("cant find the file");
        return 1;
    }

    for (int i = 0; i < DATASIZE; i++)
    {
        for (int k = 0; k < 3; k++)
        {
            /* fscanf returns the number of items converted; anything other
             * than 1 means EOF or a non-numeric token, so stop instead of
             * silently training on zero-filled rows. */
            if (fscanf(fp, "%d", &people[i][k]) != 1)
            {
                printf("bad or truncated data at row %d, column %d\n", i, k);
                fclose(fp);
                return 1;
            }
        }

        /* Substitute the default age before discretizing. */
        if (people[i][2] == 0)
        {
            people[i][2] = 30;
        }
        people[i][2] = changeAge(people[i][2]);
    }

    fclose(fp);

    return 0;
}


/*
 * Discretize an age into one of four buckets:
 *   x < 15        -> 0
 *   15 <= x <= 30 -> 1
 *   30 <  x <= 45 -> 2
 *   x > 45        -> 3
 */
int changeAge(int x)
{
    if (x < 15)
    {
        return 0;
    }
    if (x <= 30)
    {
        return 1;
    }
    if (x <= 45)
    {
        return 2;
    }
    return 3;
}



/*
 * Count the rows of the people table that lie OUTSIDE the half-open range
 * [start, end) -- i.e. rows 0..start-1 and rows end..DATASIZE-1 -- and for
 * which column typeSon equals sampleSon and column typeDad equals sampleDad.
 *
 * The range [start, end) is the fold being held out, so the count is taken
 * over the remaining rows only.
 */
int checker(int start, int end, int typeSon, int typeDad, int sampleSon, int sampleDad)
{
    /* The two row ranges to scan, each as {first, one-past-last}. */
    const int ranges[2][2] = { { 0, start }, { end, DATASIZE } };
    int matches = 0;

    for (int r = 0; r < 2; r++)
    {
        for (int row = ranges[r][0]; row < ranges[r][1]; row++)
        {
            if (people[row][typeSon] == sampleSon && people[row][typeDad] == sampleDad)
            {
                matches++;
            }
        }
    }

    return matches;
}

/*
 * Naive Bayes score for one class, estimated from the rows outside the
 * held-out range [start, end):
 *
 *     P(sex = Sex | class) * P(age = Age | class) * P(class)
 *
 * Dead selects the class by the value stored in column 0 of people
 * (the caller passes 0 for "dead" and 1 for "survived").
 *
 * The score is only meaningful relative to the score of the other class for
 * the same sample. Expects 0 <= start <= end <= DATASIZE.
 *
 * Returns 0.0 when there are no training rows or the class never occurs in
 * them, instead of producing NaN from a 0/0 division.
 */
double kernel(int start, int end, int Dead, int Sex, int Age)
{
    /* Rows actually counted by checker(): everything except [start, end). */
    int trainSize = start + (DATASIZE - end);
    double amountDead = checker(start, end, 0, 0, Dead, Dead);

    if (trainSize <= 0 || amountDead <= 0)
    {
        return 0.0;
    }

    double amountSex = checker(start, end, 1, 0, Sex, Dead);
    double amountAge = checker(start, end, 2, 0, Age, Dead);

    /* The prior uses the training-set size, not DATASIZE, because the
     * held-out fold is excluded from every count above. */
    return (amountSex / amountDead) * (amountAge / amountDead) * (amountDead / trainSize);
}

/*
 * Predict the label for a sample with the given sex and age bucket, using
 * the rows outside [start, end) as training data.
 *
 * Returns 0 when the score for class 0 is strictly greater than the score
 * for class 1, and 1 otherwise (ties go to 1).
 */
int compare(int start, int end, int sex, int age)
{
    const double scoreDead = kernel(start, end, 0, sex, age);
    const double scoreAlive = kernel(start, end, 1, sex, age);

    return (scoreDead > scoreAlive) ? 0 : 1;
}

/*
 * K-fold cross-validation over the first DATASIZE rows of people.
 *
 * The rows are split into K consecutive folds of DATASIZE / K rows. Each
 * fold in turn is held out, every row in it is classified with compare()
 * using the remaining rows, and the fraction of correct predictions is
 * recorded. The mean of the K per-fold fractions is printed to stdout.
 */
void corssTest()
{
    const int foldSize = DATASIZE / K;
    double accuracySum = 0;

    for (int fold = 0; fold < K; fold++)
    {
        const int first = fold * foldSize;
        const int last = first + foldSize;
        int hits = 0;

        for (int row = first; row < last; row++)
        {
            int predicted = compare(first, last, people[row][1], people[row][2]);

            if (people[row][0] == predicted)
            {
                hits++;
            }
        }

        accuracySum += (double)hits / (double)foldSize;
    }

    printf("The accuracy of this mode is %lf", accuracySum / K);
}

/*
 * Entry point: load the data set, then run the cross-validation and print
 * the resulting accuracy.
 *
 * Returns 0 on success and 1 if the data could not be loaded, so the
 * evaluation is never run on an empty table.
 */
int main(void)
{
    if (getData() != 0)
    {
        return 1;
    }

    corssTest();

    return 0;
}


你可能感兴趣的:(DataMining学习笔记)