朴素贝叶斯——UCI数据集IRIS

采用朴素贝叶斯方法进行学习,原始数据共150组,拿120个作为训练集,另外30个作为测试集合。

对于特征值的处理:

1、离散化。具体做法是:对每个特征,求出训练集上该特征取值的中位数,以它作为阈值把该特征二值化——小于等于阈值记为一种取值,大于阈值记为另一种取值。

2、采用高斯朴素贝叶斯方法进行学习。因为特征值是连续值,假设每个类别下各特征服从高斯分布,用训练集估计每个(类别, 特征)的均值和方差,再以高斯密度函数计算条件概率。

离散化版本:

#include <cstdio>
#include <algorithm>
#include <cstring>
using namespace std;
#define clr(s,t) memset(s,t,sizeof(s));     // fill an array; note the trailing ';' baked into the macro
#define N 1000                              // max number of samples
#define D 30                                // capacity of the label table / per-feature arrays
#define TRAIN 120                           // training-set size
#define TEST 30                             // test-set size
double data[N][5],t[N],bound[D];            // samples; scratch buffer for nth_element; per-feature median thresholds
char kind[N][100],str[D][100];              // raw label string per sample; table of distinct label strings
int out[N],prior[N],condition[10][D][N],len;// class id per sample; class counts; condition[class][feature][side] counts; last label index
int find(char *x){
    // Map a class-label string to a small integer id, registering the label
    // on first sight. Returns the label's index in str[].
    for(int i = 0; i <= len; i++)
        if(!strcmp(str[i], x))
            return i;                       // already registered
    // Guard against overflowing the fixed-capacity label table, which the
    // original code did not check: fall back to the last registered id
    // instead of writing past the end of str[].
    if(len + 1 >= (int)(sizeof(str)/sizeof(str[0])))
        return len;
    strcpy(str[++len], x);                  // register the new label
    return len;
}
void learning(){
    // Fit the discretized model: read the training set from stdin, pick a
    // median threshold for every feature, then count how often each class
    // falls on each side of each threshold.
    clr(condition, 0);
    clr(prior, 0);
    len = -1;
    for(int r = 0; r < TRAIN; r++)
        scanf("%lf,%lf,%lf,%lf,%s\n",&data[r][0],&data[r][1],&data[r][2],&data[r][3],kind[r]);
    // The median of each feature over the training set becomes its
    // binarization threshold (stored in bound[]).
    for(int f = 0; f < 4; f++){
        for(int r = 0; r < TRAIN; r++)
            t[r] = data[r][f];
        nth_element(t, t + TRAIN/2, t + TRAIN);
        bound[f] = t[TRAIN/2];
    }
    // Assign class ids and tally raw class counts (prior numerators).
    for(int r = 0; r < TRAIN; r++){
        int c = find(kind[r]);
        out[r] = c;
        prior[c]++;
    }
    // condition[class][feature][side]: side is 0 at/below the threshold,
    // 1 strictly above it.
    for(int r = 0; r < TRAIN; r++)
        for(int f = 0; f < 4; f++){
            int side = data[r][f] > bound[f] ? 1 : 0;
            condition[out[r]][f][side]++;
        }
}
double compute(int y,int d,double x){       // P(feature d lies on x's side of its threshold | class y)
    int side = (x > bound[d]) ? 1 : 0;      // binarize exactly as in training
    return condition[y][d][side]/(double)prior[y];
}
int guess(double d[]){
    // Predict the class of feature vector d: return the class id maximizing
    // the unnormalized posterior P(class) * prod_j P(side_j | class).
    int i,j,ans = 0;                        // FIX: 'ans' was uninitialized; when every
                                            // posterior is 0 (zero-frequency problem)
                                            // the function returned an indeterminate value
    double res = 0,now;
    for(i = 0;i<=len;i++){
        now = 1;
        for(j = 0;j<4;j++)
            now *= compute(i,j,d[j]);       // conditional term for feature j
        now *= (double)prior[i]/TRAIN;      // class prior
        if(res < now){                      // keep the most probable class seen so far
            res = now;
            ans = i;
        }
    }
    return ans;
}
void classification(){
    // Read TEST held-out samples from stdin, predict each one, print the
    // per-sample result and the overall accuracy.
    int i,j,k,num=0;
    for(i = 0;i<TEST;i++){
        scanf("%lf,%lf,%lf,%lf,%s\n",&data[i][0],&data[i][1],&data[i][2],&data[i][3],kind[i]);
        // FIX: removed a leftover debug statement (printf("Sdf") at i == 10)
        // that corrupted the program's output.
        j = guess(data[i]);                 // predicted class id
        k = find(kind[i]);                  // true class id (assumes the label was seen in training)
        if(j == k)
            num++;
        printf("第%d个我预测为%s,实际为%s\n",i+1,str[j],kind[i]);
    }
    printf("正确率为:%lf\n",(double)num/TEST);
}
int main(){
    // Redirect stdin to the data file; FIX: the freopen result was ignored,
    // so a missing file made the program silently read nothing.
    if(freopen("/Users/hetianjian/Desktop/ML/bayes/in5.txt","r",stdin) == NULL){
        fprintf(stderr, "cannot open input file\n");
        return 1;
    }
    learning();
    classification();
    return 0;
}

高斯分布版本:

#include <cstdio>
#include <algorithm>
#include <cstring>
#include <cmath>
using namespace std;
#define clr(s,t) memset(s,t,sizeof(s));     // fill an array; note the trailing ';' baked into the macro
#define N 1000                              // max number of samples
#define D 30                                // capacity of the label table / per-feature arrays
#define TRAIN 120                           // training-set size
#define TEST 30                             // test-set size
#define PI acos(-1.)                        // pi, computed at each use
double data[N][5],mean[10][D],var[10][D];   // samples; per-(class,feature) mean and variance
char kind[N][100],str[D][100];              // raw label string per sample; table of distinct label strings
int out[N],prior[N],len;                    // class id per sample; class counts; last label index
int find(char *x){
    // Look up label x among the known class labels; append it when new.
    // Returns the label's index in str[].
    int idx = 0;
    while(idx <= len){
        if(strcmp(str[idx], x) == 0)
            return idx;
        ++idx;
    }
    ++len;
    strcpy(str[len], x);
    return len;
}
void learning(){
    // Read TRAIN samples from stdin and fit the Gaussian model: per-class
    // mean/variance for each of the 4 features, plus class counts (priors).
    int i,j;
    clr(prior, 0);
    clr(mean, 0);
    clr(var, 0);
    len = -1;
    for(i = 0;i<TRAIN;i++){
        scanf("%lf,%lf,%lf,%lf,%s\n",&data[i][0],&data[i][1],&data[i][2],&data[i][3],kind[i]);
        j = find(kind[i]);                  // map label string to class id
        out[i] = j;
        prior[j]++;                         // class occurrence count (prior numerator)
    }
    for(i = 0;i<TRAIN;i++)                  // accumulate sum and sum of squares per (class, feature)
        for(j = 0;j<4;j++){
            mean[out[i]][j] += data[i][j];
            var[out[i]][j] += data[i][j]*data[i][j];
        }
    for(i = 0;i<=len;i++)
        for(j = 0;j<4;j++){
            mean[i][j] /= prior[i];
            // biased (maximum-likelihood) variance via E[x^2] - E[x]^2;
            // NOTE(review): numerically less stable than a two-pass formula
            var[i][j] = var[i][j]/prior[i] - mean[i][j]*mean[i][j];
        }
}
double gauss(double x,double u,double var){
    // Probability density of the normal distribution N(u, var) at x.
    const double pi = acos(-1.0);
    const double diff = x - u;
    return exp(-diff * diff / (2.0 * var)) / sqrt(2.0 * pi * var);
}
int guess(double d[]){
    // Predict the class of feature vector d: return the class id maximizing
    // the unnormalized posterior P(class) * prod_j N(d[j]; mean, var).
    int i,j,ans = 0;                        // FIX: 'ans' was uninitialized; if every
                                            // posterior evaluates to 0 the function
                                            // returned an indeterminate value
    double res = 0,now;
    for(i = 0;i<=len;i++){
        now = 1;
        for(j = 0;j<4;j++)
            now *= gauss(d[j],mean[i][j],var[i][j]);  // Gaussian likelihood of feature j
        now *= (double)prior[i]/TRAIN;      // class prior
        if(res < now){                      // keep the most probable class seen so far
            res = now;
            ans = i;
        }
    }
    return ans;
}
void classification(){
    // Evaluate the fitted model on TEST held-out samples read from stdin,
    // printing each prediction and the final accuracy.
    int correct = 0;
    for(int i = 0; i < TEST; i++){
        scanf("%lf,%lf,%lf,%lf,%s\n",&data[i][0],&data[i][1],&data[i][2],&data[i][3],kind[i]);
        int pred = guess(data[i]);          // predicted class id
        int truth = find(kind[i]);          // id of the true label
        if(pred == truth)
            correct++;
        printf("第%d个我预测为%s,实际为%s\n",i+1,str[pred],kind[i]);
    }
    printf("正确率为:%lf\n",(double)correct/TEST);
}
int main(){
    // Redirect stdin to the data file; FIX: the freopen result was ignored,
    // so a missing file made the program silently read nothing.
    if(freopen("/Users/hetianjian/Desktop/ML/bayes/in4.txt","r",stdin) == NULL){
        fprintf(stderr, "cannot open input file\n");
        return 1;
    }
    learning();
    classification();
    return 0;
}


你可能感兴趣的:(朴素贝叶斯——UCI数据集IRIS)