对于特征值的处理:
1、离散化。具体做法是:找到每个特征值的中位数,把它当作阈值,小于等于阈值与大于阈值分别视为两个不同的离散取值。
2、采用朴素贝叶斯方法进行学习。因为特征值是连续值,假设每个特征在各类别下都服从高斯分布,用高斯密度函数来估计类条件概率。
离散化版本:
#include <cstdio> #include <algorithm> #include <cstring> using namespace std; #define clr(s,t) memset(s,t,sizeof(s)); #define N 1000 #define D 30 #define TRAIN 120 #define TEST 30 double data[N][5],t[N],bound[D]; char kind[N][100],str[D][100]; int out[N],prior[N],condition[10][D][N],len; int find(char *x){ int i; for(i = 0;i<=len;i++) if(!strcmp(str[i], x)) return i; strcpy(str[++len], x); return len; } void learning(){ int i,j; clr(condition, 0); clr(prior, 0); len = -1; for(i = 0;i<TRAIN;i++) scanf("%lf,%lf,%lf,%lf,%s\n",&data[i][0],&data[i][1],&data[i][2],&data[i][3],kind[i]); for(i = 0;i<4;i++){ //将每个属性二分,记录在bound数组 for(j = 0;j<TRAIN;j++) t[j] = data[j][i]; nth_element(t ,t+TRAIN/2, t+TRAIN); bound[i] = t[TRAIN/2]; } for(i = 0;i<TRAIN;i++){ //统计类标记的数量,计算先验概率 j = find(kind[i]); out[i] = j; prior[j]++; } for(i = 0;i<TRAIN;i++) for(j = 0;j<4;j++){ int k = data[i][j]>bound[j]; //如果小于相应的bound,记为0,否则为1 condition[out[i]][j][k]++; } } double compute(int y,int d,double x){ //标记为y的情况下,第d个特征值为x的条件概率 return (double)condition[y][d][x>bound[d]]/prior[y]; } int guess(double d[]){ int i,j,ans; double res = 0,now; for(i = 0;i<=len;i++){ now = 1; for(j = 0;j<4;j++) now *= compute(i,j,d[j]); now *= (double)prior[i]/TRAIN; if(res < now){ //找到概率最大的那个作为预测 res = now; ans = i; } } return ans; } void classification(){ int i,j,k,num=0; for(i = 0;i<TEST;i++){ scanf("%lf,%lf,%lf,%lf,%s\n",&data[i][0],&data[i][1],&data[i][2],&data[i][3],kind[i]); if(i == 10) printf("Sdf"); j = guess(data[i]); k = find(kind[i]); if(j == k) num++; printf("第%d个我预测为%s,实际为%s\n",i+1,str[j],kind[i]); } printf("正确率为:%lf\n",(double)num/TEST); } int main(){ freopen("/Users/hetianjian/Desktop/ML/bayes/in5.txt","r",stdin); learning(); classification(); return 0; }
#include <cstdio> #include <algorithm> #include <cstring> #include <cmath> using namespace std; #define clr(s,t) memset(s,t,sizeof(s)); #define N 1000 #define D 30 #define TRAIN 120 #define TEST 30 #define PI acos(-1.) double data[N][5],mean[10][D],var[10][D]; char kind[N][100],str[D][100]; int out[N],prior[N],len; int find(char *x){ for(int i = 0;i<=len;i++) if(!strcmp(str[i], x)) return i; strcpy(str[++len], x); return len; } void learning(){ int i,j; clr(prior, 0); clr(mean, 0); clr(var, 0); len = -1; for(i = 0;i<TRAIN;i++){ scanf("%lf,%lf,%lf,%lf,%s\n",&data[i][0],&data[i][1],&data[i][2],&data[i][3],kind[i]); j = find(kind[i]); //标记训练集的类别 out[i] = j; prior[j]++; //类别的先验概率 } for(i = 0;i<TRAIN;i++) //计算均值和方差:mean[i][j]表示输出类别为i,特征j的均值,var为方差 for(j = 0;j<4;j++){ mean[out[i]][j] += data[i][j]; var[out[i]][j] += data[i][j]*data[i][j]; } for(i = 0;i<=len;i++) for(j = 0;j<4;j++){ mean[i][j] /= prior[i]; var[i][j] = var[i][j]/prior[i] - mean[i][j]*mean[i][j]; } } double gauss(double x,double u,double var){//高斯函数 return exp(-(double)(x-u)*(x-u)/2./var)/sqrt(2.*PI*var); } int guess(double d[]){ int i,j,ans; double res = 0,now; for(i = 0;i<=len;i++){ now = 1; for(j = 0;j<4;j++) now *= gauss(d[j],mean[i][j],var[i][j]); now *= (double)prior[i]/TRAIN; if(res < now){ //找到概率最大的那个作为预测 res = now; ans = i; } } return ans; } void classification(){ int i,j,k,num=0; for(i = 0;i<TEST;i++){ scanf("%lf,%lf,%lf,%lf,%s\n",&data[i][0],&data[i][1],&data[i][2],&data[i][3],kind[i]); j = guess(data[i]); k = find(kind[i]); if(j == k) num++; printf("第%d个我预测为%s,实际为%s\n",i+1,str[j],kind[i]); } printf("正确率为:%lf\n",(double)num/TEST); } int main(){ freopen("/Users/hetianjian/Desktop/ML/bayes/in4.txt","r",stdin); learning(); classification(); return 0; }