# R language, C5.0 algorithm: (partial) C5.0 decision tree code in R

# C5.0 is the commercial successor of C4.5: notably faster and with a split
# strategy that is closer to the theoretical formulation.

# Splits are chosen by information entropy / gain ratio. Continuous inputs are
# not handled below; see ChiMerge for discretizing them into categories first.

# To build a binary tree instead, split on Gini or gain ratio over two-way partitions.

# Pruning rule for the finished tree: keep a split only while the weighted
# child error is smaller than the parent error; otherwise prune the branch.

#w1=matrix(sign(rnorm(120)),15,8,1)

# Gain ratio (C4.5/C5.0 split criterion) of every predictor column of x
# against the class vector y.
#
# NOTE(review): this file was recovered from an HTML scrape that swallowed
# every "<-" assignment together with the rest of its line; the assignments
# below are a best-effort reconstruction of the original logic.
#
# Args:
#   x: matrix / data frame of categorical predictors (one column per attribute).
#   y: class label vector with one entry per row of x.
# Returns:
#   numeric vector with one gain ratio per column of x; the sentinel 10000
#   marks a degenerate column whose gain and split information are both zero.
info_entropy=function(x,y)
{
  z=as.matrix(cbind(x,y));
  n=ncol(z);               # last column of z holds the class labels
  m_rat=c();
  for(i in 1:(n-1))
  {
    # contingency table: rows = class levels, cols = levels of attribute i
    m2=ftable(z[,n],z[,i]);
    m_test=m2/rep(colSums(m2),each=nrow(m2));   # conditional probability P(y | x_i)
    # conditional entropy H(y | x_i): per-cell -p*log2(p), weighted by P(x_i)
    m_test2=rep(colSums(m2)/(sum(m2)),each=nrow(m2))*(-m_test*log(base=2,m_test));
    m_test2[which(is.nan(m_test2))]=0;          # 0*log(0) convention -> 0
    m_sel=sum(m_test2);
    # class entropy H(y)
    m_test0=table(z[,n])/nrow(z);
    m_sel2=-m_test0*log(base=2,m_test0);
    m_sel2[which(is.nan(m_sel2))]=0;
    m_sel2=sum(m_sel2);
    # split information H(x_i)
    m_test0=table(z[,i])/nrow(z);
    m_sel3=-m_test0*log(base=2,m_test0);
    m_sel3[which(is.nan(m_sel3))]=0;
    m_sel3=sum(m_sel3);
    # gain ratio = (H(y) - H(y|x_i)) / H(x_i)
    if((m_sel2-m_sel)==0 & m_sel3==0)
    {
      m_rat=append(m_rat,10000);   # original sentinel for a degenerate split
    }
    else
    {
      m_rat=append(m_rat,(m_sel2-m_sel)/(m_sel3+1e-10));
    }
  }
  return(m_rat)
}

# Pessimistic per-node error of predicting y by the majority class inside
# each level of x (C4.5-style upper bound: observed rate + one std. error).
#
# NOTE(review): reconstructed from a scrape that dropped the "<-" assignments.
#
# Args:
#   x: categorical vector defining the node partition.
#   y: class labels, same length as x.
# Returns:
#   numeric vector of pessimistic error rates, one per level of x, each
#   already weighted by that level's share of the observations; sum() of the
#   result is the weighted error of the whole partition.
error_emite=function(x,y)
{
  tab=table(x,y);                # hoisted: original rebuilt table(x,y) per use
  ei=c();
  ei0=apply(tab,1,which.max);    # majority (mode) class per level = prediction
  ei1=rowSums(tab);              # observations per level of x
  for(i in 1:length(ei0))
  {
    ei=append(ei,ei1[i]-tab[i,ei0[i]]);   # misclassified count per level
  }
  ni=ei1;
  # observed error rate plus one binomial standard error (pessimistic estimate)
  error1=ei/ni+sqrt(ei/ni*(1-ei/ni)/ni);
  return(error1*(ni/sum(ni)));
}

#error_emite(m3[,5],m3[,6]);

# base::ifelse vectorizes and so mangles matrix arguments; this drop-in
# replacement returns the whole `yes`/`no` object unchanged.
#
# NOTE(review): signature reconstructed; the scrape dropped the "<-" line.
#
# Args:
#   test: scalar condition.
#   yes:  value returned when test is TRUE (lazily evaluated).
#   no:   value returned otherwise (lazily evaluated).
ifelse1=function(test,yes,no)
{
  if(isTRUE(test))   # safer than the usual test==T against non-logical input
  {
    yes;
  }
  else
  {
    no;
  }
}

# Recursively grow one node of the C5.0-style tree and decide, by comparing
# the parent's leaf error against the weighted child error, whether to keep
# splitting or stop (prune).
#
# NOTE(review): heavily reconstructed — the scrape removed every "<-"
# assignment and truncated two branch conditions; the four-way case split
# below (parent vs. child error x attributes left vs. none) is inferred from
# the surviving print messages. Confirm against the original post if possible.
#
# Args:
#   xn:      predictor matrix for the rows reaching this node.
#   yn:      class labels for those rows.
#   son_fa:  pessimistic leaf error of this node (parent error); Inf at the root.
#   kk:      depth of the parent (0 at the root).
#   choice1: attribute value that routed the rows here (branch label).
son_father=function(xn,yn,son_fa=Inf,kk=0,choice1="root")
{
  kk=kk+1;                 # depth of the node being built
  x=as.matrix(xn);
  y=as.matrix(yn);
  # pick the split attribute with the largest gain ratio
  info_descsion=info_entropy(x,y);
  info_set=which.max(info_descsion);
  info_set2=unique(x[,info_set]);    # attribute levels -> one branch each
  # weighted pessimistic error of the children created by this split
  error_son=sum(error_emite(x[,info_set],y));
  x2=t(t(x[,-info_set]));            # remaining attributes for the subtrees
  nc=ncol(x2);
  if(all(is.na(x2)))
  {
    nc=0;                            # no usable attributes left
  }
  if(son_fa>error_son & nc==0)  # split helps but no attributes remain: leaf
  {
    print(paste("分类树底层结点 , ",info_set," , 父亲偏差:",son_fa,
    " , 孩子偏差:",error_son," 深度:",kk," 分枝条件:",choice1,sep=""));
  }
  else
  {
    if(son_fa<=error_son & nc!=0)  # split does not reduce error: prune here
    {
      print(paste("无子结点 , ",info_set," , 父亲偏差:" ,son_fa,
      ", 孩子偏差:",error_son," 深度:",kk," 分枝条件:","无",sep=""));
    }
    else
    {
      if(son_fa<=error_son & nc==0)  # no gain and nothing left to split on
      {
        print(paste("分类树底层无结点 , ",info_set," , 父亲偏差:" ,son_fa,
        ", 孩子偏差:",error_son," 深度:",kk," 分枝条件:","无",sep=""));
      }
      else
      {
        if(son_fa>error_son & nc!=0)  # split helps: recurse into each branch
        {
          for(i in 1:length(info_set2))
          {
            print(paste("连续结点 , ",info_set," , 父亲偏差:" ,son_fa,
            ", 孩子偏差:",error_son," 深度:",kk," 分枝条件:",info_set2[i],sep=""));
            set0=which(x[,info_set]==info_set2[i]);  # rows taking this branch
            y1=y[set0];
            # keep the matrix shape when the branch holds a single row
            x3=ifelse1(test=(length(set0)==1),yes=t(as.matrix(x2[set0,])),no=x2[set0,]);
            # leaf error of this child: becomes the parent error one level down
            error_father1=sum(error_emite(rep(1,length(y1)),y1));
            fuzhi=son_father(x3,y1,error_father1,kk,info_set2[i]);
          }
        }
      }
    }
  }
}

# Entry point: grow a C5.0-style decision tree from predictors x and labels y,
# printing one line per node as the tree is built/pruned.
#
# NOTE(review): body reconstructed; the scrape truncated the lines that set up
# the root-layer error before recursing.
c501=function(x,y)
{
  x1=as.matrix(x);
  # pessimistic error of the current (root) layer treated as a single leaf
  son_fa0=sum(error_emite(rep(1,length(y)),y));
  son_father(x1,y,son_fa0);
}

# c501(m3[,1:5],m3[,6])

# Demo: 3000 observations of 20 random ±1 columns; the last column is the class.
set.seed(42);   # make the demo reproducible run-to-run
w1=matrix(sign(rnorm(60000)),nrow=3000,ncol=20,byrow=TRUE);  # named args, was positional "1"
c501(w1[,1:19],w1[,20]);

# Related topics: R language, C5.0 algorithm