# C5.0 decision tree: the commercial descendant of the decision-tree family, notable for its speed and for a split criterion closer to the information-theoretic ideal
# The information-entropy gain ratio is used to branch the sample data; continuous variables are not handled below, see ChiMerge for converting them into categorical variables
# To build a binary tree instead, consider Gini or the information gain ratio as the split criterion
# Pruning rule for the final tree: if the weighted child error is smaller than the parent error, keep splitting; otherwise prune the branch
#w1=matrix(sign(rnorm(120)),15,8,1)
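# Worked example of the split criterion described above, assuming the usual
# definitions: gain ratio = (H(y) - H(y|x)) / H(x). The names toy_x, toy_y and
# entropy() are illustrative only and are not used by the functions in this file.
toy_x=c(1,1,1,-1,-1,-1);
toy_y=c(1,1,-1,-1,-1,1);
entropy=function(p){p=p[p>0];-sum(p*log(p,base=2))};# entropy of a probability vector
h_y=entropy(table(toy_y)/length(toy_y));# H(y) = 1 here
h_yx=sum(sapply(split(toy_y,toy_x),function(s){length(s)/length(toy_y)*entropy(table(s)/length(s))}));# H(y|x), the weighted child entropies
h_x=entropy(table(toy_x)/length(toy_x));# split information H(x)
print((h_y-h_yx)/h_x);# gain ratio of toy_x, essentially what info_entropy() below returns per attribute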
info_entropy=function(x,y)# returns the information gain ratio of every column of x against the label y
{
#print(x);
#print(y);
z=as.matrix(cbind(x,y));
n=ncol(z);
m_test=c();
m2=c();
m_sel=c();
m_sel2=c();
m_test2=c();
m_sel3=c();
m_rat=c();
for(i in 1:(n-1))
{
m2=c();
m2=ftable(z[,n],z[,i]);# contingency table of the label against attribute i
m_test=m2/rep(colSums(m2),each=nrow(m2));# conditional probabilities P(y | x_i)
m_test2=rep(colSums(m2)/(sum(m2)),each=nrow(m2))*(-m_test*log(base=2,m_test));# conditional-entropy terms P(x_i)*(-P(y|x_i)*log2 P(y|x_i))
m_test2[which(is.nan(m_test2))]=0;# 0*log2(0) terms
# conditional entropy H(y | x_i)
m_sel=sum(m_test2);
# entropy of y
m_test0=table(z[,n])/length(z[,n]);
m_sel2=-m_test0*log(base=2,m_test0);
m_sel2[which(is.nan(m_sel2))]=0;
m_sel2=sum(m_sel2);
# entropy of x_i (the split information)
m_test0=table(z[,i])/length(z[,i]);
m_sel3=-m_test0*log(base=2,m_test0);
m_sel3[which(is.nan(m_sel3))]=0;
m_sel3=sum(m_sel3);
# information gain ratio
if((m_sel2-m_sel)==0 & m_sel3==0)# degenerate attribute (a single value): flag it with a large ratio
{
m_rat=append(m_rat,10000);
}
else
{
#print((m_sel2-m_sel)/m_sel3);
m_rat=append(m_rat,(m_sel2-m_sel)/(m_sel3+1e-10));
#print("m_rat");
#print(m_rat);
}
}
return(m_rat)
}
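# Quick sanity check for info_entropy(), on synthetic data used only here: for
# purely random +/-1 attributes every gain ratio should be near zero, while an
# attribute that equals the label should score close to one.
chk_x=matrix(sign(rnorm(400)),100,4);
chk_y=sign(rnorm(100));
print(info_entropy(chk_x,chk_y));# four small values: no attribute explains chk_y
print(info_entropy(cbind(chk_x,chk_y),chk_y));# the added fifth attribute is chk_y itself, so its ratio is ~1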
error_emite=function(x,y)# pessimistic, size-weighted misclassification error of the nodes defined by grouping x against the labels y
{
ei=c();
ei0=apply(table(x,y),1,which.max);# the majority class of each node is taken as its prediction
ei1=rowSums(table(x,y));# number of samples in each node
#print(paste("length(ei0):",length(ei0)));
for(i in 1:length(ei0))
{
ei=append(ei,ei1[i]-table(x,y)[i,ei0[i]]);# misclassified samples in node i
#print(ei);
}
ni=rowSums(table(x,y));# node sizes
error1=ei/ni+sqrt(ei/ni*(1-ei/ni)/ni);# pessimistic error: observed rate plus one binomial standard error
#print(error1*(ni/sum(ni)));
return(sum(error1*(ni/sum(ni))));# size-weighted error over all nodes
}
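# Small check of the pessimistic node error defined above (observed error rate plus
# one binomial standard error, weighted by node size), on synthetic data used only
# here: node "a" misclassifies 1 of 4 samples, node "b" none, so the returned value
# lies above the raw error rate of 1/8.
chk_g=c("a","a","a","a","b","b","b","b");
chk_l=c(1,1,1,-1,1,1,1,1);
print(error_emite(chk_g,chk_l));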
#error_emite(m3[,5],m3[,6]);
# base ifelse() is vectorised over its condition and does not return whole matrices; simple scalar-test replacement
ifelse1=function(test,yes,no)
{
c=c();
if(test==T)
{
c=yes;
c;
}
else
{ no;}
}
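# Why ifelse1() is needed: base ifelse() is vectorised over its condition, so with a
# single TRUE it returns only the first element of the chosen branch, not the whole
# matrix. m_demo is illustrative only.
m_demo=matrix(1:6,2,3);
print(ifelse(TRUE,m_demo,0));# base ifelse: returns just 1, the first element of m_demo
print(ifelse1(test=TRUE,yes=m_demo,no=0));# ifelse1: returns the full 2x3 matrix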
son_father=function(xn,yn,son_fa=error_emite(rep(1,length(yn)),yn),kk=0,choice1="none")# son_fa defaults to the single-leaf error at the root call
{
kk=kk+1;# one level deeper than the caller
#print("son_fater");
#print(xn);
#print(yn);
x=as.matrix(xn);
y=yn;
info_descsion=c();
info_set=c();
info_set2=c();
error_son=c();
set0=c();
#print("info_entropy(x,y)");
info_descsion=info_entropy(x,y);# gain ratio of every remaining attribute
info_set=which.max(info_descsion);# column index of the best attribute
#print(info_descsion);
#print(info_set);
info_set2=unique(x[,info_set]);# distinct values of the chosen attribute = the branch conditions
x2=t(t(x[, -info_set]));# remaining attributes, kept in matrix form
error_son=error_emite(x[,info_set],y);# weighted error of the children produced by this split
# compare the error of the parent node with that of its children
#print(error_son);
nc=ncol(x2);# number of attributes still available for splitting
#print("x2");
#print(x2);
#print("x");
#print(x[,-info_set]);
#print(info_set2);
#print(is.na(x2));
if(all(is.na(x2)))
{
nc=0;
}
if(son_fa>error_son & nc==0)# the split helps but no attributes remain: report a bottom-level node
{
print(paste("bottom-level node , ",info_set," , parent error:",son_fa,
" , child error:",error_son," depth:",kk," branch condition:",choice1,sep=""));
}
else
{
if(son_fa<=error_son & nc!=0)# the split does not reduce the error: prune, no children
{
print(paste("no child node , ",info_set," , parent error:" ,son_fa,
", child error:",error_son," depth:",kk," branch condition:","none",sep=""));
}
else
{
if(son_fa<=error_son & nc==0)# the split does not help and no attributes remain
{
print(paste("bottom level, no node , ",info_set," , parent error:" ,son_fa,
", child error:",error_son," depth:",kk," branch condition:","none",sep=""));
}
else
{
if(son_fa>error_son & nc!=0)# the split reduces the error and attributes remain: keep branching
{
for(i in 1:length(info_set2))# walk every child branch of the chosen attribute and evaluate its error
{
print(paste("连续结点 , ",info_set," , 父亲偏差:" ,son_fa,
", 孩子偏差:",error_son," 深度:",kk," 分枝条件:",info_set2[i],sep=""));
#print("i");
#print(i);
#print(kk);
#print(x2);
set0=which(x[,info_set]==info_set2[i]);# rows that fall into branch i
#print("info_set2");
#print(info_set2);
y1=y[set0];# labels of branch i
#x2
x3=ifelse1(test=(length(set0)==1),yes=t(as.matrix(x2[set0,])),no=x2[set0,]);# keep matrix shape when the branch holds a single row
error_father1=error_emite(rep(1,length(y1)),y1);# error of branch i treated as a single leaf; it becomes the parent error of the child call
#print(error_father1);
#print("x2");
#print(x2);
#print("y1");
#print(y1);
#print(length(set0));
#print("set0");
#print(set0);
#print(x2);
#ifelse1(test=(length(set0)==1),yes=t(as.matrix(x2[set0,])),no=x2[set0,])
#print(ifelse1(test=length(set0)==1,yes=t(x2[set0,]),no=x2[set0,]));
#print(y1);
fuzhi=son_father(x3,y1,error_father1,kk,info_set2[i]);# recurse: branch i becomes the new parent node
}
}
}
}
}
}
c501=function(x,y)# entry point: x = attribute matrix, y = class labels
{
x1=as.matrix(x);
# compute the error of the current level and branch recursively
son_father(x1,y);
}
# c501(m3[,1:5],m3[,6])
w1=matrix(sign(rnorm(60000)),3000,20,1);
c501(w1[,1:19],w1[,20]);
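# The header notes that continuous attributes are not handled and points to ChiMerge;
# as a simpler stand-in (not ChiMerge itself), continuous columns can be binned into
# quartiles with base R's cut() before calling c501(). w2 and w2_y are synthetic and
# only used for this sketch.
w2=matrix(rnorm(2000),500,4);
w2_y=sign(w2[,1]+rnorm(500));# the class depends mainly on the first attribute
w2_cut=apply(w2,2,function(col)as.integer(cut(col,breaks=quantile(col,probs=seq(0,1,0.25)),include.lowest=TRUE)));# quartile bins per column
c501(w2_cut,w2_y);# the tree should split on attribute 1 first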