该文转自http://blog.csdn.net/yangliuy/article/details/7322015
数据挖掘课上面老师介绍了下决策树ID3算法,我抽空余时间把这个算法用C++实现了一遍。
决策树算法是非常常用的分类算法,是逼近离散目标函数的方法,学习得到的函数以决策树的形式表示。其基本思路是不断选取产生信息增益最大的属性(对于属性选择有多种方法)来划分样例集合,构造决策树。信息增益定义为结点与其子结点的信息熵之和的差。信息熵是香农提出的,用于描述信息不纯度(不稳定性),其计算公式是 $Entropy(S) = -\sum_{i=1}^{c} p_i \log_2 p_i$。
$p_i$ 为集合中第 $i$ 类样例(对二元分类而言即正样例和负样例)所占的比例。这样信息增益可以定义为样本按照某属性划分时造成熵减少的期望,衡量了该属性区分训练样本中正负样本的能力,其计算公式是 $Gain(S,A) = Entropy(S) - \sum_{v \in Values(A)} \frac{|S_v|}{|S|} Entropy(S_v)$。
我实现该算法针对的样例集合如下
该表记录了在不同气候条件下是否去打球的情况,要求根据该表用程序输出决策树
C++代码如下,程序中有详细注释
[cpp] view plain copy print ?
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <cmath>
#include <cstdlib>
using namespace std;
#define MAXLEN 6 // tokens per input row: Day + 4 attributes + PlayTennis label

// Global training data shared by every routine below.
vector<vector<string>> state;      // whole table: header row [0] + sample rows
vector<string> item(MAXLEN);       // scratch buffer for one parsed input row
vector<string> attribute_row;      // copy of the header row (attribute names)
// NOTE(review): unqualified use of `end` is ambiguous with std::end under
// C++11+ together with `using namespace std;` — compare against the literal
// "end" instead of this variable.
string end("end");                 // sentinel token terminating the input
string yes("yes");                 // positive class label
string no("no");                   // negative class label
string blank("");                  // empty-string marker for fresh nodes
// attribute name -> list of its distinct values observed in the samples
map<string, vector<string>> map_attribute_values;
int tree_size = 0;                 // node count, accumulated by FreeTree
- struct Node{
- string attribute;
- string arrived_value;
- vector childs;
- Node(){
- attribute = blank;
- arrived_value = blank;
- }
- };
- Node * root;
-
- void ComputeMapFrom2DVector(){
- unsigned int i,j,k;
- bool exited = false;
- vector values;
- for(i = 1; i < MAXLEN-1; i++){
- for (j = 1; j < state.size(); j++){
- for (k = 0; k < values.size(); k++){
- if(!values[k].compare(state[j][i])) exited =true;
- }
- if(!exited){
- values.push_back(state[j][i]);
- }
- exited = false;
- }
- map_attribute_values[state[0][i]] = values;
- values.erase(values.begin(), values.end());
- }
- }
-
- double ComputeEntropy(vector > remain_state, string attribute, string value,bool ifparent){
- vector<int> count (2,0);
- unsigned int i,j;
- bool done_flag = false;
- for(j = 1; j < MAXLEN; j++){
- if(done_flag) break;
- if(!attribute_row[j].compare(attribute)){
- for(i = 1; i < remain_state.size(); i++){
- if((!ifparent&&!remain_state[i][j].compare(value)) || ifparent){
- if(!remain_state[i][MAXLEN - 1].compare(yes)){
- count[0]++;
- }
- else count[1]++;
- }
- }
- done_flag = true;
- }
- }
- if(count[0] == 0 || count[1] == 0 )return 0;
- double sum = count[0] + count[1];
- double entropy = -count[0]/sum*log(count[0]/sum)/log(2.0) - count[1]/sum*log(count[1]/sum)/log(2.0);
- return entropy;
- }
-
- double ComputeGain(vector > remain_state, string attribute){
- unsigned int j,k,m;
-
- double parent_entropy = ComputeEntropy(remain_state, attribute, blank,true);
- double children_entropy = 0;
-
- vector values = map_attribute_values[attribute];
- vector<double> ratio;
- vector<int> count_values;
- int tempint;
- for(m = 0; m < values.size(); m++){
- tempint = 0;
- for(k = 1; k < MAXLEN - 1; k++){
- if(!attribute_row[k].compare(attribute)){
- for(j = 1; j < remain_state.size(); j++){
- if(!remain_state[j][k].compare(values[m])){
- tempint++;
- }
- }
- }
- }
- count_values.push_back(tempint);
- }
- for(j = 0; j < values.size(); j++){
- ratio.push_back((double)count_values[j] / (double)(remain_state.size()-1));
- }
- double temp_entropy;
- for(j = 0; j < values.size(); j++){
- temp_entropy = ComputeEntropy(remain_state, attribute, values[j], false);
- children_entropy += ratio[j] * temp_entropy;
- }
- return (parent_entropy - children_entropy);
- }
- int FindAttriNumByName(string attri){
- for(int i = 0; i < MAXLEN; i++){
- if(!state[0][i].compare(attri)) return i;
- }
- cerr<<"can't find the numth of attribute"<
- return 0;
- }
-
- string MostCommonLabel(vector > remain_state){
- int p = 0, n = 0;
- for(unsigned i = 0; i < remain_state.size(); i++){
- if(!remain_state[i][MAXLEN-1].compare(yes)) p++;
- else n++;
- }
- if(p >= n) return yes;
- else return no;
- }
-
- bool AllTheSameLabel(vector > remain_state, string label){
- int count = 0;
- for(unsigned int i = 0; i < remain_state.size(); i++){
- if(!remain_state[i][MAXLEN-1].compare(label)) count++;
- }
- if(count == remain_state.size()-1)returntrue;
- else returnfalse;
- }
-
-
-
-
-
- Node * BulidDecisionTreeDFS(Node * p, vector > remain_state, vector remain_attribute){
-
-
- if (p == NULL)
- p = new Node();
-
- if (AllTheSameLabel(remain_state, yes)){
- p->attribute = yes;
- return p;
- }
- if (AllTheSameLabel(remain_state, no)){
- p->attribute = no;
- return p;
- }
- if(remain_attribute.size() == 0){
- string label = MostCommonLabel(remain_state);
- p->attribute = label;
- return p;
- }
- double max_gain = 0, temp_gain;
- vector ::iterator max_it = remain_attribute.begin();
- vector ::iterator it1;
- for(it1 = remain_attribute.begin(); it1 < remain_attribute.end(); it1++){
- temp_gain = ComputeGain(remain_state, (*it1));
- if(temp_gain > max_gain) {
- max_gain = temp_gain;
- max_it = it1;
- }
- }
- vector new_attribute;
- vector > new_state;
- for(vector ::iterator it2 = remain_attribute.begin(); it2 < remain_attribute.end(); it2++){
- if((*it2).compare(*max_it)) new_attribute.push_back(*it2);
- }
-
- p->attribute = *max_it;
- vector values = map_attribute_values[*max_it];
- int attribue_num = FindAttriNumByName(*max_it);
- new_state.push_back(attribute_row);
- for(vector ::iterator it3 = values.begin(); it3 < values.end(); it3++){
- for(unsigned int i = 1; i < remain_state.size(); i++){
- if(!remain_state[i][attribue_num].compare(*it3)){
- new_state.push_back(remain_state[i]);
- }
- }
- Node * new_node = new Node();
- new_node->arrived_value = *it3;
- if(new_state.size() == 0){
- new_node->attribute = MostCommonLabel(remain_state);
- }
- else
- BulidDecisionTreeDFS(new_node, new_state, new_attribute);
- p->childs.push_back(new_node);
- new_state.erase(new_state.begin()+1,new_state.end());
- }
- return p;
- }
- void Input(){
- string s;
- while(cin>>s,s.compare(end) != 0){
- item[0] = s;
- for(int i = 1;i < MAXLEN; i++){
- cin>>item[i];
- }
- state.push_back(item);
- }
- for(int j = 0; j < MAXLEN; j++){
- attribute_row.push_back(state[0][j]);
- }
- }
- void PrintTree(Node *p, int depth){
- for (int i = 0; i < depth; i++) cout <<'\t';
- if(!p->arrived_value.empty()){
- cout<arrived_value<
- for (int i = 0; i < depth+1; i++) cout <<'\t';
- }
- cout<attribute<
- for (vector::iterator it = p->childs.begin(); it != p->childs.end(); it++){
- PrintTree(*it, depth + 1);
- }
- }
- void FreeTree(Node *p){
- if (p == NULL)
- return;
- for (vector::iterator it = p->childs.begin(); it != p->childs.end(); it++){
- FreeTree(*it);
- }
- delete p;
- tree_size++;
- }
- int main(){
- Input();
- vector remain_attribute;
- string outlook("Outlook");
- string Temperature("Temperature");
- string Humidity("Humidity");
- string Wind("Wind");
- remain_attribute.push_back(outlook);
- remain_attribute.push_back(Temperature);
- remain_attribute.push_back(Humidity);
- remain_attribute.push_back(Wind);
- vector > remain_state;
- for(unsigned int i = 0; i < state.size(); i++){
- remain_state.push_back(state[i]);
- }
- ComputeMapFrom2DVector();
- root = BulidDecisionTreeDFS(root,remain_state,remain_attribute);
- cout<<"the decision tree is :"<
- PrintTree(root,0);
- FreeTree(root);
- cout<
- cout<<"tree_size:"<
- return 0;
- }
#include
#include
#include
#include
输入的训练数据如下
[plain] view plain copy print ?
- Day Outlook Temperature Humidity Wind PlayTennis
- 1 Sunny Hot High Weak no
- 2 Sunny Hot High Strong no
- 3 Overcast Hot High Weak yes
- 4 Rainy Mild High Weak yes
- 5 Rainy Cool Normal Weak yes
- 6 Rainy Cool Normal Strong no
- 7 Overcast Cool Normal Strong yes
- 8 Sunny Mild High Weak no
- 9 Sunny Cool Normal Weak yes
- 10 Rainy Mild Normal Weak yes
- 11 Sunny Mild Normal Strong yes
- 12 Overcast Mild High Strong yes
- 13 Overcast Hot Normal Weak yes
- 14 Rainy Mild High Strong no
- end
Day Outlook Temperature Humidity Wind PlayTennis
1 Sunny Hot High Weak no
2 Sunny Hot High Strong no
3 Overcast Hot High Weak yes
4 Rainy Mild High Weak yes
5 Rainy Cool Normal Weak yes
6 Rainy Cool Normal Strong no
7 Overcast Cool Normal Strong yes
8 Sunny Mild High Weak no
9 Sunny Cool Normal Weak yes
10 Rainy Mild Normal Weak yes
11 Sunny Mild Normal Strong yes
12 Overcast Mild High Strong yes
13 Overcast Hot Normal Weak yes
14 Rainy Mild High Strong no
end
程序输出决策树如下
可以用图形表示为
有了决策树后,就可以根据气候条件做预测了
例如如果气候数据是{Sunny,Cool,Normal,Strong} ,根据决策树走到对应的yes叶节点,可以判定会去打球。
另外在编写这个程序时在数据结构的设计上面走了弯路,多叉树的实现有很多方法,本算法采用每个结点的所有孩子用vector保存比较合适,同时注意维护剩余样例和剩余属性信息,建树时横向遍历靠循环属性的值,纵向遍历靠递归调用 ,总体是DFS,树和图的遍历在编程时经常遇到,得熟练掌握。程序有些地方的效率还得优化,有不足的地方还望大家拍砖。