决策树进行多属性分类 C++实现

#include"stdafx.h"
#include "stdlib.h"
#include
#include
#include
#include
#include
#include
#include 
using namespace std;
#define MAXLEN 6//定义的属性个数 
ifstream fin("data.txt");
ofstream fout("result.txt");

//自定义的决策树节点
struct Node
{
	string attribute;//属性值
	string arrived_value;//到达的属性值
	vector childs;//所有的分差节点,vector装入 
	Node()
	{
		// 初始化的时候均设为空值 
		attribute = "";
		arrived_value = "";
	}
}*root;

vector  > state;//保存整个实例集
vector  item(MAXLEN);//对应实例集中的某一行
vector  attribute_row;//保存首行即属性行数据
string attrName[MAXLEN] = { "RID", "model", "perfume", "price", "income", "buyaction" };
map > map_attribute_values;//存储属性对应的所有的值
int tree_size = 0;

// 将数据以文件流的形式输入 
void DataInit()
{

	string s;
	int i, j;
	while (fin >> s, s.compare("end") != 0)
	{
		//当输入结束时s.compare("end")的值为-1 
		item[0] = s;	//第一个值保存的是RID 
		for (int i = 1; i < MAXLEN; i++)
		{
			fin >> item[i];
		}
		state.push_back(item);
	}
	for (j = 0; j < MAXLEN; j++)
	{
		attribute_row.push_back(attrName[j]);	// 初始化属性栏值 
	}
	fin.close();
}

// log2为底通过换底公式换成自然数底数
double lg2(double n)
{
	return log(n) / log(2);
}

//根据数据实例存储属性对应的所有的值
void MapAttributeValue()
{
	unsigned int i, j, k;
	bool exited = false;
	vector values;
	//按照列遍历(每个属性对应一列)
	for (i = 1; i < MAXLEN - 1; i++)
	{
		for (j = 0; j < state.size(); j++)
		{
			for (k = 0; k < values.size(); k++)
			{

				//如果该属性值已经存在则退出循环 
				if (!values[k].compare(state[j][i]))
				{
					exited = true;
					break;
				}
			}
			//如果循环后的exited值仍为false说明该属性值并未出现过
			//将其加入到暂时存放的向量中 
			if (!exited)
			{
				values.push_back(state[j][i]);
			}
			exited = false;	//注意此处重置值 
		}
		map_attribute_values[attrName[i]] = values;	//相对应的放入 

		values.clear();	//为了存放下一次的属性值清空 
	}
}

//根据具体属性和值来计算熵
double Entropy(vector  > remain_state, string attribute, string value, bool ifparent)
{
	vector count(2, 0);
	int i, j;
	bool flag = false;
	for (j = 1; j < MAXLEN; j++)
	{
		if (flag) break;
		if (!attribute_row[j].compare(attribute))
		{
			for (i = 1; i < remain_state.size(); i++)
			{
				if ((!ifparent&&!remain_state[i][j].compare(value)) || ifparent)
				{//ifparent记录是否算父节点
					if (!remain_state[i][MAXLEN - 1].compare("yes"))
					{
						count[0]++;	//count[0]记录实例中yes的值 
					}
					else count[1]++;//count[1]记录实例中no的值 
				}
			}
			flag = true;
		}
	}
	//全部是正实例或者负实例
	if (count[0] == 0 || count[1] == 0) return 0;
	//具体计算熵
	double sum = count[0] + count[1];
	double entropy = -count[0] / sum * lg2((double)count[0] / sum)
		- count[1] / sum * lg2((double)count[1] / sum);
	return entropy;
}

//计算按照属性attribute划分当前剩余实例的信息增益
double Gain(vector  > remain_state, string attribute)
{
	unsigned int j, k, m;
	//首先求不做划分时的熵,即根的熵值 
	double parent_entropy = Entropy(remain_state, attribute, "", true);
	double children_entropy = 0;
	//然后求做划分后各个值的熵
	vector values = map_attribute_values[attribute];
	vector ratio;//存放每一个属性值在实例中的期望 
	vector count_values;
	int tempint;//存放出现的次数 
	for (m = 0; m < values.size(); m++)
	{
		tempint = 0;
		for (k = 1; k < MAXLEN - 1; k++)
		{
			if (!attribute_row[k].compare(attribute))
			{
				for (j = 1; j < remain_state.size(); j++)
				{
					if (!remain_state[j][k].compare(values[m]))
					{
						tempint++;
					}
				}
			}
		}
		count_values.push_back(tempint);
	}

	for (j = 0; j < values.size(); j++)
	{
		ratio.push_back((double)count_values[j] / (double)(remain_state.size() - 1));
	}
	double temp_entropy;
	for (j = 0; j < values.size(); j++)
	{
		temp_entropy = Entropy(remain_state, attribute, values[j], false);
		children_entropy += ratio[j] * temp_entropy;	//权值的累加 
	}
	return (parent_entropy - children_entropy);
}

//根据属性名查找到对应的AttriNum 
int FindAttriNumByName(string attri)
{
	for (int i = 0; i < MAXLEN; i++)
	{
		if (!attrName[i].compare(attri)) return i;
	}
	cerr << "can't find the numth of attribute" << endl;
	return 0;
}

//找出样例中占多数的正/负性
string MostCommonLabel(vector  > remain_state)
{
	int y = 0, n = 0;
	int i;
	for (i = 0; i < remain_state.size(); i++)
	{
		if (!remain_state[i][MAXLEN - 1].compare("yes")) y++;
		else n++;
	}
	if (y >= n) return "yes";
	else return "no";
}

//若子集只含有单个属性,则分支为叶子节点
//判断样例是否正负性都一致,即是否到达叶子节点 
bool IsReachLeaveNode(vector  > remain_state, string label)
{
	int count = 0, i;
	for (i = 0; i < remain_state.size(); i++)
	{
		if (!remain_state[i][MAXLEN - 1].compare(label)) count++;
	}
	if (count == remain_state.size() - 1) return true;
	else return false;
}

//生成决策树的方法 
Node * BulidDecisionTree(Node * p, vector  > remain_state,
	vector  remain_attribute)
{

	int i;

	if (p == NULL)
		p = new Node();

	//先看搜索到树叶的情况
	if (IsReachLeaveNode(remain_state, "yes"))
	{
		p->attribute = "yes";
		return p;
	}
	if (IsReachLeaveNode(remain_state, "no"))
	{
		p->attribute = "no";
		return p;
	}
	//所有的属性均已经考虑完了,还没有分尽
	if (remain_attribute.size() == 0)
	{
		string label = MostCommonLabel(remain_state);
		p->attribute = label;
		return p;
	}

	double max_gain = 0;	//记录最大的熵值 
	double temp_gain;

	//信息增益算出来不一定大于0
	//max_it应该初始化为remain_attribute.begin()
	vector ::iterator max_it = remain_attribute.begin();
	vector ::iterator ittmp = remain_attribute.begin();
	for (; ittmp < remain_attribute.end(); ittmp++)
	{
		temp_gain = Gain(remain_state, (*ittmp));	//求出指针对应属性的熵值 
		if (temp_gain > max_gain)
		{//若找到更佳的属性则转化 
			max_gain = temp_gain;
			max_it = ittmp;
		}
	}

	//下面根据max_it指向的属性来划分当前样例,更新样例集和属性集
	vector  new_attribute;//保存去除最佳属性后的剩余属性 
	vector  > new_state;//保存剩余的实例 
	vector ::iterator it2 = remain_attribute.begin();
	for (; it2 < remain_attribute.end(); it2++)
	{
		if ((*it2).compare(*max_it)) new_attribute.push_back(*it2);
	}

	//确定了最佳划分属性,注意保存
	p->attribute = *max_it;
	vector  values; //存放最佳属性的属性值 
	values = map_attribute_values[*max_it];
	int attribue_num = FindAttriNumByName(*max_it);
	new_state.push_back(attribute_row);//将第一行的属性先包括 
	vector ::iterator it3 = values.begin();
	for (; it3 < values.end(); it3++)
	{
		for (i = 1; i < remain_state.size(); i++)
		{
			if (!remain_state[i][attribue_num].compare(*it3))
			{
				new_state.push_back(remain_state[i]);
			}
		}
		Node * new_node = new Node();
		new_node->arrived_value = *it3;
		if (new_state.size() == 0)
		{//表示当前没有这个分支的样例,当前的new_node为叶子节点
			new_node->attribute = MostCommonLabel(remain_state);
		}
		else
			BulidDecisionTree(new_node, new_state, new_attribute);

		p->childs.push_back(new_node);//将新结点加入父节点孩子容器
		//注意先清空new_state中的前一个取值的样例,准备遍历下一个取值样例
		new_state.erase(new_state.begin() + 1, new_state.end());
	}
	return p;
}

//打印树的函数 
void PrintTree(Node *p, int depth)
{
	for (int i = 0; i < depth; i++)
	{
		fout << '\t';//按照树的深度先输出tab
		cout << '\t';//按照树的深度先输出tab
	}
	if (!p->arrived_value.empty())
	{
		fout << p->arrived_value << endl;
		cout << p->arrived_value << endl;
		for (int i = 0; i < depth + 1; i++)
		{
			fout << '\t';//按照树的深度先输出tab
			cout << '\t';//按照树的深度先输出tab
		}
	}
	fout << p->attribute << endl;
	cout << p->attribute << endl;
	for (vector::iterator it = p->childs.begin(); it != p->childs.end(); it++)
	{
		PrintTree(*it, depth + 1);	//递归 
	}
}

//对单条数据,输出最后的结果 
string TestClassifiter(string test[])
{
	string ans = "";
	Node* arrived_attibute = root;	//初始化为root 
	bool isFind;
	while (true)
	{
		isFind = false;
		if (arrived_attibute->childs.size() == 1) break;	//此时说明已经达到叶子节点 
		vector::iterator it = arrived_attibute->childs.begin();
		//循环扫描分叉中的值 
		for (; it != arrived_attibute->childs.end(); it++)
		{
			for (int k = 0; k < 4; k++)
			{
				if ((*it)->arrived_value == test[k])
				{
					isFind = true;
					break;
				}
			}
			if (isFind) break;
		}
		if (!isFind)	break;
		arrived_attibute = *it;
	}
	//叶子节点输出
	return arrived_attibute->attribute;
}

int main()
{

	int i;
	DataInit();	//数据初始化 

	//remain_attribute存放需要划分的属性(一开始是所有属性) 
	vector  remain_attribute;
	for (i = 1; i < MAXLEN - 1; i++)
		remain_attribute.push_back(attrName[i]);

	//remain_state存放实例集(一开始是整个实例集)	
	vector  > remain_state;
	for (unsigned int i = 0; i < state.size(); i++)
	{
		remain_state.push_back(state[i]);
	}

	MapAttributeValue();//将属性值映射到对应的属性 

	root = BulidDecisionTree(root, remain_state, remain_attribute);
	fout << "the decision tree is :" << endl;
	PrintTree(root, 0);


	//以下是数据测试 
	string test[] = { "small", "strong", "expensive", "low" };
	cout << "the test result is: " << TestClassifiter(test) << endl;
	system("pause");
	return 0;
}


data数据如下:

1 small strong expensive high no
2 small strong expensive a_bit_high no
3 small strong expensive common yes
4 small strong expensive low no
5 small strong a_litle_expensive high yes
6 small strong a_litle_expensive a_bit_high no
7 small strong a_litle_expensive common yes
8 small strong a_litle_expensive low no
9 small strong affordable high yes
10 small strong affordable a_bit_high yes
11 small strong affordable common yes
12 small strong affordable low no
13 small strong substantial high yes
14 small strong substantial a_bit_high no
15 small strong substantial common no
16 small strong substantial low no
17 small strong cheap high no
18 small strong cheap a_bit_high no
19 small strong cheap common no
20 small strong cheap low no
21 small elegant expensive high no
22 small elegant expensive a_bit_high no
23 small elegant expensive common yes
24 small elegant expensive low no
25 small elegant a_litle_expensive high no
26 small elegant a_litle_expensive a_bit_high yes
27 small elegant a_litle_expensive common no
28 small elegant a_litle_expensive low no
29 small elegant affordable high yes
30 small elegant affordable a_bit_high yes
31 small elegant affordable common no
32 small elegant affordable low no
33 small elegant substantial high yes
34 small elegant substantial a_bit_high yes
35 small elegant substantial common no
36 small elegant substantial low no
37 small elegant cheap high yes
38 small elegant cheap a_bit_high yes
39 small elegant cheap common no
40 small elegant cheap low no
41 small slight expensive high no
42 small slight expensive a_bit_high yes
43 small slight expensive common no
44 small slight expensive low no
45 small slight a_litle_expensive high yes
46 small slight a_litle_expensive a_bit_high no
47 small slight a_litle_expensive common yes
48 small slight a_litle_expensive low no
49 small slight affordable high no
50 small slight affordable a_bit_high no
51 small slight affordable common no
52 small slight affordable low no
53 small slight substantial high yes
54 small slight substantial a_bit_high no
55 small slight substantial common no
56 small slight substantial low no
57 small slight cheap high yes
58 small slight cheap a_bit_high no
59 small slight cheap common no
60 small slight cheap low no
61 smart strong expensive high yes
62 smart strong expensive a_bit_high no
63 smart strong expensive common no
64 smart strong expensive low no
65 smart strong a_litle_expensive high no
66 smart strong a_litle_expensive a_bit_high yes
67 smart strong a_litle_expensive common no
68 smart strong a_litle_expensive low no
69 smart strong affordable high yes
70 smart strong affordable a_bit_high yes
71 smart strong affordable common no
72 smart strong affordable low no
73 smart strong substantial high no
74 smart strong substantial a_bit_high no
75 smart strong substantial common no
76 smart strong substantial low no
77 smart strong cheap high no
78 smart strong cheap a_bit_high no
79 smart strong cheap common yes
80 smart strong cheap low no
81 smart elegant expensive high yes
82 smart elegant expensive a_bit_high no
83 smart elegant expensive common yes
84 smart elegant expensive low no
85 smart elegant a_litle_expensive high yes
86 smart elegant a_litle_expensive a_bit_high yes
87 smart elegant a_litle_expensive common yes
88 smart elegant a_litle_expensive low no
89 smart elegant affordable high yes
90 smart elegant affordable a_bit_high yes
91 smart elegant affordable common yes
92 smart elegant affordable low no
93 smart elegant substantial high yes
94 smart elegant substantial a_bit_high yes
95 smart elegant substantial common no
96 smart elegant substantial low no
97 smart elegant cheap high yes
98 smart elegant cheap a_bit_high no
99 smart elegant cheap common yes
100 smart elegant cheap low no
101 smart slight expensive high yes
102 smart slight expensive a_bit_high yes
103 smart slight expensive common no
104 smart slight expensive low no
105 smart slight a_litle_expensive high no
106 smart slight a_litle_expensive a_bit_high yes
107 smart slight a_litle_expensive common no
108 smart slight a_litle_expensive low no
109 smart slight affordable high no
110 smart slight affordable a_bit_high yes
111 smart slight affordable common yes
112 smart slight affordable low no
113 smart slight substantial high yes
114 smart slight substantial a_bit_high yes
115 smart slight substantial common yes
116 smart slight substantial low no
117 smart slight cheap high yes
118 smart slight cheap a_bit_high no
119 smart slight cheap common yes
120 smart slight cheap low no
121 middle strong expensive high yes
122 middle strong expensive a_bit_high no
123 middle strong expensive common yes
124 middle strong expensive low no
125 middle strong a_litle_expensive high no
126 middle strong a_litle_expensive a_bit_high no
127 middle strong a_litle_expensive common yes
128 middle strong a_litle_expensive low no
129 middle strong affordable high yes
130 middle strong affordable a_bit_high yes
131 middle strong affordable common no
132 middle strong affordable low no
133 middle strong substantial high no
134 middle strong substantial a_bit_high no
135 middle strong substantial common no
136 middle strong substantial low no
137 middle strong cheap high no
138 middle strong cheap a_bit_high yes
139 middle strong cheap common yes
140 middle strong cheap low no
141 middle elegant expensive high yes
142 middle elegant expensive a_bit_high no
143 middle elegant expensive common yes
144 middle elegant expensive low no
145 middle elegant a_litle_expensive high no
146 middle elegant a_litle_expensive a_bit_high yes
147 middle elegant a_litle_expensive common no
148 middle elegant a_litle_expensive low no
149 middle elegant affordable high no
150 middle elegant affordable a_bit_high yes
151 middle elegant affordable common yes
152 middle elegant affordable low no
153 middle elegant substantial high yes
154 middle elegant substantial a_bit_high no
155 middle elegant substantial common no
156 middle elegant substantial low no
157 middle elegant cheap high no
158 middle elegant cheap a_bit_high no
159 middle elegant cheap common yes
160 middle elegant cheap low no
161 middle slight expensive high yes
162 middle slight expensive a_bit_high yes
163 middle slight expensive common no
164 middle slight expensive low no
165 middle slight a_litle_expensive high yes
166 middle slight a_litle_expensive a_bit_high no
167 middle slight a_litle_expensive common no
168 middle slight a_litle_expensive low no
169 middle slight affordable high no
170 middle slight affordable a_bit_high yes
171 middle slight affordable common no
172 middle slight affordable low no
173 middle slight substantial high yes
174 middle slight substantial a_bit_high yes
175 middle slight substantial common yes
176 middle slight substantial low no
177 middle slight cheap high no
178 middle slight cheap a_bit_high yes
179 middle slight cheap common yes
180 middle slight cheap low no
181 large strong expensive high yes
182 large strong expensive a_bit_high no
183 large strong expensive common no
184 large strong expensive low no
185 large strong a_litle_expensive high no
186 large strong a_litle_expensive a_bit_high no
187 large strong a_litle_expensive common no
188 large strong a_litle_expensive low no
189 large strong affordable high no
190 large strong affordable a_bit_high no
191 large strong affordable common no
192 large strong affordable low no
193 large strong substantial high no
194 large strong substantial a_bit_high no
195 large strong substantial common no
196 large strong substantial low no
197 large strong cheap high yes
198 large strong cheap a_bit_high no
199 large strong cheap common yes
200 large strong cheap low no
201 large elegant expensive high yes
202 large elegant expensive a_bit_high yes
203 large elegant expensive common yes
204 large elegant expensive low no
205 large elegant a_litle_expensive high yes
206 large elegant a_litle_expensive a_bit_high no
207 large elegant a_litle_expensive common yes
208 large elegant a_litle_expensive low no
209 large elegant affordable high yes
210 large elegant affordable a_bit_high no
211 large elegant affordable common yes
212 large elegant affordable low no
213 large elegant substantial high no
214 large elegant substantial a_bit_high yes
215 large elegant substantial common no
216 large elegant substantial low no
217 large elegant cheap high yes
218 large elegant cheap a_bit_high no
219 large elegant cheap common yes
220 large elegant cheap low no
221 large slight expensive high no
222 large slight expensive a_bit_high no
223 large slight expensive common no
224 large slight expensive low no
225 large slight a_litle_expensive high yes
226 large slight a_litle_expensive a_bit_high yes
227 large slight a_litle_expensive common no
228 large slight a_litle_expensive low no
229 large slight affordable high yes
230 large slight affordable a_bit_high yes
231 large slight affordable common yes
232 large slight affordable low no
233 large slight substantial high yes
234 large slight substantial a_bit_high no
235 large slight substantial common yes
236 large slight substantial low no
237 large slight cheap high no
238 large slight cheap a_bit_high yes
239 large slight cheap common yes
240 large slight cheap low no
end

你可能感兴趣的:(机器学习)