机器学习聚类算法AGNES层次聚类 C++实现

C++实现AGNES算法

该算法是自底向上的层次聚类算法。它的主要原理是:先把数据集中的每个样本都看成一个初始聚类,然后在算法运行过程中不断找出距离最近的两个聚类并将它们合并,直到聚类个数减少到自己设定的数目为止。所用数据集为 Python 中常用的鸢尾花(Iris)数据,伪代码如下:
机器学习聚类算法AGNES层次聚类 C++实现_第1张图片

C++代码如下

//DataPoint.h 保存基础数据的类
#ifndef _DATAPOINT_H_
#define  _DATAPOINT_H_

#include <iostream>
#include <string>
#include <vector>
#include <cmath>
using namespace std;
// One sample of the data set: its numeric values, its name and the id of the
// cluster it is currently assigned to.
class DataPoint
{
public:
	// clusterId starts at 0 so that GetClusterId() on a freshly constructed
	// point returns a defined value instead of reading an uninitialized int.
	DataPoint() : clusterId(0) {}
	~DataPoint() {}

	vector<double> GetData(); // returns a copy of the stored values
	string  GetName(); // returns the flower name
	int GetClusterId(); // returns the cluster id

	void SetClusterId(int number); // sets the cluster id
	void SetName(string str); // sets the flower name
	void SetData(vector<double> v); // stores the numeric values
private:

	vector<double> Data; // stored numeric values
	string name; // flower name
	int clusterId; // cluster id

};
#endif // _DATAPOINT_H_
//DataPoint.cpp
#include "DataPoint.h"
void DataPoint::SetClusterId(int number)
{
	this->clusterId = number;
}
// Replaces the stored values with v.
// The previous implementation appended v to whatever was already stored, so a
// second call silently produced a longer vector; a setter should overwrite.
void DataPoint::SetData(vector<double> v)
{
	// v is already a private copy (passed by value), so swapping it in avoids
	// a second element-by-element copy.
	Data.swap(v);
}
// Stores the flower name for this point.
void DataPoint::SetName(string name)
{
	// The parameter shadows the member, so the member is reached through this.
	this->name.assign(name);
}
vector<double> DataPoint::GetData()
{
	return this->Data;
}
string DataPoint::GetName()
{
	return this->name;
}
int DataPoint::GetClusterId()
{
	return this->clusterId;
}
//AGNES.h
#ifndef _AGNES_H_
#define  _AGNES_H_

#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>
#include "DataPoint.h"
using namespace std;
// Converts the text in str to a value of type Type using stream extraction.
// Returns a value-initialized Type (0 for arithmetic types) when nothing can
// be extracted, e.g. for an empty string; previously the result was an
// uninitialized variable in that case.
template <class Type>
Type stringToNum(const std::string& str)
{
	std::istringstream iss(str);
	Type num = Type(); // defined fallback if extraction does not write to num
	iss >> num;
	return num;
}
// Bottom-up hierarchical clustering driver: loads the samples, repeatedly
// merges the two closest clusters and prints the result.
class AGNES
{
public:
	// ThreshordDis: distance threshold (stored but not used by the visible code).
	// MaxClu: number of clusters at which merging stops.
	// p starts at 0 so that print() before StartAgnes() sees no clusters
	// instead of reading an uninitialized counter.
	AGNES(double ThreshordDis, int MaxClu) :ThreshordDis(ThreshordDis), MaxClu(MaxClu), p(0) {}
	~AGNES() {}
	void StartAgnes(); // core of the algorithm
	void GetData(); // loads the data
	void print(); // prints the clusters
private:
	double GetDistance(DataPoint &point, DataPoint &point2); // distance between two samples
	double GetCDistance(vector<DataPoint> C, vector<DataPoint> C2); // minimum distance between two clusters
	double GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2); // average distance between two clusters
	vector<DataPoint> DataBase; // all loaded samples
	vector<vector<DataPoint>> CluData; // samples grouped by cluster


	double ThreshordDis; // currently unused, could be removed
	int MaxClu; // number of clusters to end up with
	int  p; // current number of clusters
};
#endif
//AGNES.cpp
#include "AGNES.h"
// File-local table of pairwise distances: DataMap[i][j] holds the distance
// between cluster i and cluster j. The size is fixed at 10000 x 10000 doubles,
// which also bounds how many clusters can be indexed.
static double DataMap[10000][10000]; // distance between every pair of clusters
// Reads "iris.csv" (relative path) and appends one DataPoint per line to
// DataBase. Each line is split on ','. The first field is a 1-based record
// number, the last field is the name, and every field in between is parsed as
// a double value.
// Lines with fewer than two fields (for example blank lines) are skipped;
// previously they caused an unsigned underflow of temp.size()-1 and an
// out-of-range access.
void AGNES::GetData()
{
	ifstream file;
	file.open("iris.csv", ios::in);
	if (file.fail())
	{
		cout << "文件打开失败" << endl;
		return; // nothing can be read from a stream that failed to open
	}

	string line;
	while (getline(file, line))
	{
		// Split the line into comma-separated fields.
		stringstream ss(line);
		string field;
		vector<string> fields;
		while (getline(ss, field, ','))
		{
			fields.push_back(field);
		}

		// Both the record number and the name are required.
		if (fields.size() < 2)
		{
			continue;
		}

		// Everything between the first and the last field is numeric data.
		vector<double> values;
		for (size_t i = 1; i + 1 < fields.size(); ++i)
		{
			values.push_back(stringToNum<double>(fields[i]));
		}

		DataPoint d;
		d.SetData(values);
		d.SetName(fields.back());
		// Records are numbered from 1 in the file; shift so ids start at 0.
		d.SetClusterId(stringToNum<int>(fields[0]) - 1);
		DataBase.push_back(d);
	}
}

void AGNES::StartAgnes()
{
	//获得每个数据距离其他数据的距离
	for (int i = 0; i < DataBase.size(); i++)
	{
		vector<DataPoint> t;
		t.push_back(DataBase[i]);
		CluData.push_back(t);
	}
	//DataMap数组保存两个聚类之间的距离
	for (int i = 0; i < CluData.size(); i++)
	{
		vector<DataPoint> temp;
		for (int j =i+1; j < CluData.size(); j++)
		{
			double dis = GetAVGDistance(CluData[i], CluData[j]);
			DataMap[i][j] = dis;
			DataMap[j][i] = dis;
		}
	}
	p = DataBase.size(); //设置聚类个数
	while (p > MaxClu)
	{
		double Temp = 9999;
		int Find_i = 0, Find_j = 0;
		//寻找最小值点
		for (int i = 0; i < p; i++)
		{
			for (int j = i+1; j < p; j++)
			{
				if (DataMap[i][j] < Temp)
				{
					Temp = DataMap[i][j];
					Find_i = i;
					Find_j = j;
				}
			}
		}
		int NewId = CluData[Find_i][0].GetClusterId(); //获取当前簇的聚类号码
		//将寻找到的最小点的两个簇合并 A1+B1->A1
		//并将两个簇的簇类号码统一
		for (int j = 0; j < CluData[Find_j].size(); j++)
		{
			CluData[Find_j][j].SetClusterId(NewId);
			CluData[Find_i].push_back(CluData[Find_j][j]);
		}
		//重编号矩阵
		for (int i = Find_j + 1; i < p; i++)
		{
			for (int j = 0; j < CluData[i].size(); j++)
			{
				CluData[i][j].SetClusterId(CluData[i][j].GetClusterId() - 1);
			}
		}
		//距离矩阵重置
		for (int i = Find_j; i < CluData.size()-1; i++)
		{
			CluData[i] = CluData[i + 1];
		}
		//删除DataMap矩阵 行列
		for (int i = 0; i < p; i++)
		{
			for (int j = Find_j; j < p-1; j++)
			{
				DataMap[i][j] = DataMap[i][j + 1];
			}
		}
		for (int i = Find_j; i < p; i++)
		{
			for (int j = 0; j < p - 1; j++)
			{
				DataMap[i][j] = DataMap[i + 1][j];
			}
		}
		p = p - 1;
		//重新计算合并数据后的聚类与其他聚类之间的距离
		for (int i = 0; i < p; ++i)
		{
			double dis = GetAVGDistance(CluData[Find_i], CluData[i]);
			if (DataMap[Find_i][i] != dis)
			{
				DataMap[Find_i][i] = dis;
				DataMap[i][Find_i] = dis;
			}
		}
	}
}
//打印数据
void AGNES::print()
{
	for (int i = 0; i < p; i++)
	{
		cout << "聚类编号:" << i << endl;
		for (int j = 0; j < CluData[i].size(); j++)
		{
			vector<double> dat = CluData[i][j].GetData();
			for (int s = 0; s < dat.size(); s++)
			{
				cout << dat[s] << " ";
			}
			cout << CluData[i][j].GetName() << " ";
			cout << CluData[i][j].GetClusterId();
			cout << endl;
		}
	}
}
// Returns the Euclidean distance between the values of two samples.
// GetData() returns a copy, so each sample's values are fetched once instead
// of several times per loop iteration. If the two samples hold a different
// number of values, only the common leading dimensions are compared, which
// avoids reading past the end of the shorter one.
double AGNES::GetDistance(DataPoint &point, DataPoint &point2)
{
	const vector<double> a = point.GetData();
	const vector<double> b = point2.GetData();
	const size_t dims = a.size() < b.size() ? a.size() : b.size();

	double sum = 0;
	for (size_t i = 0; i < dims; ++i)
	{
		const double diff = a[i] - b[i];
		sum += diff * diff;
	}
	return sqrt(sum);
}
//两个聚类之间最小的距离
double AGNES::GetCDistance(vector<DataPoint> C, vector<DataPoint> C2)
{
	double MinDis = 9999;
	for (int i = 0; i < C.size(); i++)
	{
		for (int j = 0; j < C2.size(); j++)
		{
			double dis = GetDistance(C[i], C2[j]);
			if (dis < MinDis)
			{
				MinDis = dis;
			}
		}
	}
	return MinDis;
}
//两个聚类的平均距离
double AGNES::GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2)
{
	double temp[4];
	double temp2[4];
	for (int i = 0; i < C.size(); i++)
	{
		temp[0] += C[i].GetData()[0];
		temp[1] += C[i].GetData()[1];
		temp[2] += C[i].GetData()[2];
		temp[3] += C[i].GetData()[3];
	}
	temp[0] = temp[0] / C.size();
	temp[1] = temp[1] / C.size();
	temp[2] = temp[2] / C.size();
	temp[3] = temp[3] / C.size();

	for (int i = 0; i < C2.size(); i++)
	{
		temp2[0] += C2[i].GetData()[0];
		temp2[1] += C2[i].GetData()[1];
		temp2[2] += C2[i].GetData()[2];
		temp2[3] += C2[i].GetData()[3];
	}
	temp2[0] = temp2[0] / C2.size();
	temp2[1] = temp2[1] / C2.size();
	temp2[2] = temp2[2] / C2.size();
	temp2[3] = temp2[3] / C2.size();
	int sum = 0;
	for (int i = 0; i < 4; i++)
	{
		sum += ((temp[i] - temp2[i])*(temp[i] - temp2[i]));
	}
	return sqrt(sum);
}

你可能感兴趣的:(机器学习,聚类算法,c++,算法)