基于密度的聚类算法C语言实现--DBSCAN

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Number of usable (1-based) slots initially reserved in every point's neighbour list. */
#define INITIALASSIGN_DIRECTLYDENSITYREACHABLE	100
/* Number of slots added each time a neighbour list runs out of room. */
#define INCREASEMENT_DIRECTLYDENSITYREACHABLE 	10

/* Run-time parameters; main() fills them in from the command line. */
double neighborhood = 0.0;	/* radius: two points are neighbours when their distance is <= this value */
int MinPts = 0;			/* a point is a core object when it has at least MinPts - 1 neighbours */
char filename[200] = "";	/* path of the input data file */
int data_size = 0;		/* number of points read from the data file */

/* Number of core objects, counted by statisticCoreObject(). */
int size_of_core_object = 0;

/* One two-dimensional sample read from the data file. */
struct Point
{
	double x;
	double y;
};
typedef struct Point Point;

/* All samples; allocated by Init() and indexed from 1 to data_size. */
Point *point;

/*
 * Neighbourhood record of one point.
 * The neighbour list is used 1-based: valid entries live in
 * directlyDensityReachable[1 .. reachableSize].
 */
struct CoreObject
{
	int coreObjectID;			/* ID of the point; 0 in coreObject_Collection means "not a core object" */
	int *directlyDensityReachable;		/* IDs of the points directly density-reachable from this one */
	int reachableSize;			/* number of entries stored in the list */
	int capacity;				/* list capacity; reused in @coreObject as a "already handled" flag */
};
typedef struct CoreObject CoreObject;

/* Per-point records for every point (1 .. data_size), core object or not. */
CoreObject *coreObject_Collection;
/* Compact array holding only the core objects (1 .. size_of_core_object). */
CoreObject *coreObject;

/* Singly linked FIFO queue of point IDs. */
typedef struct QueueNode QueueNode;
typedef struct QueueNode *QueueNodePtr;

struct QueueNode
{
	int data;		/* point ID carried by this node */
	QueueNode *next;	/* following node in the queue */
};

/*
 * Queue handle. `front` is a head node that carries no data (see
 * initialQueue); the queue is empty when front == rear.
 */
typedef struct LinkQueue
{
	QueueNodePtr front;
	QueueNodePtr rear;
} LinkQueue;

/* Linked-queue operations (definitions are at the end of the file). */
void initialQueue(LinkQueue* LQ);		/* create the data-less head node */
void insertQueue(LinkQueue* LQ, int pointID);	/* append pointID at the rear */
void deleteQueue(LinkQueue* LQ, int* pointID);	/* pop the front element into *pointID */
void printQueue(LinkQueue LQ);			/* print every queued ID */
void testQueue(void);				/* small self-test of the queue */
int isEmptyQueue(LinkQueue LQ);			/* 1 when empty, 0 otherwise */
/* End of linked-queue operations. */

/*
 * Prototypes of the clustering pipeline and its helpers.
 * Functions without parameters are declared with (void) so that the
 * compiler rejects calls that pass arguments by mistake.
 */
void Init(void);
void ReadData(void);
double calculateDistance_BetweenTwo(int firstPoint, int secondPoint);
void calculateDistance_BetweenOneToAll(int pointID);
void calculateDistance_BetweenAll(void);
void statisticCoreObject(void);
void showInformation(void);
void setCoreObject(void);
int* preparatory_DBSCAN(void);
void DBSCAN(void);
void refreshOld_unAccessed_Set(int* un_accessed_data, int* old_unAccessedData);
int existCoreObject(void);
int getRandomCoreObject(void);
void addToQueue_baseCoreObject(LinkQueue* LQ, int coreObjectID);
void updateUnaccessSet(int* un_accessed_data, int randomCoreObjectID);
void addToQueue_intersectionBased(LinkQueue* LQ, int* un_accessed_set, int pop_Queue_ID);
void getCluster(int* un_accessed_data, int* old_unAccessedData, int clusterID);
void updateCoreObject(int* un_accessed_data);
void saveNoise(int* un_accessed_data);

/*
 * Program entry point.
 *
 * Expects exactly four command-line arguments:
 *   argv[1]  neighborhood radius
 *   argv[2]  MinPts
 *   argv[3]  name of the file that contains the data
 *   argv[4]  number of points to read from that file
 *
 * After storing the parameters in the file-level globals it runs the
 * pipeline: Init -> ReadData -> calculateDistance_BetweenAll ->
 * statisticCoreObject -> setCoreObject -> DBSCAN.
 *
 * Error handling follows the rest of the file: print a message and exit(0).
 */
int main(int argc, char* argv[])
{
	if( argc != 5 )
	{
		printf("this program needs 4 parameters to run,"
				"\n\t\tthe first to indicate the neighborhood"
				"\n\t\tthe second to indicate the MinPts"
				"\n\t\tthe third to indicate the filename contain data"
				"\n\t\tthe fourth to indicate the data size\n");
		exit(0);
	}
	srand((unsigned)time(NULL));
	neighborhood = atof(argv[1]);
	MinPts = atoi(argv[2]);

	/* Bounded copy: an over-long path must not overflow the fixed-size buffer. */
	int written = snprintf(filename, sizeof filename, "%s", argv[3]);
	if( written < 0 || (size_t)written >= sizeof filename )
	{
		printf("filename is too long (at most %d characters)\n", (int)(sizeof filename - 1));
		exit(0);
	}

	/* Every array is sized from data_size, so it has to be a positive count. */
	data_size = atoi(argv[4]);
	if( data_size < 1 )
	{
		printf("the data size must be a positive integer: %s\n", argv[4]);
		exit(0);
	}

	Init();
	ReadData();
	calculateDistance_BetweenAll();
	statisticCoreObject();
	//showInformation();
	setCoreObject();
	//testQueue();
	DBSCAN();
	return 0;
}

/*
 * initialization
 * */
void Init()
{
	point = (Point*)malloc(sizeof(struct Point) * (data_size + 1));
	if( !point )
	{
		printf("point malloc error");
		exit(0);
	}

	coreObject_Collection = (CoreObject*)malloc(sizeof(struct CoreObject) * (data_size + 1));
	if( !coreObject_Collection )
	{
		printf("coreObject_Collection malloc error!");
		exit(0);
	}
	int coreObject;			//traverse
	for( coreObject = 1; coreObject <= data_size; coreObject++ )
	{
		coreObject_Collection[coreObject].coreObjectID = 0;				//if the value equal 0 denote it's not core object
		coreObject_Collection[coreObject].reachableSize = 0;				//INITIALASSIGN_DIRECTLYDENSITYREACHABLE
		coreObject_Collection[coreObject].capacity = INITIALASSIGN_DIRECTLYDENSITYREACHABLE;
		coreObject_Collection[coreObject].directlyDensityReachable = (int*)malloc(sizeof(int) * (INITIALASSIGN_DIRECTLYDENSITYREACHABLE + 1));
		if( !coreObject_Collection[coreObject].directlyDensityReachable )
		{
			printf("coreObject_Collection malloc error: %d", coreObject);
			exit(0);
		}
	}

}

/*
 * Read data_size points from the file named by @filename into
 * point[1 .. data_size]. Each record is two floating-point numbers
 * (x then y) separated by white space.
 *	The program terminates with a message if the file cannot be opened
 *	or a record cannot be parsed. The file is closed before returning.
 * */
void ReadData()
{
	/* named `in` rather than `fread` so the standard library function is not shadowed */
	FILE* in = fopen(filename, "r");
	if( NULL == in )
	{
		printf("open file(%s) error!", filename);
		exit(0);
	}
	int i;
	for( i = 1; i <= data_size; i++ )
	{
		if( 2 != fscanf(in, "%lf\t%lf", &point[i].x, &point[i].y))
		{
			printf("scanf error: %d", i);
			fclose(in);
			exit(0);
		}
	}
	fclose(in);
}

/*
 * Euclidean distance between point[firstPoint] and point[secondPoint].
 * */
double calculateDistance_BetweenTwo(int firstPoint, int secondPoint)
{
	const double dx = point[firstPoint].x - point[secondPoint].x;
	const double dy = point[firstPoint].y - point[secondPoint].y;

	return sqrt(pow(dx, 2) + pow(dy, 2));
}

/*
 * calculate distance bewteen specifed point to all others points
 * and seek the directly_density_reachable of the specified point &pointID
 * */
void calculateDistance_BetweenOneToAll(int pointID)
{
	int i;
	for( i = 1; i <= data_size; i++ )
	{
		if( i != pointID )
		{
			if( calculateDistance_BetweenTwo(pointID, i) <= neighborhood )
			{
				coreObject_Collection[pointID].reachableSize++;
				if( coreObject_Collection[pointID].reachableSize > coreObject_Collection[pointID].capacity )
				{
					printf("\nrealloc\n\n");
					coreObject_Collection[pointID].directlyDensityReachable = (int*)realloc(coreObject_Collection[pointID].directlyDensityReachable, sizeof(int) * (coreObject_Collection[pointID].capacity + INCREASEMENT_DIRECTLYDENSITYREACHABLE));
					if( !coreObject_Collection[pointID].directlyDensityReachable )
					{
						printf("coreObject_Collection[%d].directlyDensityReachable realloc error", i);
						exit(0);
					}
					coreObject_Collection[pointID].capacity += INCREASEMENT_DIRECTLYDENSITYREACHABLE;
				}
				coreObject_Collection[pointID].directlyDensityReachable[coreObject_Collection[pointID].reachableSize] = i;
			}
		}
	}
}

/*
 * Build the neighbour list of every point, one point at a time.
 * */
void calculateDistance_BetweenAll()
{
	int pointID = 1;

	while (pointID <= data_size) {
		calculateDistance_BetweenOneToAll(pointID);
		pointID++;
	}
}

/*
 * specify the core object by statisticing the number of directly_density_reachable for all points
 * the value of coreObject in the struct of coreObject_Collection be used to denote whether or not a core object
 * */
void statisticCoreObject()
{
	int i;
	for( i = 1; i <= data_size; i++ )
	{
		if( coreObject_Collection[i].reachableSize >= MinPts - 1 )			//core object
		{
			size_of_core_object++;
			coreObject_Collection[i].coreObjectID = i;			//ueing non_zero value to denote this point is a core_object
		}
	}
}

/*
 * show the struct of the directly_density_reachable of all coreObject
 * */
void showInformation()
{
	int direct_reachable;
	int coreObject;
	for( coreObject = 1; coreObject <= data_size ; coreObject++ )
	{
		printf("%d---", coreObject_Collection[coreObject].coreObjectID);
		for( direct_reachable = 1; direct_reachable <= coreObject_Collection[coreObject].reachableSize; direct_reachable++ )
		{
			printf("%d ", coreObject_Collection[coreObject].directlyDensityReachable[direct_reachable]);
		}
		printf("\n");
	}
}

/*
 * Build the compact array @coreObject from @coreObject_Collection.
 *	Every point with at least MinPts - 1 neighbours gets one entry
 *	(indices 1 .. size_of_core_object) holding its ID and a private copy
 *	of its neighbour list. In this array `capacity` is reused as a flag:
 *	0 means the core object has not been handled yet.
 *	Allocation failures print a message and terminate the program.
 * */
void setCoreObject()
{
	coreObject = (CoreObject*)malloc(sizeof(struct CoreObject) * (size_of_core_object + 1));
	if( !coreObject )
	{
		printf("coreObject malloc error!");
		exit(0);
	}
	int i;
	int j;
	int count = 1;
	for( i = 1; i <= data_size; i++ )
	{
		if( coreObject_Collection[i].reachableSize >= MinPts - 1 )
		{
			coreObject[count].coreObjectID = i;
			coreObject[count].directlyDensityReachable = (int*)malloc(sizeof(int) * (coreObject_Collection[i].reachableSize + 1));
			if( !coreObject[count].directlyDensityReachable )
			{
				/* the %d needs its argument; omitting it is undefined behaviour */
				printf("coreObject[%d].directlyDensityReachable malloc error!", count);
				exit(0);
			}
			for( j = 1; j <= coreObject_Collection[i].reachableSize; j++ )
			{
				coreObject[count].directlyDensityReachable[j] = coreObject_Collection[i].directlyDensityReachable[j];
			}
			coreObject[count].capacity = 0;		//reused as the "has been selected" flag
			coreObject[count].reachableSize = coreObject_Collection[i].reachableSize;
			count++;
		}
	}
}

/*
 * Preparation for DBSCAN: create the per-point "visited" table.
 *	Returns a newly allocated array of data_size + 1 ints in which
 *	0 means "not visited yet" and -1 marks a point that has no neighbour
 *	at all (noise). The caller owns the returned array.
 * */
int* preparatory_DBSCAN()
{
	int idx;
	/* calloc hands back zeroed memory, i.e. every point starts unvisited */
	int *flags = calloc(data_size + 1, sizeof(int));

	if (flags == NULL) {
		printf("UnaccessedData malloc error!");
		exit(0);
	}

	for (idx = 1; idx <= data_size; idx++) {
		if (coreObject_Collection[idx].reachableSize == 0)
			flags[idx] = -1;	/* isolated point: noise */
	}

	return flags;
}

/********************************************************************************************************************
 ********************************************************************************************************************
 *
 * 							DBSCAN
 *
 ********************************************************************************************************************
 ********************************************************************************************************************/
void DBSCAN()
{
	int* un_accessed_data = preparatory_DBSCAN();
	int* old_unAccessedData;								//save the original information of un_accessed_data
	int i;
	old_unAccessedData = (int*)malloc(sizeof(int) * (data_size + 1));
	if( !old_unAccessedData )
	{
		printf("old_unAccessedData malloc error!");
		exit(0);
	}
	for( i = 1; i <= data_size; i++ )
		old_unAccessedData[i] = un_accessed_data[i];

	//initial the queue for containing the directly_density_reachable
	LinkQueue workQueue;
	initialQueue(&workQueue);

	int cluster_count = 0;
	int randomCoreObjectID;
	int pop_Queue_ID = 0;
	int test_counter_1 = 1;
	int test_counter_2 = 1;

	while( existCoreObject() != 0 )								//still exist core object in the @coreObject
	{
		printf("\n---------%d\n", test_counter_1);
		refreshOld_unAccessed_Set(un_accessed_data, old_unAccessedData);
		randomCoreObjectID = getRandomCoreObject();
		addToQueue_baseCoreObject(&workQueue, randomCoreObjectID);
		updateUnaccessSet(un_accessed_data, randomCoreObjectID);
		test_counter_2 = 1;
		while( !isEmptyQueue(workQueue) )
		{
			printf("\n\t++++++++++++%d\n", test_counter_2++);
			deleteQueue(&workQueue, &pop_Queue_ID);
			if( coreObject_Collection[pop_Queue_ID].reachableSize >= MinPts - 1 )
			{
				addToQueue_intersectionBased(&workQueue, un_accessed_data, pop_Queue_ID);
			}
		}
		cluster_count += 1;
		printf("\ncluster_count is %d\n", cluster_count);
		getCluster(un_accessed_data, old_unAccessedData, cluster_count);
		updateCoreObject(un_accessed_data);
		test_counter_1++;
	}
	saveNoise(un_accessed_data);
}


/*
 * Tell whether @coreObject still holds an unhandled core object.
 *	An entry whose `capacity` flag is 0 has not been handled yet.
 *		return 0: none left
 *		return 1: at least one left
 * */
int existCoreObject()
{
	int idx = 1;

	while (idx <= size_of_core_object) {
		if (coreObject[idx].capacity == 0)
			return 1;
		idx++;
	}
	return 0;
}
/*
 * Snapshot the visited table: copy entries 1 .. data_size of
 * @un_accessed_data into @old_unAccessedData.
 * */
void refreshOld_unAccessed_Set(int* un_accessed_data, int* old_unAccessedData)
{
	int idx = 1;

	while (idx <= data_size) {
		old_unAccessedData[idx] = un_accessed_data[idx];
		idx++;
	}
}
/*
 * Select one unhandled core object at random.
 *	Candidates are the entries of @coreObject whose `capacity` flag is 0.
 *	One call to rand() picks the k-th candidate (in index order) and its
 *	point ID is returned.
 *	Returns 0 (not a valid point ID, IDs start at 1) when no candidate is
 *	left; callers are expected to check existCoreObject() first.
 * */
int getRandomCoreObject()
{
	int i;
	int core_object_count = 0;
	for( i = 1; i <= size_of_core_object; i++ )
	{
		if( coreObject[i].capacity == 0 )
			core_object_count += 1;
	}
	if( core_object_count == 0 )		//avoid rand() % 0
		return 0;

	//walk to the chosen candidate directly instead of building a temporary array
	int remaining = rand() % core_object_count + 1;
	for( i = 1; i <= size_of_core_object; i++ )
	{
		if( coreObject[i].capacity == 0 && --remaining == 0 )
			return coreObject[i].coreObjectID;
	}
	return 0;				//not reached: remaining <= core_object_count
}
/*
 * Append every neighbour of the chosen core object to the work queue.
 *	The list is read from @coreObject_Collection, which is indexed by
 *	point ID, and not from @coreObject, whose index is not the point ID.
 * */
void addToQueue_baseCoreObject(LinkQueue* LQ, int coreObjectID)
{
	const CoreObject *seed = &coreObject_Collection[coreObjectID];
	int slot = 1;

	while (slot <= seed->reachableSize) {
		insertQueue(LQ, seed->directlyDensityReachable[slot]);
		slot++;
	}
}
/*
 * after selected a random core_object. we need to update the information about un-accessed-set
 * 	particular note: instead use the coreObject, we need to use the original struct coreObject_Collection,
 *			 because of the incomplete in the index of @coreObject.
 * */
void updateUnaccessSet(int* un_accessed_data, int randomCoreObjectID)
{
	int i;
	for( i = 1; i <= coreObject_Collection[randomCoreObjectID].reachableSize; i++ )
	{
		un_accessed_data[coreObject_Collection[randomCoreObjectID].directlyDensityReachable[i]] = coreObject_Collection[randomCoreObjectID].directlyDensityReachable[i];
	}
	un_accessed_data[randomCoreObjectID] = randomCoreObjectID;		//core object has visited
}
/*
 * Expand the cluster through the point @pop_Queue_ID.
 *	Every neighbour of that point that is still unvisited (flag 0) is
 *	appended to the work queue and marked as visited by storing its own
 *	ID in @un_accessed_set.
 * */
void addToQueue_intersectionBased(LinkQueue* LQ, int* un_accessed_set, int pop_Queue_ID)
{
	const CoreObject *source = &coreObject_Collection[pop_Queue_ID];
	int slot;

	for (slot = 1; slot <= source->reachableSize; slot++) {
		int neighbour = source->directlyDensityReachable[slot];

		if (un_accessed_set[neighbour] != 0)
			continue;	/* already visited or flagged as noise */

		insertQueue(LQ, neighbour);
		un_accessed_set[neighbour] = neighbour;
	}
}
/*
 * Write one cluster to .//DBSCAN_cluster//cluster_<clusterID>.data.
 *	A point belongs to the cluster when its flag in @un_accessed_data
 *	differs from the snapshot in @old_unAccessedData. Each member is
 *	written as "x<TAB>y" on its own line. The program terminates with a
 *	message if the file cannot be opened.
 * */
void getCluster(int* un_accessed_data, int* old_unAccessedData, int clusterID)
{
	char path[200];
	FILE *out;
	int idx;

	sprintf(path, ".//DBSCAN_cluster//cluster_%d.data", clusterID);
	out = fopen(path, "w");
	if (out == NULL) {
		printf("open file(%s) error", path);
		exit(0);
	}

	for (idx = 1; idx <= data_size; idx++) {
		if (un_accessed_data[idx] == old_unAccessedData[idx])
			continue;	/* flag unchanged: not part of this cluster */
		fprintf(out, "%f\t%f\n", point[idx].x, point[idx].y);
	}
	fclose(out);
}
/*
 * Flag as handled (capacity = 1) every core object whose entry in
 * @un_accessed_data is non-zero.
 * */
void updateCoreObject(int* un_accessed_data)
{
	int idx;

	for (idx = 1; idx <= size_of_core_object; idx++) {
		if (un_accessed_data[coreObject[idx].coreObjectID] == 0)
			continue;
		coreObject[idx].capacity = 1;	/* this core object has been dealt with */
	}
}
/*
 * Save the noise points to .//DBSCAN_cluster//noise.data and echo them to
 * standard output.
 *	A point is noise when its entry in @un_accessed_data is -1 or 0.
 *	Each point is written as "x<TAB>y" on its own line. The program
 *	terminates with a message if the file cannot be opened; the file is
 *	closed before returning.
 * */
void saveNoise(int* un_accessed_data)
{
	FILE* fwriteNoise;
	if( NULL == (fwriteNoise = fopen(".//DBSCAN_cluster//noise.data", "w")))
	{
		printf("open file(noise.data) error!");
		exit(0);
	}
	int i;
	printf("\nshow the noise data:\n");
	for( i = 1; i <= data_size; i++ )
	{
		if( un_accessed_data[i] == -1 || un_accessed_data[i] == 0 )
		{
			fprintf(fwriteNoise, "%f\t%f\n", point[i].x, point[i].y);
			printf("%f\t%f\n", point[i].x, point[i].y);
		}
	}
	fclose(fwriteNoise);
}

/*
 * Queue operations.
 *
 * initialQueue: allocate the data-less head node and make both @front and
 * @rear point at it (empty queue). Terminates the program with a message
 * if the allocation fails.
 * */
void initialQueue(LinkQueue* LQ)
{
	QueueNodePtr head = malloc(sizeof *head);

	LQ->front = head;
	if (head == NULL) {
		printf("Queue initial malloc error!");
		exit(0);
	}
	head->next = NULL;
	LQ->rear = head;
}
/*
 * Append @pointID at the rear of the queue.
 *	The new node becomes the tail, so its `next` is NULL, matching the
 *	state initialQueue sets up for the head node.
 *	Terminates the program with a message if the allocation fails.
 * */
void insertQueue(LinkQueue* LQ, int pointID)
{
	QueueNode* node;
	node = (QueueNodePtr)malloc(sizeof(QueueNode));
	if( !node )
	{
		printf("insert queue malloc error %d\n", pointID);
		exit(0);
	}
	node->data = pointID;
	node->next = NULL;		//the tail has no successor
	LQ->rear->next = node;
	LQ->rear = node;
}
/*
 * Remove the element at the front of the queue and store it in *pointID.
 *	On an empty queue (front == rear) nothing is removed and *pointID is
 *	left untouched.
 *	When the last element is removed the queue is reset to its initial
 *	state: rear points at the head node and the head's `next` is NULL.
 * */
void deleteQueue(LinkQueue* LQ, int* pointID)
{
	if( LQ->front == LQ->rear )		//empty: there is no node to remove
		return;

	QueueNode* p = LQ->front->next;
	*pointID = p->data;
	if( p == LQ->rear )
	{
		LQ->rear = LQ->front;
		LQ->front->next = NULL;		//do not keep a pointer derived from the freed node
	}
	else
	{
		LQ->front->next = p->next;
	}
	free(p);
}
/*
 * Print the queued IDs from front to rear on one line, separated by
 * spaces, or a notice when the queue is empty.
 * */
void printQueue(LinkQueue LQ)
{
	QueueNodePtr cursor;

	if (isEmptyQueue(LQ) == 1) {
		printf("\nqueue is empty\n");
		return;
	}

	/* every node before the rear is followed by a space ... */
	for (cursor = LQ.front->next; cursor != LQ.rear; cursor = cursor->next)
		printf("%d ", cursor->data);
	/* ... and the rear node ends the line */
	printf("%d\n", cursor->data);
}
/*
 * Return 1 when the queue holds no element (front and rear are the same
 * node), 0 otherwise.
 * */
int isEmptyQueue(LinkQueue LQ)
{
	if (LQ.front == LQ.rear)
		return 1;
	return 0;
}
/*
 * Manual self-test of the queue: insert 1..5, print, remove four
 * elements, report emptiness, remove the last one, report emptiness again
 * and print the (now empty) queue.
 * */
void testQueue()
{
	LinkQueue L;
	int value;
	int popped;

	initialQueue(&L);
	for (value = 1; value <= 5; value++)
		insertQueue(&L, value);
	printQueue(L);

	for (value = 0; value < 4; value++)
		deleteQueue(&L, &popped);
	printf("is empty = %d\n", isEmptyQueue(L));

	deleteQueue(&L, &popped);
	printf("is empty = %d\n", isEmptyQueue(L));
	printQueue(L);
}

使用简单的K-means难以对下图所示的数据进行聚类。但是在给定适当参数的情况下,DBSCAN可以很好地对下图中的数据进行聚类。

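对环形和螺线形状的数据进行聚类的运行方式如下,其中spiral.data和annulus.data中分别包含了相应图形中的点的坐标。程序需要四个命令行参数,依次为邻域半径、MinPts、数据文件名和数据点个数,例如:`<可执行文件> <邻域半径> <MinPts> spiral.data <数据点个数>`。聚类结果和噪声点会写入当前目录下的DBSCAN_cluster目录。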



聚类的结果如下图

基于密度的聚类算法C语言实现--DBSCAN_第1张图片

你可能感兴趣的:(复杂网络,算法学习,基于密度的聚类算法DBSCAN,C语言实现,机器学习,算法)