大数据排序问题

程序描述:

(1)海量数据,内存不足,需要将文件分割成多个文件,从磁盘读入;

(2)将每个小文件排好序;

(3)归并每个排好序的文件,合成一个文件。


#include <cstdio>    // FILE, fopen, fscanf, fprintf, sprintf
#include <cstdlib>   // qsort, exit
#include <iostream>  // cout, endl
#include <string>    // string
// NOTE(review): the original listed five headers whose names were lost when the page was extracted; only the four required by the visible code are restored here.
using namespace std;  


// Size of the data set, 10M values (presumably the total count to sort; it is
// not referenced by the code visible in this excerpt).
int sort_num = 10 * 1000 * 1000;
// Number of ints held in memory at once (250k): the chunk size used when
// reading and sorting.
int memory_size = 250 * 1000;


// Reads one chunk of integers from fp into space.
//
// At most memory_size values are read, so only one chunk has to fit in memory
// at a time. space must have room for memory_size ints.
//
// Returns the number of integers actually stored; 0 means no further integer
// could be read (end of file or malformed input).
int read_data(FILE *fp, int *space)
{
	int count = 0;
	// Test for exactly one successful conversion instead of "!= EOF": on a
	// non-numeric token fscanf returns 0 without consuming any input, which
	// the old test counted as a successful read. The buffer was then filled
	// with uninitialised values and the caller never received a 0 count.
	while (count < memory_size && fscanf(fp, "%d ", &space[count]) == 1)
		++count;
	return count;
}


// Writes the first num integers of space to fp.
//
// Each value is formatted with "%10d ": right-aligned in a 10-character field
// and followed by a single space. No newline is emitted, so the values end up
// on one line.
void write_data(FILE *fp, int *space, int num)
{
	// The previous version incremented a counter `j` whose declaration had
	// been commented out, so the function did not compile. The counter only
	// served a disabled line-wrapping feature and is removed here.
	for (int index = 0; index < num; ++index)
		fprintf(fp, "%10d ", space[index]);
}


// Verifies that fp refers to an opened file.
// A non-null pointer is accepted silently; a null pointer prints a diagnostic
// to standard output and terminates the process with exit status 1.
void check_fp(FILE *fp)
{
	if (fp != NULL)
		return;

	std::cout << "The file pointer is invalid!" << std::endl;
	exit(1);
}


// Comparison callback for qsort over an array of int.
//
// Both arguments arrive as const void* (the qsort callback signature) and
// point at int elements. Returns a negative value, zero, or a positive value
// when *first_num is respectively less than, equal to, or greater than
// *second_num.
int compare(const void *first_num, const void *second_num)
{
	const int lhs = *static_cast<const int *>(first_num);
	const int rhs = *static_cast<const int *>(second_num);
	// Do not return lhs - rhs: the subtraction overflows for large operands
	// of opposite sign (undefined behaviour) and then reports the wrong order.
	return (lhs > rhs) - (lhs < rhs);
}


// Returns the name of the n-th auxiliary chunk file, "data<n>.txt"
// (data1.txt, data2.txt, ...).
string new_file_name(int n)
{
	string name("data");
	name += std::to_string(n);
	name += ".txt";
	return name;
}


// Internal-sort phase: reads "data.txt" one chunk at a time, sorts each chunk
// in memory with qsort, and writes it to its own auxiliary file
// (data1.txt, data2.txt, ...).
// NOTE(review): the text of this function is truncated further down (see the
// marker before the "counter = " line); its return value and cleanup code are
// not visible in this excerpt.
int memory_sort()  
{  
	// open the target file.  
	FILE *fp_in_file = fopen("data.txt", "r");  
	check_fp(fp_in_file);  
	// Number of auxiliary files created so far; also used to name them.
	int counter = 0;  


	while (true)  
	{  
		// allocate space to store data read from file.  
		// NOTE(review): a fresh buffer is allocated on every iteration and no
		// matching delete[] is visible in this excerpt - confirm it is released.
		int *space = new int[memory_size];		// buffer of memory_size ints
		int num = read_data(fp_in_file, space); // read the next chunk (at most memory_size values)
		 
		if (num == 0)  // nothing left to read: the whole input has been consumed
			break;  


		// Sort the chunk with the C library routine:
		//void qsort(void *base, int nelem, unsigned int width, int ( * pfCompare)( const void *, const void *));
		qsort(space, num, sizeof(int), compare);  
		// create a new auxiliary file name.  
		string file_name = new_file_name(++counter);  


		FILE *fp_aux_file = fopen(file_name.c_str(), "w");  
		check_fp(fp_aux_file);  


		// write the orderly numbers into auxiliary file.  
		write_data(fp_aux_file, space, num);  
		//write_data(fp_aux_file, space, num);  


		// NOTE(review): the source text is corrupted on the next line. The
		// remainder of memory_sort and the beginning of the merge routine
		// (presumably merge_sort, where file_num, finish, first_data, i and k
		// are declared) were lost, so everything below belongs to the merge loop.
		cout<<"counter = "<= file_num)  
			break;							// every file has been read to its end
	    // Choose an initial candidate for min_data.
		int min_data;
		for(int m = 0; m < file_num; m++)
		{
			if(finish[m] != 1)
			{
				min_data = first_data[m];	// take the current value of the first file that is not finished
				break;
			}
		}
		//int k;							// records the position of the minimum
		// Scan all unfinished files for the smallest current value. The >=
		// comparison means k is assigned at least for the candidate chosen above.
		for (i = 0; i < file_num; i++)  
		{  
			if (min_data >= first_data[i] && finish[i] != 1)  
			{
				min_data = first_data[i];     
				k = i;
			}
		}    
		//cout<<" k = "<

merge_sort函数还有优化的空间:例如每输出一个数都要线性扫描所有文件的当前值来选取最小值,可以改用最小堆(优先队列)来减少比较次数。

你可能感兴趣的:(C++)