(10)大数据外存归并排序

#include 
#include 
#include 

using namespace std;


// Demo/test driver for the disk sort routines (note from the author:
// consider writing the test cases first).
void TestDiskSort();

/*
	输入:一个最多含有n个不重复的正整数(也就是说可能含有少于n个不重复正整数)的文件,
		其中每个数都小于等于n,且n=10^7。 
	输出:得到按从小到大升序排列的包含所有输入的整数的列表。 
	条件:最多有大约1MB的内存空间可用,但磁盘空间足够。
		且要求运行时间在5分钟以下,10秒为最佳结果。
	10^7:大约1千万
	1M:大约1百万
*/
/*
	1.数据划分份数的考虑
	2.每一份有多少块的考虑
	3.每一块有多少数据的考虑

	N份数据并行归并排序
	一块数据一起读入内存

	考虑份数:2^N	8(16)

	每一块大小:1K(1024个数据)

	数据的来源:应该是文件
	1.读入数据,并分配到8个文件中去
		a)数据总个数不超过10^7
	2.对8个文件进行内部排序
		a)内部排序还需要再划分,归并排序
	3.对8个文件进行归并排序

*/

/*
	位图方案:
	牛逼!
	用一个超大字符串将所有的数据都扫进去,然后顺次读取为1的位!
	位图方法的抽象:
		线性记录每一个数出现的次数。
		适用范围:数据量大,且数据范围不太大。
	不用位图用数组索引,是一样的道理,而且可以处理数据重复出现的情况
	针对此问题,将数据线性划分为16块,分16次进行。。。
*/
/*
	因为可能是位图大小的限制,因此用了两趟扫描的方法
	第一趟扫描小于Max_length,第二趟扫描Max_length到
	Num的
*/

// Ten million: the upper bound on the values being sorted.
const unsigned MAX_NUM = 10000000;

// Number of integers stored in each small chunk file (100,000 by default).
const unsigned MAX_LITTLE_FILE_LENGTH = 100000;
// Path prefix of the small chunk files; an index and ".txt" are appended.
const string PRE_FILE_NAME = "c:/test/data";

// Upper bound (exclusive) of a bitmap index: half of MAX_NUM, so the bitmap
// sort covers the value range in two passes.
const unsigned MAX_BIT_SIZE = MAX_NUM / 2;
// Shuffles the sequence 1..MAX_NUM and writes its first `num` entries to
// fileName, separated by spaces. The values come from a permutation, so they
// are intended to be distinct.
void InitNumFile(const char *fileName, unsigned num, unsigned MAX_NUM);

// Writes `num` independently drawn random values in [0, MAX_NUM) to fileName,
// separated by spaces. Values may repeat.
void InitNumFile_2(const char *fileName, unsigned num, unsigned MAX_NUM);

// Sorts the integers stored in fileName with a bitmap and writes them, in
// ascending order, to outFileName.
// Both paths are read-only inputs; the parameter types match the definition
// so that the declaration and the definition name the same function.
void DiskSortUsingBitSet(const char *fileName, const char *outFileName);

/*
	Sorting based on arrays and linked lists
	Internal and external sorting
*/

/*
	K-way merge sort
*/
// Sorts the integers in fileName into outFileName: the input is split into
// chunk files, each chunk is sorted in memory, and the chunks are merged.
void DiskSortUsingKMerge(const char *fileName, const char *outFileName);
// Splits fileName into chunk files and returns the number of chunk files
// through fileNum.
void DivideFile(const char *fileName, unsigned &fileNum);

// Sorts the contents of a file in memory and writes the sorted result back
// into the same file.
void SortFile(const char *fileName);

// Comparison function passed to qsort.
int CompareInt(const void *left, const void * right);

// Merges the fileNum streams in everyFile into the file named fileName.
void KMergeSort(const char *fileName, fstream *everyFile, unsigned fileNum);
//void KMergeSort(const char *fileName, FILE **everyFile, unsigned fileNum);

// Builds a file name from `file` and `i` with a ".txt" suffix,
// e.g. "file" and 1 give "file1.txt".
string MakeFileName(string file, unsigned i);

// Finds the smallest value among the array entries flagged alive and returns
// its position through index.
void FindMin(int *arr, bool *isAlive, unsigned size, unsigned &index);

// NOTE: the original include targets were lost when this listing was
// extracted; the list below is reconstructed from the names the code uses.
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <numeric>
#include <queue>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "10_DiskSort.h"

const int TEST_NUM = 10;

// Driver: writes one million random numbers (upper bound MAX_NUM) to a data
// file, then sorts that file into an output file with the k-way merge sort.
void TestDiskSort()
{
	const unsigned sampleCount = 1000000;
	char inputPath[] = "c:/test/data.txt";
	char outputPath[] = "c:/test/order.txt";

	// Step 1: generate the unsorted input.
	InitNumFile(inputPath, sampleCount, MAX_NUM);

	// Step 2: external sort. (DiskSortUsingBitSet is the alternative
	// implementation that can be called here instead.)
	DiskSortUsingKMerge(inputPath, outputPath);
}

// Writes `num` distinct random integers from the range 1..MAX_NUM to
// fileName, separated by single spaces.
//
// fileName: path of the file to create or overwrite.
// num:      how many values to write; clamped to MAX_NUM because there are
//           only MAX_NUM distinct values available.
// MAX_NUM:  largest value that may be written (this parameter shadows the
//           file-level constant of the same name).
//
// The values are the first `num` entries of a uniformly random permutation,
// produced with a partial Fisher-Yates shuffle so only `num` swaps are needed.
void InitNumFile(const char *fileName, unsigned num, unsigned MAX_NUM)
{
	std::ofstream outFile(fileName);
	assert(outFile);

	// values[i] starts out as i + 1, i.e. the sequence 1..MAX_NUM.
	std::vector<unsigned> values(MAX_NUM);
	std::iota(values.begin(), values.end(), 1u);

	const unsigned count = std::min(num, MAX_NUM);
	std::mt19937 engine(std::random_device{}());
	for (unsigned i = 0; i < count; ++i)
	{
		// Pick the i-th output uniformly from the not-yet-chosen tail.
		std::uniform_int_distribution<unsigned> pick(i, MAX_NUM - 1);
		std::swap(values[i], values[pick(engine)]);
		outFile << values[i] << " ";
	}
}

// Writes `num` random integers to fileName, each followed by a space.
// Every value is rand() scaled into the range [0, MAX_NUM); values are drawn
// independently, so the same number can appear more than once.
void InitNumFile_2(const char *fileName, unsigned num, unsigned MAX_NUM)
{
	std::ofstream outFile(fileName);
	assert(outFile);
	srand(time(NULL));

	unsigned written = 0;
	while (written < num)
	{
		// Scale rand() from [0, RAND_MAX] to [0, MAX_NUM] ...
		const double fraction = static_cast<double>(rand()) / RAND_MAX;
		int randNum = fraction * MAX_NUM;
		// ... and fold the single value MAX_NUM back into the range.
		randNum = randNum % MAX_NUM;

		outFile << randNum << " ";
		++written;
	}
	outFile.close();
}

// 
void DiskSortUsingBitSet(const char *fileName, const char *outFileName)
{
	clock_t start = clock();
	// 1.构造bitset
	bitset bits(0);
	fstream infile(fileName);
	ofstream outfile(outFileName);
	assert(infile && outfile);
	int num;
	while (infile >> num)
	{
		if (num < MAX_BIT_SIZE)
		{
			bits[num] = 1;
		}
	}
	for (unsigned i = 0; i < MAX_BIT_SIZE; ++i)
	{
		if (bits[i] == 1)
		{
			outfile << i << " ";
		}
	}
	infile.close();
	infile.open(fileName);
	//infile.seekp(0, ios::beg);
	//infile.seekg(0, ios::beg);
	bits.reset();
	while (infile >> num)
	{
		if (num >= MAX_BIT_SIZE && num < 2 * MAX_BIT_SIZE)
		{
			bits[num - MAX_BIT_SIZE] = 1;
		}
	}
	for (unsigned i = 0; i < MAX_BIT_SIZE; ++i)
	{
		if (bits[i] == 1)
		{
			outfile << (i + MAX_BIT_SIZE) << " ";
		}
	}
		
	infile.close();
	outfile.close();

	clock_t end = clock();
	int second = (end - start) / CLOCKS_PER_SEC;
	cout << "一共用时: " << second << " s" << endl;
}

/*
	1.读取数据并分割成长度为M的小文件
	2.对大小为M的文件进行内存排序
	3.对大小为M的文件(最后一个可能小于M)进行多路归并排序

	每个小文件的命名规则:data1、data2、data3...
*/
void DiskSortUsingKMerge(const char *fileName, const char *outFileName)
{
	// 1.分割文件
	unsigned fileNum;
	DivideFile(fileName, fileNum);
	// 2.对每个文件进行排序
	for (unsigned i = 1; i <= fileNum; ++i)
	{
		string curFileName = MakeFileName(PRE_FILE_NAME, i);
		SortFile(curFileName.c_str());
	}
	// 用于保存每个文件的读取指针
	// ifstream file[]
	//fstream **everyFile = new fstream* [fileNum];

	//fstream *everyFile = new fstream [fileNum];
	//FILE **farray = new FILE*[fileNum];
	fstream *farray = new fstream [fileNum];

	int tmp = 0;
	
	// vector everyFile;
	for (unsigned i = 1; i <= fileNum; ++i)
	{
		string curFileName = MakeFileName(PRE_FILE_NAME, i);

		// fstream 直接赋值不行!!!
		farray[i].open(curFileName, ios::in | ios::out);

		//fstream file(curFileName);
		//FILE *file = fopen(curFileName.c_str(), "rt");
		//fstream file(curFileName);
		//assert(file);
		//farray[i - 1] = file;
		//everyFile[i - 1] = &file;
		//everyFile[i - 1] = file;

		//(*(everyFile[0])) >> tmp;
		//(*(everyFile[i - 1])) >> tmp;
		//*everyFile[0] >> tmp;
		//*everyFile[1] >> tmp;
		//*everyFile[i] >> tmp;
	}
// test
	
	

	KMergeSort(outFileName, farray, fileNum);

	//for (unsigned i = 1; i <= fileNum; ++i)
	//{
	//	if (everyFile[i - 1]->is_open())
	//	{
	//		everyFile[i - 1]->close();
	//	}
	//}
	
	delete []farray;

}

// Builds a chunk file name: `file`, followed by the decimal form of `i`,
// followed by ".txt". For example ("data", 3) gives "data3.txt".
// Uses std::to_string so the full unsigned range is formatted correctly and
// no compiler-specific function is needed.
std::string MakeFileName(std::string file, unsigned i)
{
	file += std::to_string(i);
	file += ".txt";
	return file;
}

// Merges several ascending integer streams into one ascending output file.
//
// fileName:  output file; it is created if missing and truncated otherwise.
// everyFile: array of fileNum streams already opened for reading, each
//            holding whitespace-separated integers in ascending order.
// fileNum:   number of streams in everyFile (may be 0).
//
// A stream that yields no number at all (empty or failed to open) simply
// contributes nothing. Each number is written followed by a single space.
void KMergeSort(const char *fileName, std::fstream *everyFile, unsigned fileNum)
{
	std::ofstream outFile(fileName);
	assert(outFile);

	// Min-heap of (current head value, stream index). Equal values are
	// ordered by stream index, so the lower-numbered stream is drained first.
	typedef std::pair<int, unsigned> Head;
	std::priority_queue<Head, std::vector<Head>, std::greater<Head> > heads;

	// Seed the heap with the first number of every non-empty stream.
	for (unsigned i = 0; i < fileNum; ++i)
	{
		int value;
		if (everyFile[i] >> value)
		{
			heads.push(Head(value, i));
		}
	}

	while (!heads.empty())
	{
		const Head smallest = heads.top();
		heads.pop();
		outFile << smallest.first << " ";

		// Refill from the stream that just supplied the minimum; once it is
		// exhausted it drops out of the merge.
		int next;
		if (everyFile[smallest.second] >> next)
		{
			heads.push(Head(next, smallest.second));
		}
	}
}

// Locates the smallest value among the entries of `arr` whose `isAlive` flag
// is set and stores its position in `index`.
//
// arr:     values to search, `size` entries.
// isAlive: per-entry flag; entries with a false flag are skipped.
// size:    number of entries in both arrays.
// index:   receives the position of the minimum; when several live entries
//          share the minimum the first one wins. Left untouched when no
//          entry is alive.
void FindMin(int *arr, bool *isAlive, unsigned size, unsigned &index)
{
	bool haveCandidate = false;
	int smallest = 0;

	for (unsigned pos = 0; pos < size; ++pos)
	{
		if (!isAlive[pos])
		{
			continue;
		}
		// The first live entry is always taken; later ones only when
		// strictly smaller.
		if (!haveCandidate || arr[pos] < smallest)
		{
			smallest = arr[pos];
			index = pos;
			haveCandidate = true;
		}
	}
}

// Splits the integers in fileName into consecutive chunk files of at most
// MAX_LITTLE_FILE_LENGTH numbers each.
//
// fileName: input file of whitespace-separated integers.
// fileNum:  receives the number of chunk files written (0 for an empty
//           input).
//
// Naming rule of the chunk files: data1, data2, data3... (built with
// MakeFileName from PRE_FILE_NAME). A chunk file is opened only when there is
// a number to put in it, so no empty trailing file is ever created.
void DivideFile(const char *fileName, unsigned &fileNum)
{
	std::ifstream infile(fileName);
	assert(infile);

	// Number of chunk files started so far.
	unsigned chunkCount = 0;
	// How many numbers the current chunk already holds.
	unsigned inChunk = 0;
	std::ofstream outfile;

	int num;
	while (infile >> num)
	{
		if (inChunk == 0)
		{
			// Start the next chunk lazily, on its first number.
			++chunkCount;
			outfile.open(MakeFileName(PRE_FILE_NAME, chunkCount).c_str());
			assert(outfile);
		}

		outfile << num << " ";

		// Once M numbers are written, close the chunk; the next number
		// opens a new file.
		if (++inChunk == MAX_LITTLE_FILE_LENGTH)
		{
			outfile.close();
			inChunk = 0;
		}
	}

	fileNum = chunkCount;
}

// Sorts the integers stored in a file in ascending order, in place.
//
// fileName: file of whitespace-separated integers; it is read completely
//           into memory, sorted, and then overwritten with the sorted
//           numbers, each followed by a single space.
//
// Memory use is proportional to the number of integers in the file; the
// caller is expected to keep the file small enough (DivideFile limits chunks
// to MAX_LITTLE_FILE_LENGTH numbers).
void SortFile(const char *fileName)
{
	std::vector<int> numbers;
	{
		std::ifstream in(fileName);
		assert(in);
		int value;
		while (in >> value)
		{
			numbers.push_back(value);
		}
	}	// the input stream is closed here, before the file is rewritten

	std::sort(numbers.begin(), numbers.end());

	// Opening for output truncates the file, so no stale bytes remain.
	std::ofstream out(fileName);
	assert(out);
	for (int value : numbers)
	{
		out << value << " ";
	}
}

// qsort-style comparator for int elements.
//
// left, right: pointers to the two ints being compared.
// Returns a negative value when *left < *right, zero when they are equal and
// a positive value when *left > *right.
//
// The result is built from two comparisons rather than a subtraction, because
// the difference of two ints can overflow (e.g. INT_MIN - 1).
int CompareInt(const void *left, const void * right)
{
	const int lhs = *static_cast<const int *>(left);
	const int rhs = *static_cast<const int *>(right);
	return (lhs > rhs) - (lhs < rhs);
}


你可能感兴趣的:(算法)