外部排序 多相合并

《数据结构与算法分析——C语言描述》  第七章


初始化顺序串很有意思。数字以文本模式存放在文件中,每个数字的大小不同,字符长度也就不一样,想要提前知道一个文件里有多少个数字,只能遍历一遍文件来计数。硬盘读写慢得要命,就算是 SSD 读写也才 500MB/s,而内存(不包括 cache 命中)有 20000MB/s。但多相合并要根据顺序串的数量按照斐波那契数列把它们分配到各个文件中,所以一定要读一个数存一个数。


version1

这个版本有个问题:文件结尾是空格,就算用 feof 也不能知道已经到了文件结尾,造成多迭代一次,添加了不必要的哑串。改成另一种模式:以是否成功读入数字作为循环的入口条件,换成单循环。

// Draft 1 of the run-initialisation step: reads integers from inputFileName
// in chunks of up to M values, sorts each chunk and writes it as one run to
// the tape files file[0..K-1], filling each file up to the count given by the
// current row of the K-th order Fibonacci table. Once the input is exhausted
// the remaining slots of the current row are filled with all-zero dummy runs.
// NOTE(review): the surrounding article says this draft produces one
// unnecessary extra round of dummy runs when the input ends in whitespace.
void initRun(char *inputFileName) {
	int max_memory[M];// buffer holding one run while it is being built

	// Build the initial sorted runs.


	FILE *ori = fopen(inputFileName, "r");

	char name[20];
	for (int i = 0; i < K; i++)// open every tape file for writing
		file[i] = fopen(fileName(name, i), "w");

	int dummyNumCnt = 0;// running count of padding values (written here, never read in this function)
	int writeNum = 0;// 0 means T1, 1 means T2, ...
	int fibonacci[K];// current row of the K-th order Fibonacci table
	initFibonacci(fibonacci);// seed the table
	// NOTE(review): this zeroes runLen, but the loop below tests runNum —
	// confirm which array was meant to be reset here.
	memset(runLen, 0, sizeof(runLen));
	int end = 0;// becomes 1 once a chunk could not be filled completely
	while (!end) {
		for (writeNum = 0; writeNum < K; writeNum++) {// visit the files one by one
			while (runNum[writeNum] < fibonacci[K - 1 - writeNum]) {// fill this file up to its Fibonacci target
				if (end == 0) {
					int readNum = 0;
					// Stops only on EOF; a token that is not a number makes
					// fscanf return 0, which this condition treats as success.
					while (readNum < M && fscanf(ori, "%d", &max_memory[readNum]) != EOF) {
						readNum++;
					}
					if (readNum < M) {
						end = 1;
						dummyNumCnt += (M - readNum);
						for (int i = readNum; i < M; i++)
							max_memory[i] = 0;
					}
					quickSort_my(max_memory, M);
					// Append the run to the current tape file.
					// NOTE(review): all M slots are sorted but only the first
					// readNum are written, so on a short final chunk the zero
					// padding may displace real values — confirm.
					write(max_memory, readNum, file[writeNum]);
				}
				else {
					// Input exhausted: pad with a dummy run of M zeros.
					memset(max_memory, 0, sizeof(max_memory));
					write(max_memory, M, file[writeNum]);
					dummyNumCnt += M;
				}
				runNum[writeNum]++;
			}
		}
		updateFibonacci(fibonacci);
	}
	fclose(ori);
	for (int i = 0; i < K; i++)
		fclose(file[i]);
}



version2

实在想不到怎样用K路合并,想到的是一个文件对另外K-1个文件进行2路合并,宏定义的K为3的时候没有问题。K改成4、其他的就出问题了。代码写得一坨屎。。。这个断断续续写了我六七天了。深感智商之捉急。

想了一下,K大于等于4出问题的原因是,仅靠最长文件序号、写的序号、读的序号是不能判断一次循环中哪些文件是处理过的,得另外用数据结构来表示。


#include <stdio.h>
#include <stdlib.h>
#include<string.h>
#include<queue>
#include"fatal.h"
#define M 3
#define K 3


typedef int ElementType;

// Sorts the first n ints of a into ascending order, in place, by insertion sort.
void insertionSort(int *a, int n) {
	for (int next = 1; next < n; ++next) {
		const int key = a[next];
		int hole = next;
		// Slide larger elements one slot to the right to open a hole for key.
		while (hole > 0 && key < a[hole - 1]) {
			a[hole] = a[hole - 1];
			--hole;
		}
		a[hole] = key;
	}
}

// Exchanges the two values that a and b point to.
void swap_my(ElementType *a, ElementType *b) {
	const ElementType held = *b;
	*b = *a;
	*a = held;
}

// Median-of-three pivot selection for qsort_my: orders a[left], a[center] and
// a[right], then parks the median at a[right - 1] and returns it.
ElementType median3(ElementType a[], int left, int right) {
	const int mid = (left + right) / 2;
	ElementType *lo = &a[left];
	ElementType *md = &a[mid];
	ElementType *hi = &a[right];

	if (*lo > *md)
		swap_my(lo, md);
	if (*lo > *hi)
		swap_my(lo, hi);
	if (*md > *hi)
		swap_my(md, hi);

	// Hide the pivot just before the right end, which already holds a value >= pivot.
	ElementType *pivotSlot = &a[right - 1];
	swap_my(md, pivotSlot);
	return *pivotSlot;
}




#define CUTOFF (3)  

// Recursive quicksort of a[left..right] (both ends inclusive). Ranges shorter
// than CUTOFF + 1 elements are handed to insertionSort instead.
void qsort_my(ElementType a[], int left, int right) {
	if (left + CUTOFF > right) {
		insertionSort(a + left, right - left + 1);
		return;
	}

	const ElementType pivot = median3(a, left, right);
	int lo = left;
	int hi = right - 1;
	for (;;) {
		// Advance from both ends; median3 left sentinels at a[left] and a[right - 1].
		do { ++lo; } while (a[lo] < pivot);
		do { --hi; } while (a[hi] > pivot);
		if (lo >= hi)
			break;
		swap_my(&a[lo], &a[hi]);
	}
	// Move the pivot from its parking slot into its final position.
	swap_my(&a[lo], &a[right - 1]);
	qsort_my(a, left, lo - 1);
	qsort_my(a, lo + 1, right);
}

// Entry point: sorts the first n elements of a in ascending order.
void quickSort_my(ElementType a[], int n) {
	const int first = 0;
	const int last = n - 1;
	qsort_my(a, first, last);
}

FILE* file[K + 1];// array of FILE pointers, one slot per tape file
int runLen[K + 1];// length of a run in each file
int runNum[K + 1];// number of runs in each file
char name[200];// buffer for generated file names

// Appends the first n values of a to out as decimal text, each followed by one space.
void write(int *a, int n, FILE *out) {
	int written = 0;
	while (written < n) {
		fprintf(out, "%d ", a[written]);
		++written;
	}
}


// Builds the name of tape file number i (0-based) into buf as "T<i + 1>",
// e.g. i == 0 gives "T1", and returns buf.
// buf must have room for 'T', the decimal digits of i + 1 and the terminating NUL.
// Uses sprintf instead of the MSVC-only _itoa so the code builds on any
// toolchain and no longer depends on a 5-byte scratch buffer.
char* fileName(char *buf, int i) {
	sprintf(buf, "T%d", i + 1);
	return buf;
}

typedef std::pair<int, int> Pair_int;
// Lambda comparator: returns true when left.first is greater than right.first.
// NOTE(review): nothing in the visible part of this version refers to cmp.
auto cmp = [](const Pair_int& left, const Pair_int& right) { return (left.first) > (right.first); };


// Returns a pseudo-random integer in [i, j] derived from rand(); j itself is
// produced only when rand() returns RAND_MAX.
int RandInt(int i, int j) {
	const double unit = 1.0*rand() / RAND_MAX;
	return (int)(i + unit*(j - i));
}

// Fills A[0..n-1] with 1..n and then shuffles it: each position from 1 upward
// is exchanged with a position chosen by RandInt(0, position).
void getRandomInt(int *A, int n) {
	int fill = 0;
	while (fill < n) {
		A[fill] = fill + 1;
		++fill;
	}
	for (int pos = 1; pos < n; ++pos) {
		const int pick = RandInt(0, pos);
		const int moved = A[pick];
		A[pick] = A[pos];
		A[pos] = moved;
	}
}

#define N 100
// Generates a shuffled sequence of the integers 1..N and writes it to the
// file "ta1" as space-separated decimal text, replacing any previous content.
// Terminates the program with a diagnostic if the file cannot be opened,
// instead of passing a null FILE* to fprintf.
void writeRandIntToFile() {
	int a[N];
	getRandomInt(a, N);
	FILE *fp = fopen("ta1", "w");
	if (fp == NULL) {
		perror("ta1");
		exit(EXIT_FAILURE);
	}
	for (int value : a)
		fprintf(fp, "%d ", value);
	fclose(fp);
}


// Seeds arr with the starting row of the K-th order Fibonacci table:
// K - 2 zeros followed by two ones.
void initFibonacci(int *arr) {
	int idx = 0;
	while (idx < K - 2)
		arr[idx++] = 0;
	arr[idx] = 1;      // slot K - 2
	arr[idx + 1] = 1;  // slot K - 1
}

// Advances arr to the next row of the K-th order Fibonacci table: every entry
// shifts one slot towards index 0, and the last entry grows by the sum of the
// first K - 1 entries of the previous row.
void updateFibonacci(int *arr) {
	int carried = 0;
	for (int idx = 0; idx < K - 1; ++idx)
		carried += arr[idx];
	for (int idx = 0; idx < K - 1; ++idx)
		arr[idx] = arr[idx + 1];
	arr[K - 1] += carried;
}


// Sorts the M values in max_memory and appends them as one run to
// file[writeNum], provided that file is still below its target
// fibonacci[K - 1 - writeNum].
//   readNum   - reset to 0 so the caller starts filling max_memory again
//   writeNum  - index of the file being filled; moved on to the next file once
//               the current one reaches its target
//   fibonacci - current row of the Fibonacci table; advanced when all K files
//               have been filled
void handleRun(int &readNum, int &writeNum, int *fibonacci, int *max_memory) {
	readNum = 0;
	quickSort_my(max_memory, M);
	if (runNum[writeNum] < fibonacci[K - 1 - writeNum]) {
		write(max_memory, M, file[writeNum]);
		runNum[writeNum]++;
		if (runNum[writeNum] == fibonacci[K - 1 - writeNum]) {// this file has reached its target for the current row
			writeNum++;
			if (writeNum == K) {
				// Every file is full: move to the next row and start over at file 0.
				updateFibonacci(fibonacci);
				writeNum = 0;
			}
		}
	}
	else {// this file accepts no more runs in the current row; move to the next row and retry
		writeNum = 0;
		updateFibonacci(fibonacci);
		handleRun(readNum, writeNum, fibonacci, max_memory);
	}


}

// Splits the integers stored as text in inputFileName into sorted runs of M
// values and distributes them over the tape files file[0..K-1] (named T1..TK)
// following the K-th order Fibonacci table. A final partial run is padded with
// zeros, and all-zero dummy runs are appended until every file holds exactly
// the number of runs required by the current row.
// On return runNum[] holds the run count of each file, runLen[0..K-1] is M,
// and every file opened here has been closed.
// Terminates the program with a diagnostic if a file cannot be opened.
void initRun(char *inputFileName) {
	int max_memory[M];// simulated memory: holds one run while it is being built

	FILE *ori = fopen(inputFileName, "r");
	if (ori == NULL) {
		perror(inputFileName);
		exit(EXIT_FAILURE);
	}

	char name[20];
	for (int i = 0; i < K; i++) {// open every tape file for writing
		file[i] = fopen(fileName(name, i), "w");
		if (file[i] == NULL) {
			perror(name);
			exit(EXIT_FAILURE);
		}
	}

	int writeNum = 0;// 0 means T1, 1 means T2, ...
	int fibonacci[K];// current row of the K-th order Fibonacci table
	initFibonacci(fibonacci);

	for (int i = 0; i < K; i++)// every initial run is M values long
		runLen[i] = M;

	int readNum = 0;// next free slot in max_memory

	// Compare against 1 rather than EOF: fscanf returns 0 on a token that is
	// not a number without consuming it, which would otherwise loop forever.
	while (fscanf(ori, "%d", &max_memory[readNum]) == 1) {
		readNum++;
		if (readNum < M)// memory not full yet
			continue;

		handleRun(readNum, writeNum, fibonacci, max_memory);
	}
	if (readNum != 0) {// the last run is incomplete: pad it with zeros
		while (readNum < M)
			max_memory[readNum++] = 0;
		handleRun(readNum, writeNum, fibonacci, max_memory);
	}
	memset(max_memory, 0, M*sizeof(int));// a dummy run is M zeros
	while (writeNum < K) {
		if (runNum[writeNum] < fibonacci[K - 1 - writeNum]) {// still below this file's target
			write(max_memory, M, file[writeNum]);
			runNum[writeNum]++;
		}
		else {
			writeNum++;
		}
	}
	fclose(ori);
	for (int i = 0; i < K; i++)
		fclose(file[i]);
}

// Returns 1 when at most one of the K + 1 files still holds a run, 0 otherwise.
int isFinish() {
	int nonEmpty = 0;
	for (int tape = 0; tape <= K; ++tape)
		nonEmpty += (runNum[tape] >= 1) ? 1 : 0;
	return nonEmpty < 2 ? 1 : 0;
}

// Merges one run from file[longest] with one run from file[read] and appends
// the merged run to file[write]. runLen[longest] and runLen[read] give the
// number of values in each input run. The function opens no files itself, so
// all three must already be open.
// NOTE(review): fscanf return values are not checked, so a short or malformed
// file leaves a or b holding a stale value — confirm inputs are well-formed.
void mergeRun(int longest, int read, int write) {
	int i, j;// values written so far from the longest / read run
	int a, b;// look-ahead value from each input file
	int hasNum1 = 0, hasNum2 = 0;// 1 while a / b holds a value that was read but not yet written
	for (i = 0, j = 0; i < runLen[longest] && j < runLen[read];) {
		if (hasNum1 == 0) {
			fscanf(file[longest], "%d", &a);
			hasNum1 = 1;
		}
		if (hasNum2 == 0) {
			fscanf(file[read], "%d", &b);
			hasNum2 = 1;
		}
		// Emit the smaller look-ahead value; on a tie b is written first.
		if (a < b) {
			/*if (a < 0)
				Error("error1");*/
			fprintf(file[write], "%d ", a);
			hasNum1 = 0;
			i++;
		}
		else {
			/*if (b < 0)
				Error("error2");*/
			fprintf(file[write], "%d ", b);
			hasNum2 = 0;
			j++;
		}
	}
	// Drain what is left of the longest run; a pending look-ahead value is
	// written before anything new is read.
	while (i < runLen[longest]) {
		if (hasNum1 == 0)
			fscanf(file[longest], "%d", &a);
		fprintf(file[write], "%d ", a);
		hasNum1 = 0;
		i++;
	}
	// Drain what is left of the other run in the same way.
	while (j < runLen[read]) {
		if (hasNum2 == 0)
			fscanf(file[read], "%d", &b);
		fprintf(file[write], "%d ", b);
		hasNum2 = 0;
		j++;
	}
}


// Returns the first file index greater than now that still has runs and is
// neither longestNum nor writeNum, or -1 when there is none.
int nextReadNum(int longestNum, int writeNum, int now) {
	int candidate = now + 1;
	while (candidate <= K) {
		const bool reserved = (candidate == longestNum) || (candidate == writeNum);
		if (!reserved && runNum[candidate] > 0)
			return candidate;
		++candidate;
	}
	return -1;
}

// Returns the index of the file with the most runs, ignoring longestNum and
// writeNum; the lowest index wins a tie. Returns -1 when no other file has runs.
int firstReadNum(int longestNum, int writeNum) {
	int bestRuns = 0;
	int bestTape = -1;
	for (int tape = 0; tape <= K; ++tape) {
		if (tape == longestNum || tape == writeNum)
			continue;
		if (runNum[tape] > bestRuns) {
			bestRuns = runNum[tape];
			bestTape = tape;
		}
	}
	return bestTape;
}

// Driver for version 2: writes a shuffled input file "ta1", splits it into
// initial runs with initRun, then repeatedly merges the runs of the current
// "longest" file with the runs of the other files, two files at a time,
// until isFinish() reports that at most one file still holds runs.
// NOTE(review): the surrounding article says this version only works for K == 3.
int main() {


	writeRandIntToFile();
	char inputFileName[20] = "ta1";
	//scanf("%s", inputFileName);
	initRun(inputFileName);

	int oldLongestNum=-1;// longest file of the previous pass (-1: none yet)
	int longestNum = 0;// file merged against every other file in this pass
	int nextLongestNum;// file that becomes the longest in the next pass
	int writeNum = K;// output file; slot K is the one initRun left unused
	int readNum = 1;// file currently being merged with the longest one


	int testCnt = 0;// counts mergeRun calls (never read)
	int oldLongestOpenTag = 0;// 1 if the previous longest file was left open
	while (!isFinish()) {
		int cnt = 0;// number of files merged in this pass
		file[longestNum] = fopen(fileName(name, longestNum), "r");
		nextLongestNum = writeNum;




		while (cnt < K - 1 && !isFinish()) {
			cnt++;
			file[writeNum] = fopen(fileName(name, writeNum), "w");
			// The previous longest file may still be open part-way through;
			// in that case it is not reopened.
			if (readNum != oldLongestNum || (readNum == oldLongestNum && oldLongestOpenTag == 0))
				file[readNum] = fopen(fileName(name, readNum), "r");
			//runNum[writeNum] = 0;
			while (runNum[readNum] > 0) {
				mergeRun(longestNum, readNum, writeNum);
				runNum[longestNum]--;
				runNum[readNum]--;
				runNum[writeNum]++;

				testCnt++;


			}
			// A merged run is as long as its two input runs together.
			runLen[writeNum] = runLen[readNum] + runLen[longestNum];



			fclose(file[writeNum]);
			int oldwriteNum = writeNum;
			// The file just emptied becomes the next output file.
			writeNum = readNum;
			fclose(file[readNum]);

			// Pick the next file to merge, wrapping around to index 0 if needed.
			readNum = nextReadNum(longestNum, oldwriteNum, readNum);//???
			if(readNum==-1)
				readNum= nextReadNum(longestNum, oldwriteNum, -1);



		}
		if (runNum[longestNum] == 0) {
			fclose(file[longestNum]);
			oldLongestOpenTag = 0;
		}
		else
			oldLongestOpenTag = 1;
		oldLongestNum = longestNum;
		longestNum = nextLongestNum;
		readNum = firstReadNum(longestNum, writeNum);// file with the most runs
	}



}


version3

算是终结这个了,用了两个堆,一个队列来表示未处理的文件,已处理的文件,空的文件。逻辑很清晰,代码很优美。

#include <stdio.h>
#include <stdlib.h>
#include<string.h>
#include<queue>
#include"fatal.h"
#define M 3//最大的内存
#define K 8//K路排序
#define N 222//要排序的数字量,1—N

typedef int ElementType;


// Sorts the first n ints of a into ascending order, in place, by insertion sort.
void insertionSort(int *a, int n) {
	for (int next = 1; next < n; ++next) {
		const int key = a[next];
		int hole = next;
		// Slide larger elements one slot to the right to open a hole for key.
		while (hole > 0 && key < a[hole - 1]) {
			a[hole] = a[hole - 1];
			--hole;
		}
		a[hole] = key;
	}
}

// Exchanges the two values that a and b point to.
void swap_my(ElementType *a, ElementType *b) {
	const ElementType held = *b;
	*b = *a;
	*a = held;
}

// Median-of-three pivot selection for qsort_my: orders a[left], a[center] and
// a[right], then parks the median at a[right - 1] and returns it.
ElementType median3(ElementType a[], int left, int right) {
	const int mid = (left + right) / 2;
	ElementType *lo = &a[left];
	ElementType *md = &a[mid];
	ElementType *hi = &a[right];

	if (*lo > *md)
		swap_my(lo, md);
	if (*lo > *hi)
		swap_my(lo, hi);
	if (*md > *hi)
		swap_my(md, hi);

	// Hide the pivot just before the right end, which already holds a value >= pivot.
	ElementType *pivotSlot = &a[right - 1];
	swap_my(md, pivotSlot);
	return *pivotSlot;
}




#define CUTOFF (3)  

// Recursive quicksort of a[left..right] (both ends inclusive). Ranges shorter
// than CUTOFF + 1 elements are handed to insertionSort instead.
void qsort_my(ElementType a[], int left, int right) {
	if (left + CUTOFF > right) {
		insertionSort(a + left, right - left + 1);
		return;
	}

	const ElementType pivot = median3(a, left, right);
	int lo = left;
	int hi = right - 1;
	for (;;) {
		// Advance from both ends; median3 left sentinels at a[left] and a[right - 1].
		do { ++lo; } while (a[lo] < pivot);
		do { --hi; } while (a[hi] > pivot);
		if (lo >= hi)
			break;
		swap_my(&a[lo], &a[hi]);
	}
	// Move the pivot from its parking slot into its final position.
	swap_my(&a[lo], &a[right - 1]);
	qsort_my(a, left, lo - 1);
	qsort_my(a, lo + 1, right);
}

// Entry point: sorts the first n elements of a in ascending order.
void quickSort_my(ElementType a[], int n) {
	const int first = 0;
	const int last = n - 1;
	qsort_my(a, first, last);
}

FILE* file[K + 1];// array of FILE pointers, one slot per tape file
int runLen[K + 1];// length of a run in each file
int runNum[K + 1];// number of runs in each file
char name[200];// buffer for generated file names


typedef std::pair<int, int> Pair_int;// first is the file index, second is its runNum
// Lambda comparator on the second member (the run count); used with
// std::priority_queue it keeps the file with the most runs on top.
auto cmp = [](const Pair_int& left, const Pair_int& right) { return (left.second) < (right.second); };
std::queue<int> nullFile;// queue of files that currently hold no runs
// Two heaps of files; main() uses one for files not yet handled in the current
// pass and the other for files already handled, swapping them between passes.
std::priority_queue<Pair_int, std::vector<Pair_int>, decltype(cmp)>fileHeap1(cmp);// (author's note: unsure how to make an array of these)
std::priority_queue<Pair_int, std::vector<Pair_int>, decltype(cmp)>fileHeap2(cmp);

// Appends the first n values of a to out as decimal text, each followed by one space.
void write(int *a, int n, FILE *out) {
	int written = 0;
	while (written < n) {
		fprintf(out, "%d ", a[written]);
		++written;
	}
}


// Builds the name of tape file number i (0-based) into buf as "T<i + 1>",
// e.g. i == 0 gives "T1", and returns buf.
// buf must have room for 'T', the decimal digits of i + 1 and the terminating NUL.
// Uses sprintf instead of the MSVC-only _itoa so the code builds on any
// toolchain and no longer depends on a 5-byte scratch buffer.
char* fileName(char *buf, int i) {
	sprintf(buf, "T%d", i + 1);
	return buf;
}




// Returns a pseudo-random integer in [i, j] derived from rand(); j itself is
// produced only when rand() returns RAND_MAX.
int RandInt(int i, int j) {
	const double unit = 1.0*rand() / RAND_MAX;
	return (int)(i + unit*(j - i));
}

// Fills A[0..n-1] with 1..n and then shuffles it: each position from 1 upward
// is exchanged with a position chosen by RandInt(0, position).
void getRandomInt(int *A, int n) {
	int fill = 0;
	while (fill < n) {
		A[fill] = fill + 1;
		++fill;
	}
	for (int pos = 1; pos < n; ++pos) {
		const int pick = RandInt(0, pos);
		const int moved = A[pick];
		A[pick] = A[pos];
		A[pos] = moved;
	}
}


// Generates a shuffled sequence of the integers 1..N and writes it to the
// file "ta1" as space-separated decimal text, replacing any previous content.
// Terminates the program with a diagnostic if the file cannot be opened,
// instead of passing a null FILE* to fprintf.
void writeRandIntToFile() {
	int a[N];
	getRandomInt(a, N);
	FILE *fp = fopen("ta1", "w");
	if (fp == NULL) {
		perror("ta1");
		exit(EXIT_FAILURE);
	}
	for (int value : a)
		fprintf(fp, "%d ", value);
	fclose(fp);
}


// Seeds arr with the starting row of the K-th order Fibonacci table:
// K - 2 zeros followed by two ones.
void initFibonacci(int *arr) {
	int idx = 0;
	while (idx < K - 2)
		arr[idx++] = 0;
	arr[idx] = 1;      // slot K - 2
	arr[idx + 1] = 1;  // slot K - 1
}

// Advances arr to the next row of the K-th order Fibonacci table: every entry
// shifts one slot towards index 0, and the last entry grows by the sum of the
// first K - 1 entries of the previous row.
void updateFibonacci(int *arr) {
	int carried = 0;
	for (int idx = 0; idx < K - 1; ++idx)
		carried += arr[idx];
	for (int idx = 0; idx < K - 1; ++idx)
		arr[idx] = arr[idx + 1];
	arr[K - 1] += carried;
}


// Sorts the M values in max_memory and appends them as one run to
// file[writeNum], provided that file is still below its target
// fibonacci[K - 1 - writeNum].
//   readNum   - reset to 0 so the caller starts filling max_memory again
//   writeNum  - index of the file being filled; moved on to the next file once
//               the current one reaches its target
//   fibonacci - current row of the Fibonacci table; advanced when all K files
//               have been filled
void handleRun(int &readNum, int &writeNum, int *fibonacci, int *max_memory) {
	readNum = 0;
	quickSort_my(max_memory, M);
	if (runNum[writeNum] < fibonacci[K - 1 - writeNum]) {
		write(max_memory, M, file[writeNum]);
		runNum[writeNum]++;
		if (runNum[writeNum] == fibonacci[K - 1 - writeNum]) {// this file has reached its target for the current row
			writeNum++;
			if (writeNum == K) {
				// Every file is full: move to the next row and start over at file 0.
				updateFibonacci(fibonacci);
				writeNum = 0;
			}
		}
	}
	else {// this file accepts no more runs in the current row; move to the next row and retry
		writeNum = 0;
		updateFibonacci(fibonacci);
		handleRun(readNum, writeNum, fibonacci, max_memory);
	}


}

// Splits the integers stored as text in inputFileName into sorted runs of M
// values and distributes them over the tape files file[0..K-1] (named T1..TK)
// following the K-th order Fibonacci table. A final partial run is padded with
// zeros, and all-zero dummy runs are appended until every file holds exactly
// the number of runs required by the current row.
// On return runNum[] holds the run count of each file, runLen[0..K-1] is M,
// every file opened here has been closed, each non-empty file has been pushed
// onto fileHeap1, and each empty file plus the spare slot K is in nullFile.
// Terminates the program with a diagnostic if a file cannot be opened.
void initRun(char *inputFileName) {
	int max_memory[M];// simulated memory: holds one run while it is being built

	FILE *ori = fopen(inputFileName, "r");
	if (ori == NULL) {
		perror(inputFileName);
		exit(EXIT_FAILURE);
	}

	char name[20];
	for (int i = 0; i < K; i++) {// open every tape file for writing
		file[i] = fopen(fileName(name, i), "w");
		if (file[i] == NULL) {
			perror(name);
			exit(EXIT_FAILURE);
		}
	}

	int writeNum = 0;// 0 means T1, 1 means T2, ...
	int fibonacci[K];// current row of the K-th order Fibonacci table
	initFibonacci(fibonacci);

	for (int i = 0; i < K; i++)// every initial run is M values long
		runLen[i] = M;

	int readNum = 0;// next free slot in max_memory

	// Compare against 1 rather than EOF: fscanf returns 0 on a token that is
	// not a number without consuming it, which would otherwise loop forever.
	while (fscanf(ori, "%d", &max_memory[readNum]) == 1) {
		readNum++;
		if (readNum < M)// memory not full yet
			continue;

		handleRun(readNum, writeNum, fibonacci, max_memory);
	}
	if (readNum != 0) {// the last run is incomplete: pad it with zeros
		while (readNum < M)
			max_memory[readNum++] = 0;
		handleRun(readNum, writeNum, fibonacci, max_memory);
	}
	memset(max_memory, 0, M*sizeof(int));// a dummy run is M zeros
	while (writeNum < K) {
		if (runNum[writeNum] < fibonacci[K - 1 - writeNum]) {// still below this file's target
			write(max_memory, M, file[writeNum]);
			runNum[writeNum]++;
		}
		else {
			writeNum++;
		}
	}
	fclose(ori);
	for (int i = 0; i < K; i++) {
		fclose(file[i]);
		if (runNum[i] > 0) {
			fileHeap1.push(std::make_pair(i, runNum[i]));
		}
		else {
			nullFile.push(i);// empty files go to the empty-file queue
		}
	}
	nullFile.push(K);// the (K+1)-th tape starts out empty
}



// Merges one run from file[longest] with one run from file[read] and appends
// the merged run to file[write]. runLen[longest] and runLen[read] give the
// number of values in each input run. The function opens no files itself, so
// all three must already be open.
// NOTE(review): fscanf return values are not checked, so a short or malformed
// file leaves a or b holding a stale value — confirm inputs are well-formed.
void mergeRun(int longest, int read, int write) {
	int i, j;// values written so far from the longest / read run
	int a, b;// look-ahead value from each input file
	int hasNum1 = 0, hasNum2 = 0;// 1 while a / b holds a value that was read but not yet written
	for (i = 0, j = 0; i < runLen[longest] && j < runLen[read];) {
		if (hasNum1 == 0) {
			fscanf(file[longest], "%d", &a);
			hasNum1 = 1;
		}
		if (hasNum2 == 0) {
			fscanf(file[read], "%d", &b);
			hasNum2 = 1;
		}
		// Emit the smaller look-ahead value; on a tie b is written first.
		if (a < b) {
			/*if (a < 0)
				Error("error1");*/
			fprintf(file[write], "%d ", a);
			hasNum1 = 0;
			i++;
		}
		else {
			/*if (b < 0)
				Error("error2");*/
			fprintf(file[write], "%d ", b);
			hasNum2 = 0;
			j++;
		}
	}
	// Drain what is left of the longest run; a pending look-ahead value is
	// written before anything new is read.
	while (i < runLen[longest]) {
		if (hasNum1 == 0)
			fscanf(file[longest], "%d", &a);
		fprintf(file[write], "%d ", a);
		hasNum1 = 0;
		i++;
	}
	// Drain what is left of the other run in the same way.
	while (j < runLen[read]) {
		if (hasNum2 == 0)
			fscanf(file[read], "%d", &b);
		fprintf(file[write], "%d ", b);
		hasNum2 = 0;
		j++;
	}
}




// Driver for version 3: writes a shuffled input file "ta1", splits it into
// initial runs with initRun, then merges pass by pass. In each pass the file
// with the most runs is merged, two files at a time, with every other file
// that still holds runs; each result goes to a file taken from nullFile. Two
// heaps track the files not yet handled in this pass and those already
// handled, and they swap roles after every pass. The loop ends when at most
// one file with runs is left.
int main() {
	writeRandIntToFile();
	char inputFileName[20] = "ta1";// file to sort
	//scanf("%s", inputFileName);
	initRun(inputFileName);// build the initial runs

	auto * notHandle = &fileHeap1;// files not handled yet in the current pass
	auto * hasHandle = &fileHeap2;// files already handled in the current pass
	// Starts at -1 (no previous pass) so that the comparison with readNum
	// below never reads an uninitialised value.
	int oldLongestNum = -1;
	int longestNum;

	int writeNum;
	int readNum;

	int oldLongestOpenTag = 0;// 1 if the previous longest file was left open
	while ((*notHandle).size()>1) {// until everything is merged into one file

		// Open the file with the most runs.
		longestNum = (*notHandle).top().first;
		(*notHandle).pop();
		file[longestNum] = fopen(fileName(name, longestNum), "r");

		while (!(*notHandle).empty()) {// merge it with each remaining file

			writeNum = nullFile.front();// take an empty file as the output
			nullFile.pop();
			file[writeNum] = fopen(fileName(name, writeNum), "w");

			readNum = (*notHandle).top().first;// take the next file to merge
			(*notHandle).pop();

			// The previous pass's longest file may not have been read to the
			// end; in that case it is still open and must not be reopened.
			if (readNum != oldLongestNum || (readNum == oldLongestNum && oldLongestOpenTag == 0))
				file[readNum] = fopen(fileName(name, readNum), "r");

			while (runNum[readNum] > 0) {// merge every run of this file
				mergeRun(longestNum, readNum, writeNum);
				runNum[longestNum]--;
				runNum[readNum]--;
				runNum[writeNum]++;
			}
			// A merged run is as long as its two input runs together.
			runLen[writeNum] = runLen[readNum] + runLen[longestNum];

			// The output is handled again only in the next pass.
			(*hasHandle).push({ writeNum,runNum[writeNum] });
			fclose(file[writeNum]);

			nullFile.push(readNum);// fully read, so it is an empty file now
			fclose(file[readNum]);
		}
		if (runNum[longestNum] == 0) {// was the longest file read to the end?
			nullFile.push(longestNum);// yes: it is empty now
			fclose(file[longestNum]);
			oldLongestOpenTag = 0;
		}
		else {
			oldLongestOpenTag = 1;// no: keep it open for the next pass
			(*hasHandle).push({ longestNum,runNum[longestNum] });
		}

		oldLongestNum = longestNum;// remember this pass's longest file

		// The files handled in this pass are the unhandled ones of the next pass.
		std::swap(notHandle, hasHandle);
	}
}


你可能感兴趣的:(外部排序 多相合并)