#include
#include
#include
using namespace std;
// Consider writing the test cases first...
// Entry point of the manual test driver for the disk-sort routines declared below.
void TestDiskSort();
/*
输入:一个最多含有n个不重复的正整数(也就是说可能含有少于n个不重复正整数)的文件,
其中每个数都小于等于n,且n=10^7。
输出:得到按从小到大升序排列的包含所有输入的整数的列表。
条件:最多有大约1MB的内存空间可用,但磁盘空间足够。
且要求运行时间在5分钟以下,10秒为最佳结果。
10^7:大约1千万
1M:大约1百万
*/
/*
1.数据划分份数的考虑
2.每一份有多少块的考虑
3.每一块有多少数据的考虑
N份数据并行归并排序
一块数据一起读入内存
考虑份数:2^N 8(16)
每一块大小:1K(1024个数据)
数据的来源:应该是文件
1.读入数据,并分配到8个文件中去
a)数据总个数不超过10^7
2.对8个文件进行内部排序
a)内部排序还需要再划分,归并排序
3.对8个文件进行归并排序
*/
/*
位图方案:
牛逼!
用一个超大字符串将所有的数据都扫进去,然后顺次读取为1的位!
位图方法的抽象:
线性记录每一个数出现的次数。
适用范围:数据量大,且数据范围不太大。
不用位图用数组索引,是一样的道理,而且可以处理数据重复出现的情况
针对此问题,将数据线性划分为16块,分16次进行。。。
*/
/*
因为可能是位图大小的限制,因此用了两趟扫描的方法
第一趟扫描小于Max_length,第二趟扫描Max_length到
Num的
*/
// Ten million: the upper bound on the input values.
const unsigned MAX_NUM = 10000000;
// Number of integers stored in each small chunk file (default: 100,000).
const unsigned MAX_LITTLE_FILE_LENGTH = 100000;
// Path prefix of the small chunk files; an index and ".txt" are appended to it.
const string PRE_FILE_NAME = "c:/test/data";
// Number of bits in the bitmap used by the bitmap sort (half of MAX_NUM,
// which is why that sort scans the input in more than one pass).
const unsigned MAX_BIT_SIZE = MAX_NUM / 2;
// Writes num random integers to fileName, separated by spaces.
// The values are produced by shuffling the sequence 1..MAX_NUM and emitting
// the first num entries, so the output is intended to be free of duplicates.
void InitNumFile(const char *fileName, unsigned num, unsigned MAX_NUM);
// Writes num independently drawn random integers in [0, MAX_NUM) to fileName,
// separated by spaces. The same value may appear more than once.
void InitNumFile_2(const char *fileName, unsigned num, unsigned MAX_NUM);
// Sorts the integers in fileName with a bitmap and writes them in ascending
// order to outFileName. Because a bitmap only records presence, repeated
// input values appear once in the output.
// NOTE: outFileName is const-qualified so that this declaration matches the
// definition; the previous `char *` declared a separate, undefined overload.
void DiskSortUsingBitSet(const char *fileName, const char *outFileName);
/*
Sorting based on arrays and linked lists
Internal and external sorting
*/
/*
K-way merge sort
*/
void DiskSortUsingKMerge(const char *fileName, const char *outFileName);
// Splits fileName into small chunk files and reports the number of chunk
// files through fileNum.
void DivideFile(const char *fileName, unsigned &fileNum);
// Sorts the contents of a file in memory and writes the sorted result back
// into the same file.
void SortFile(const char *fileName);
// Comparison function passed to qsort.
int CompareInt(const void *left, const void * right);
// Merges the sorted chunk streams in everyFile (fileNum of them) into the
// output file named fileName.
void KMergeSort(const char *fileName, fstream *everyFile, unsigned fileNum);
//void KMergeSort(const char *fileName, FILE **everyFile, unsigned fileNum);
// Builds a file name from a prefix and an index by appending the index and
// ".txt", e.g. ("file", 1) -> "file1.txt".
string MakeFileName(string file, unsigned i);
// Finds the smallest value among the entries of arr whose isAlive flag is set
// and returns its position through index.
void FindMin(int *arr, bool *isAlive, unsigned size, unsigned &index);
#include
#include
#include
#include
#include
#include
#include
#include
#include "10_DiskSort.h"
const int TEST_NUM = 10;
// Manual smoke test: generates a file of random numbers and sorts it with the
// k-way merge implementation.
void TestDiskSort()
{
    // Locations of the generated input and of the sorted result.
    char sourcePath[] = "c:/test/data.txt";
    char sortedPath[] = "c:/test/order.txt";

    // Generate one million values bounded by MAX_NUM.
    const unsigned valueCount = 1000000;
    InitNumFile(sourcePath, valueCount, MAX_NUM);

    // Sort the generated file on disk.
    DiskSortUsingKMerge(sourcePath, sortedPath);
}
// Writes `num` distinct random integers from the range [1, MAX_NUM] to
// `fileName`, separated by single spaces.
//
// fileName - path of the file to create (truncated if it already exists).
// num      - how many values to write; clamped to MAX_NUM because the range
//            holds no more distinct values than that.
// MAX_NUM  - largest value that may be written (shadows the global constant
//            of the same name on purpose: callers choose the range).
//
// The values come from a partial Fisher-Yates shuffle of 1..MAX_NUM: only the
// first `num` positions are shuffled, which is all that is ever written out.
void InitNumFile(const char *fileName, unsigned num, unsigned MAX_NUM)
{
    std::ofstream outFile(fileName);
    assert(outFile);

    // Cannot emit more distinct values than the range contains.
    if (num > MAX_NUM)
    {
        num = MAX_NUM;
    }
    if (num == 0)
    {
        return;
    }

    srand(static_cast<unsigned>(time(nullptr)));

    // Slot 0 is unused so that arr[v] == v initially for v in 1..MAX_NUM.
    int *arr = new int[MAX_NUM + 1];
    for (unsigned i = 1; i <= MAX_NUM; ++i)
    {
        arr[i] = static_cast<int>(i);
    }

    const unsigned long long randSpan = static_cast<unsigned long long>(RAND_MAX) + 1ULL;
    for (unsigned i = 1; i <= num; ++i)
    {
        // Combine two rand() calls in 64-bit arithmetic: a single call may
        // cover only 15 bits, and the product would overflow an int.
        const unsigned long long r =
            static_cast<unsigned long long>(rand()) * randSpan +
            static_cast<unsigned long long>(rand());
        // Pick a partner in [i, MAX_NUM]; indices stay inside the initialized range.
        const unsigned j = i + static_cast<unsigned>(r % (MAX_NUM - i + 1));
        std::swap(arr[i], arr[j]);
        outFile << arr[i] << " ";
    }

    outFile.close();
    delete[] arr;
}
// Writes `num` random integers to `fileName`, separated by single spaces.
// Each value is drawn independently by scaling rand() into the range and then
// reducing it modulo MAX_NUM, so values lie in [0, MAX_NUM) and may repeat.
void InitNumFile_2(const char *fileName, unsigned num, unsigned MAX_NUM)
{
    std::ofstream outFile(fileName);
    assert(outFile);

    srand(time(NULL));

    for (unsigned remaining = num; remaining != 0; --remaining)
    {
        // Fraction of RAND_MAX in [0, 1], stretched over the requested range.
        const double fraction = (double)rand() / RAND_MAX;
        int scaled = fraction * MAX_NUM;
        // rand() == RAND_MAX would give MAX_NUM itself; fold it back into range.
        scaled = scaled % MAX_NUM;
        outFile << scaled << " ";
    }

    outFile.close();
}
//
void DiskSortUsingBitSet(const char *fileName, const char *outFileName)
{
clock_t start = clock();
// 1.构造bitset
bitset bits(0);
fstream infile(fileName);
ofstream outfile(outFileName);
assert(infile && outfile);
int num;
while (infile >> num)
{
if (num < MAX_BIT_SIZE)
{
bits[num] = 1;
}
}
for (unsigned i = 0; i < MAX_BIT_SIZE; ++i)
{
if (bits[i] == 1)
{
outfile << i << " ";
}
}
infile.close();
infile.open(fileName);
//infile.seekp(0, ios::beg);
//infile.seekg(0, ios::beg);
bits.reset();
while (infile >> num)
{
if (num >= MAX_BIT_SIZE && num < 2 * MAX_BIT_SIZE)
{
bits[num - MAX_BIT_SIZE] = 1;
}
}
for (unsigned i = 0; i < MAX_BIT_SIZE; ++i)
{
if (bits[i] == 1)
{
outfile << (i + MAX_BIT_SIZE) << " ";
}
}
infile.close();
outfile.close();
clock_t end = clock();
int second = (end - start) / CLOCKS_PER_SEC;
cout << "一共用时: " << second << " s" << endl;
}
/*
1.读取数据并分割成长度为M的小文件
2.对大小为M的文件进行内存排序
3.对大小为M的文件(最后一个可能小于M)进行多路归并排序
每个小文件的命名规则:data1、data2、data3...
*/
void DiskSortUsingKMerge(const char *fileName, const char *outFileName)
{
// 1.分割文件
unsigned fileNum;
DivideFile(fileName, fileNum);
// 2.对每个文件进行排序
for (unsigned i = 1; i <= fileNum; ++i)
{
string curFileName = MakeFileName(PRE_FILE_NAME, i);
SortFile(curFileName.c_str());
}
// 用于保存每个文件的读取指针
// ifstream file[]
//fstream **everyFile = new fstream* [fileNum];
//fstream *everyFile = new fstream [fileNum];
//FILE **farray = new FILE*[fileNum];
fstream *farray = new fstream [fileNum];
int tmp = 0;
// vector everyFile;
for (unsigned i = 1; i <= fileNum; ++i)
{
string curFileName = MakeFileName(PRE_FILE_NAME, i);
// fstream 直接赋值不行!!!
farray[i].open(curFileName, ios::in | ios::out);
//fstream file(curFileName);
//FILE *file = fopen(curFileName.c_str(), "rt");
//fstream file(curFileName);
//assert(file);
//farray[i - 1] = file;
//everyFile[i - 1] = &file;
//everyFile[i - 1] = file;
//(*(everyFile[0])) >> tmp;
//(*(everyFile[i - 1])) >> tmp;
//*everyFile[0] >> tmp;
//*everyFile[1] >> tmp;
//*everyFile[i] >> tmp;
}
// test
KMergeSort(outFileName, farray, fileNum);
//for (unsigned i = 1; i <= fileNum; ++i)
//{
// if (everyFile[i - 1]->is_open())
// {
// everyFile[i - 1]->close();
// }
//}
delete []farray;
}
// Builds the name of a numbered text file: the prefix `file`, followed by the
// decimal representation of `i`, followed by ".txt".
// Example: MakeFileName("file", 1) returns "file1.txt".
// Uses std::to_string instead of the non-standard itoa, which also took a
// signed int and so misprinted indices above INT_MAX.
std::string MakeFileName(std::string file, unsigned i)
{
    file += std::to_string(i);
    file += ".txt";
    return file;
}
//void KMergeSort(const char *fileName, FILE **everyFile, unsigned fileNum)
// Merges `fileNum` sorted input streams into the file named `fileName`.
//
// fileName  - path of the output file; it is created if missing and
//             truncated if it already exists.
// everyFile - array of `fileNum` streams, each already open for reading and
//             positioned at the start of an ascending sequence of integers.
// fileNum   - number of streams in `everyFile`.
//
// The current head value of every stream is kept in memory; the smallest
// head is written out and replaced by the next value from the same stream
// until every stream is exhausted.
void KMergeSort(const char *fileName, std::fstream *everyFile, unsigned fileNum)
{
    // ofstream rather than fstream: the default in|out mode of fstream fails
    // when the file does not exist and never truncates stale content.
    std::ofstream outFile(fileName);
    assert(outFile);

    // Whether each stream still has a pending value.
    bool *isFileLive = new bool[fileNum];
    // The pending (smallest unread) value of each stream.
    int *everyNum = new int[fileNum];

    unsigned liveFile = 0;
    for (unsigned i = 0; i < fileNum; ++i)
    {
        int tmp = 0;
        // A stream that is empty or failed to open contributes nothing.
        if (everyFile[i] >> tmp)
        {
            isFileLive[i] = true;
            everyNum[i] = tmp;
            ++liveFile;
        }
        else
        {
            isFileLive[i] = false;
            everyNum[i] = 0;
        }
    }

    unsigned index = 0;
    while (liveFile > 0)
    {
        // At least one stream is live here, so FindMin always sets index.
        FindMin(everyNum, isFileLive, fileNum, index);
        outFile << everyNum[index] << " ";

        int tmp = 0;
        if (everyFile[index] >> tmp)
        {
            everyNum[index] = tmp;
        }
        else
        {
            isFileLive[index] = false;
            --liveFile;
        }
    }

    delete[] isFileLive;
    delete[] everyNum;
    outFile.close();
}
// Locates the smallest value among the entries of `arr` whose `isAlive` flag
// is set and stores its position in `index`. When several live entries share
// the minimum, the first one wins. If no entry is live, `index` is left
// unchanged.
void FindMin(int *arr, bool *isAlive, unsigned size, unsigned &index)
{
    bool haveCandidate = false;
    int smallest = 0;
    for (unsigned pos = 0; pos < size; ++pos)
    {
        if (!isAlive[pos])
        {
            continue;
        }
        // The first live entry seeds the search; later ones must be strictly smaller.
        if (!haveCandidate || arr[pos] < smallest)
        {
            smallest = arr[pos];
            index = pos;
            haveCandidate = true;
        }
    }
}
// Splits the integers in `fileName` into consecutive chunk files holding at
// most MAX_LITTLE_FILE_LENGTH numbers each.
// Chunk files are named <PRE_FILE_NAME>1.txt, <PRE_FILE_NAME>2.txt, ...
//
// fileName - path of the input file of whitespace-separated integers.
// fileNum  - receives the number of chunk files written (0 for empty input).
//
// A chunk file is opened only once there is a number to put into it, so no
// empty trailing file is ever created and nothing has to be deleted afterwards
// (this removes the dependency on the MSVC-only _unlink).
void DivideFile(const char *fileName, unsigned &fileNum)
{
    std::ifstream infile(fileName);
    assert(infile);

    std::ofstream chunk;
    // Number of chunk files opened so far.
    unsigned chunkCount = 0;
    // Numbers written to the chunk that is currently open.
    unsigned inChunk = 0;

    int num;
    while (infile >> num)
    {
        if (!chunk.is_open())
        {
            ++chunkCount;
            const std::string chunkName = MakeFileName(PRE_FILE_NAME, chunkCount);
            chunk.open(chunkName.c_str());
            assert(chunk);
            inChunk = 0;
        }

        chunk << num << " ";

        // The chunk is full: close it; the next number opens a new one.
        if (++inChunk == MAX_LITTLE_FILE_LENGTH)
        {
            chunk.close();
        }
    }

    if (chunk.is_open())
    {
        chunk.close();
    }
    fileNum = chunkCount;
}
// Sorts the integers stored in `fileName` in ascending order, in place: the
// whole file is read into memory, sorted, and written back, separated by
// spaces.
//
// The file must hold at most MAX_LITTLE_FILE_LENGTH numbers; this is asserted
// before each value is stored, so a debug build stops instead of writing past
// the end of the buffer.
void SortFile(const char *fileName)
{
    // Allocated on the heap: the buffer is too large for the stack.
    int *arr = new int[MAX_LITTLE_FILE_LENGTH];
    unsigned count = 0;

    {
        std::ifstream in(fileName);
        assert(in);
        int num;
        while (in >> num)
        {
            assert(count < MAX_LITTLE_FILE_LENGTH);
            arr[count++] = num;
        }
    }   // the input stream is closed here, before the file is rewritten

    qsort(arr, count, sizeof(int), CompareInt);

    // A separate output stream truncates the file and starts with a clean
    // state, rather than reusing a stream that has already hit end-of-file.
    std::ofstream out(fileName);
    assert(out);
    for (unsigned i = 0; i < count; ++i)
    {
        out << arr[i] << " ";
    }
    out.close();

    // Array form of delete to match new[].
    delete[] arr;
}
// qsort comparison callback for ints, ascending order.
// left, right - pointers to the two int values being compared.
// Returns a negative value if *left < *right, zero if they are equal, and a
// positive value if *left > *right.
// The result is built from comparisons rather than a subtraction, which would
// overflow for operands of large magnitude and opposite sign.
int CompareInt(const void *left, const void * right)
{
    const int lhs = *static_cast<const int *>(left);
    const int rhs = *static_cast<const int *>(right);
    return (lhs > rhs) - (lhs < rhs);
}