外部排序

  外部排序针对待排序文件的大小超过可用内存的情况。例如对4GB的文件进行排序,而可用内存只有1GB,显然无法将整个文件直接装入内存进行排序。因此我们将这4GB的文件分割为4个1GB的小文件分别处理。外部排序的一般步骤为:

  • 1. 对分割后的小文件进行内部排序,将排好序的数据写入到4个不同文件中。
  • 2. 使用4路归并排序的方法对4个小文件进行排序。(这里仅仅使用长度为4的最小优先级队列)
  • 3. 首先从4个文件中各读入一个数放入最小优先级队列中,则这个最小优先级队列的起始元素(堆顶)就是4GB文件中最小的一个数。
  • 4. 每次从优先级队列中取出起始元素,写入到输出文件中,然后从该元素所关联的文件中读取下一个元素插入到最小优先级队列中。
  • 5. 重复第四步,直到所有小文件读取完毕。

  下面的代码用最小堆实现了最小优先级队列。代码仅对已经排好序的4个文件(1.txt, 2.txt, 3.txt, 4.txt,每行一个整数)进行归并,最后得到的输出文件(11.04_data.out)即为排好序的数据文件。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* One element of the priority queue: a value plus the stream it came from. */
struct entry_s {
    int num;    /* key used for ordering inside the heap */
    FILE *fp;   /* stream associated with this value (main() stores the file num was read from) */
};

/* Min-priority queue stored as a 0-based binary heap in a flat array. */
struct prior_queue_s {
    struct entry_s *heap;   /* array of `size` entries allocated by prior_queue_init() */
    int size;               /* capacity of `heap`, in entries */
    int tail;               /* index of the last occupied slot; -1 when the queue is empty */
};

/* Short aliases for the queue and entry structures used throughout this file. */
typedef struct prior_queue_s prior_queue_t;
typedef struct entry_s entry_t;

/*
 * INT_MAX doubles as the "no more data" sentinel in this program.
 * Take it from the standard header instead of computing ~(1<<31):
 * shifting 1 into the sign bit of an int is undefined behavior, and
 * redefining the standard macro name conflicts with <limits.h>.
 */
#include <limits.h>

/*
 * Initialize an empty priority queue with room for `size` entries.
 *
 * Every slot is pre-filled with the sentinel key INT_MAX and a NULL
 * stream, and `tail` is set to -1 to mark the queue as empty.
 *
 * The heap array is heap-allocated; the caller owns it and releases it
 * with free(pq->heap). Allocation failure terminates the program. This is
 * checked explicitly rather than with assert() so the check is still
 * present when NDEBUG is defined.
 */
void prior_queue_init(prior_queue_t *pq, int size)
{
    pq->heap = malloc(sizeof *pq->heap * size);
    if (pq->heap == NULL) {
        perror("prior_queue_init: malloc");
        abort();
    }
    pq->size = size;
    pq->tail = -1;

    for (int i = 0; i < size; i++) {
        pq->heap[i].num = INT_MAX;
        pq->heap[i].fp = NULL;
    }
}

/*
 * Restore the min-heap property downwards starting at index `i`.
 *
 * The entry at `i` is repeatedly swapped with its smaller child (children
 * live at 2*k+1 and 2*k+2) until neither child within [0, tail] has a
 * smaller key.
 */
static void shift_down(prior_queue_t *pq, int i)
{
    entry_t *nodes = pq->heap;
    int cur = i;

    for (;;) {
        int left = 2 * cur + 1;
        int right = 2 * cur + 2;
        int best = cur;

        if (left <= pq->tail && nodes[left].num < nodes[cur].num) {
            best = left;
        }
        if (right <= pq->tail && nodes[right].num < nodes[best].num) {
            best = right;
        }
        if (best == cur) {
            break;
        }

        entry_t tmp = nodes[cur];
        nodes[cur] = nodes[best];
        nodes[best] = tmp;
        cur = best;
    }
}

/*
 * Restore the min-heap property upwards starting at index `i`.
 *
 * The heap is 0-based with children at 2*k+1 and 2*k+2 (see shift_down),
 * so the parent of node i is (i - 1) / 2. Using i / 2 would compare the
 * node at index 2 with its sibling at index 1, and the node at index 4
 * with index 2, leaving the heap out of order for larger queues.
 */
static void shift_up(prior_queue_t *pq, int i)
{
    entry_t *heap = pq->heap;

    while (i > 0) {
        int parent = (i - 1) / 2;

        if (heap[parent].num <= heap[i].num) {
            break;
        }

        entry_t entry = heap[i];
        heap[i] = heap[parent];
        heap[parent] = entry;
        i = parent;
    }
}

/*
 * Return a copy of the entry stored at the root of the heap (index 0)
 * without removing it. No emptiness check is performed.
 */
entry_t prior_queue_minimum(prior_queue_t *pq)
{
    const entry_t *root = pq->heap;

    return *root;
}

/*
 * Remove and return the entry at the root of the heap.
 *
 * The last occupied entry is moved to the root, `tail` is decremented and
 * the heap property is restored with shift_down().
 *
 * The queue must not be empty: with tail == -1 the code below would read
 * heap[-1] and push tail to -2, so this precondition is asserted in the
 * same way prior_queue_insert() asserts its capacity.
 */
entry_t prior_queue_extract_min(prior_queue_t *pq)
{
    assert(pq->tail >= 0);

    entry_t ret = pq->heap[0];
    pq->heap[0] = pq->heap[pq->tail];
    pq->tail--;
    shift_down(pq, 0);
    return ret;
}

/*
 * Append entry `x` after the current last element and sift it up to its
 * place in the heap. Asserts that the new slot index is below the queue's
 * capacity.
 */
void prior_queue_insert(prior_queue_t *pq, entry_t x)
{
    int slot = pq->tail + 1;

    pq->tail = slot;
    assert(slot < pq->size);
    pq->heap[slot] = x;
    shift_up(pq, slot);
}

/*
 * Overwrite the entry at index `i` with `k` and re-establish heap order.
 *
 * If the new key is smaller than the old one the entry is sifted up;
 * otherwise it is sifted down. The index is not range-checked.
 */
void prior_queue_change_key(prior_queue_t *pq, int i, entry_t k)
{
    entry_t *slot = &pq->heap[i];
    int key_decreased = slot->num > k.num;

    *slot = k;

    if (key_decreased) {
        shift_up(pq, i);
        return;
    }
    shift_down(pq, i);
}

/*
 * Print the queue's capacity, its tail index and the keys of all occupied
 * slots (indices 0..tail, in array order) to stdout.
 */
void prior_queue_print(prior_queue_t *pq)
{
    int idx = 0;

    printf("size: %d\n", pq->size);
    printf("tail: %d\n", pq->tail);

    printf("data: ");
    while (idx <= pq->tail) {
        printf("%-4d", pq->heap[idx].num);
        idx++;
    }
    printf("\n");
}


/*
 * Read the next line from `fp` and return it parsed as a decimal integer.
 *
 * Returns INT_MAX when no further line can be read (end of file or read
 * error); callers use that value as the "stream exhausted" sentinel.
 *
 * Parsing uses strtol() instead of atoi(), whose behavior is undefined
 * when the value does not fit. Out-of-range values are clamped to the int
 * range. A line without leading digits yields 0, as it did with atoi().
 */
int readnum(FILE *fp)
{
    char line[128] = {0};

    if (fgets(line, sizeof line, fp) == NULL) {
        return INT_MAX;
    }

    long value = strtol(line, NULL, 10);
    if (value > INT_MAX) {
        value = INT_MAX;
    }
    else if (value < -INT_MAX - 1) {
        value = -INT_MAX - 1;
    }
    return (int)value;
}

/*
 * Write `num` to `fp` as decimal text followed by a newline.
 * The result of the write is not checked.
 */
void writenum(FILE *fp, int num)
{
    char text[128] = {0};
    int len = sprintf(text, "%d\n", num);

    fwrite(text, (size_t)len, 1, fp);
}
#define NFILES 4

int main()
{
    prior_queue_t pq;

    prior_queue_init(&pq, 4);

    int i;
    char file[128] = {0};
    char tmp[8] = {0};

    for (i = 0; i < NFILES; i++) {
        memset(file, 0, 128);
        memset(tmp, 0, 8);
        sprintf(tmp, "%d", i+1);
        strcat(file, tmp); 
        strcat(file, ".txt");
        FILE *fp = fopen(file, "r");
        int num  = readnum(fp);
        entry_t entry = {num, fp};
        prior_queue_insert(&pq, entry);
    }

    FILE *outfile = fopen("11.04_data.out", "w");
    entry_t entry = prior_queue_minimum(&pq);
    entry_t new_entry;
    while (entry.num < INT_MAX) {
        FILE *fp = entry.fp;
        int num  = entry.num;
        writenum(outfile, num);
        num = readnum(fp);
        new_entry.num = num;
        new_entry.fp  = fp;
        prior_queue_change_key(&pq, 0, new_entry);
        entry = prior_queue_minimum(&pq);
    }

    return 0;
}

 

你可能感兴趣的:(排序)