[原创]从1亿个数据中找出前100个最大值

从一亿个数据中找出前100个最大值

方法一:

> 新建100个红黑树节点,将输入的前100个数据保存进去,然后全部插入红黑树T

> 遍历剩下的所有输入,对每一个输入值,如果值大于红黑树中最小值,则删除最小值节点,然后修改被删除节点的值为当前输入,然后插入红黑树。

时间复杂度为 $O(n \lg m)$,其中 n 为输入数据条数,m 为输出数据条数。

方法二:将红黑树替换成最小堆。对每一条输入数据,若其大于堆顶(当前保留的最小值),则用它替换堆顶,然后运行一次MIN_HEAPIFY即可。

 

实际运行结果来看,最小堆的方法更快。

 

方法一代码如下,红黑树代码参考https://blog.csdn.net/v2nero/article/details/19170987

#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "rb.h"
#include <time.h>



//#define	outputNum	100
//#define	inputNum	100000000

// Benchmark: stream the values 0..inputNum-1 through a red-black tree that
// retains at most outputNum nodes, always evicting the current minimum, and
// print the CPU time the streaming took.
//
// Usage: head100 input_num output_num
//   arv[1]  number of input values to process
//   arv[2]  number of largest values to retain (must be positive)
//
// Returns 0 on success, 1 when the argument count is wrong or output_num is
// not a positive number.
int main(int arc, char *arv[])
{
	if (arc != 3) {
		printf("head100 input_num output_num\n");
		return 1;
	}
	int inputNum = atoi(arv[1]);
	int outputNum = atoi(arv[2]);
	// With no nodes to retain the tree stays empty, yet the loop below
	// dereferences the result of ia_rb_tree_minnode() unconditionally; a
	// negative count would also yield a bogus allocation size. atoi() returns
	// 0 for non-numeric text, so that case is rejected here as well.
	if (outputNum <= 0) {
		printf("head100 input_num output_num\n");
		return 1;
	}

	rb_tree_t T;
	ia_rb_tree_init(&T);

	// One preallocated node per retained value; evicted nodes are reused
	// instead of being freed and reallocated. The memset leaves every field,
	// including key, zeroed.
	rb_tree_node_t *nodes = new rb_tree_node_t[outputNum];
	memset(nodes, 0, sizeof(rb_tree_node_t)*outputNum);

	clock_t clock_s = clock();

	for (int i = 0; i < inputNum; ++i) {
		if (i < outputNum) {
			// Fill phase: the first outputNum inputs are inserted directly.
			nodes[i].key = i;
			ia_rb_tree_insert(&T, &nodes[i]);
		} else {
			// Replace phase: swap out the smallest retained value whenever
			// the new input is larger.
			rb_tree_node_t *node = ia_rb_tree_minnode(&T);
			if (i > node->key) {
				ia_rb_tree_delete(&T, node);
				node->key = i;
				ia_rb_tree_insert(&T, node);
			}
		}
	}

	clock_t clock_e = clock();
	// Convert to double before dividing: dividing two integer tick counts
	// would truncate the result to whole seconds.
	double duration = (double)(clock_e - clock_s) / CLOCKS_PER_SEC;
	printf("\n%2.3f seconds\n", duration);

	// Debug aid, disabled by default (presumably walks the tree in key order):
	//ia_rb_tree_inorder_walk(&T, T.root, NULL);
	delete[] nodes;

	return 0;
}

结果似乎不是特别理想,可能是红黑树操作的常数因子过大吧

[原创]从1亿个数据中找出前100个最大值_第1张图片

CPU: 

Linux -O3编译运行结果

nero@nero-All-Series:~/ws/ia/head100$ ./head100_rb 1000000000 100

45.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_rb 100000000 100

4.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_rb 100000000 1000

6.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_rb 100000000 10000

12.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_rb 100000000 100000

22.000 seconds

方法二代码如下:

#include "common.h"
 
// Binary heap stored in a plain int array.
typedef struct _heap_t {
	int length;	// number of elements in the data array (array length)
	int size;	// number of leading elements of data that belong to the heap
	int *data;	// element storage, indexed from 0
} heap_t;
 
// Index arithmetic for a binary heap laid out in a 0-based array.
// The argument is parenthesized so that expression arguments such as
// LEFT(a + b) expand with the intended precedence. The argument is evaluated
// exactly once, but should still be free of side effects by convention.
#define PARENT(i) (((i) - 1) / 2)
#define LEFT(i) (2 * (i) + 1)
#define RIGHT(i) (2 * (i) + 2)
 
 
// MIN-HEAPIFY: sift the element at index i down until neither of its
// children inside the heap (indices below A->size) is smaller than it.
//   A  heap to repair; only A->size and A->data are used
//   i  index of the element that may be larger than its children
void MIN_HEAPIFY(heap_t *A, int i)
{
	int cur = i;

	for (;;) {
		const int left = LEFT(cur);
		const int right = RIGHT(cur);
		int best = cur;

		// Pick the smallest of the current element and its in-range children.
		if (left < A->size && A->data[left] < A->data[best]) {
			best = left;
		}
		if (right < A->size && A->data[right] < A->data[best]) {
			best = right;
		}

		// Current element is already the smallest of the three: done.
		if (best == cur) {
			return;
		}

		const int held = A->data[best];
		A->data[best] = A->data[cur];
		A->data[cur] = held;

		// Continue from the child position the element just moved into.
		cur = best;
	}
}


// Benchmark: stream the values 0..inputNum-1 through a min-heap of
// outputNum slots, replacing the root whenever the input exceeds it, and
// print the CPU time the streaming took.
//   arc  argument count; must be 3
//   arv  arv[1] = number of input values, arv[2] = number of values to
//        retain (must be positive)
// Returns 0 on success, 1 when the argument count is wrong or output_num is
// not a positive number.
int Head100(int arc, const char *arv[]) {
	if (arc != 3) {
		printf("head100 input_num output_num\n");
		return 1;
	}
	int inputNum = atoi(arv[1]);
	int outputNum = atoi(arv[2]);
	// The loop below reads heap.data[0], so the heap needs at least one slot;
	// a negative count would also be an invalid array size. atoi() returns 0
	// for non-numeric text, so that case is rejected here as well.
	if (outputNum <= 0) {
		printf("head100 input_num output_num\n");
		return 1;
	}

	heap_t heap;
	heap.size = outputNum;
	heap.length = outputNum;
	heap.data = new int[outputNum];
	// Seed every slot with the smallest int: the array is then a valid
	// min-heap and the first outputNum larger inputs each displace a seed.
	for (int i = 0; i < outputNum; ++i) {
		heap.data[i] = INT_MIN;
	}

	clock_t clock_s = clock();

	for (int i = 0; i < inputNum; ++i) {
		// The root is the smallest retained value; replace it when beaten
		// and restore the heap order from the root down.
		if (i > heap.data[0]) {
			heap.data[0] = i;
			MIN_HEAPIFY(&heap, 0);
		}
	}

	clock_t clock_e = clock();
	// Convert to double before dividing: dividing two integer tick counts
	// would truncate the result to whole seconds.
	double duration = (double)(clock_e - clock_s) / CLOCKS_PER_SEC;
	printf("\n%2.3f seconds\n", duration);

#if 0
	for (int i = 0; i < outputNum; ++i) {
		TRACE("%d ", heap.data[i]);
	}
#endif

	delete []heap.data;

	return 0;
}

 
// Program entry point: hands the command-line arguments to Head100 and uses
// its return value as the process exit status.
int main(int arc, const char *arv[])
{
	const int status = Head100(arc, arv);

	return status;
}

Linux -O3运行速度如下

nero@nero-All-Series:~/ws/ia/head100$ ./head100_heap 100000000 100

2.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_heap 100000000 1000

3.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_heap 100000000 10000

5.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_heap 100000000 100000

6.000 seconds
nero@nero-All-Series:~/ws/ia/head100$ ./head100_heap 1000000000 100

22.000 seconds

 

你可能感兴趣的:(算法导论)