程序员编程艺术:第三章续、Top K算法问题的实现
是的,的确是一样,但这个寻找最大的k个数的问题的实用范围更广,因为它牵扯到了一个Top K算法问题,以及有关搜索引擎,海量数据处理等广泛的问题,所以本文特意对这个Top K算法问题,进行阐述以及实现(侧重实现,因为那样看起来,会更令人激动人心),算是第三章的续。ok,有任何问题,欢迎随时不吝指正。谢谢。
寻找最小的k个数,实现一(下段代码经本文评论下多位读者指出有问题:当a[i]=a[j]=pivot时,会产生无限循环,Mark Allen Weiss的《数据结构与算法分析:C++描述》中文版P209-P210对此有描述,读者可参看之。特此说明:因本文代码存在问题的地方还有几处,故请待后续统一修正。2012.08.21):
- #include
- #include
- using namespace std;
// Maps one rand() draw into the closed interval [low, high].
// The caller must ensure low <= high.
int my_rand(int low, int high)
{
    const int span = high - low + 1;
    const int offset = rand() % span;
    return low + offset;
}
// Quick-select: rearranges a[left..right] so that a[k] holds the value it
// would have if the range were sorted, with every element before index k
// no larger than a[k] and every element after it no smaller.
//
// Parameters:
//   a           array to rearrange in place
//   k           absolute index (0-based) of the wanted order statistic
//   left/right  inclusive bounds of the range to work on
// Returns 1 on success, 0 when k lies outside [left, right].
//
// The previous version could spin forever when a[i] == a[j] == pivot
// (neither index advanced after the swap) and could read a[left - 1] for
// one- and two-element ranges. This version uses a single forward scan,
// which always terminates and never leaves [left, right].
int q_select(int a[], int k, int left, int right)
{
    if (k > right || k < left)
    {
        return 0;
    }
    // Invariant: left <= k <= right, so the loop ends with left == right == k.
    while (left < right)
    {
        const int mid = left + (right - left) / 2;
        // Median of three: order a[left] <= a[mid] <= a[right] ...
        if (a[mid] < a[left])
            std::swap(a[mid], a[left]);
        if (a[right] < a[left])
            std::swap(a[right], a[left]);
        if (a[right] < a[mid])
            std::swap(a[right], a[mid]);
        // ... then park the median at the right end as the pivot.
        std::swap(a[mid], a[right]);
        const int pivot = a[right];

        // Everything strictly smaller than the pivot goes before `store`.
        int store = left;
        for (int scan = left; scan < right; ++scan)
        {
            if (a[scan] < pivot)
            {
                std::swap(a[store], a[scan]);
                ++store;
            }
        }
        std::swap(a[store], a[right]);

        if (store == k)
            return 1;
        if (store > k)
            right = store - 1;
        else
            left = store + 1;
    }
    return 1;
}
- int main()
- {
- int i;
- int a[] = {7, 8, 9, 54, 6, 4, 11, 1, 2, 33};
- q_select(a, 4, 0, sizeof(a) / sizeof(int) - 1);
- return 0;
- }
- #include
- using namespace std;
- const int numOfArray = 10;
// Returns low plus (rand() modulo the number of integers in [low, high]).
// The caller must ensure low <= high.
int my_rand(int low, int high)
{
    int width = high - low;
    ++width;
    return rand() % width + low;
}
// Partitions array[left..right] around the pivot array[right], scanning
// from right to left. Elements strictly greater than the pivot are packed
// at the right end; the pivot lands just before them.
// Returns the pivot's final index.
int partition(int array[], int left, int right)
{
    const int pivot = array[right];   // array[right] is untouched until the final swap
    int boundary = right;             // first index of the "greater" zone
    int scan = right - 1;
    while (scan >= left)
    {
        if (array[scan] > pivot)
        {
            --boundary;
            std::swap(array[boundary], array[scan]);
        }
        --scan;
    }
    std::swap(array[boundary], array[right]);
    return boundary;
}
- int random_partition(int array[], int left, int right)
- {
- int index = my_rand(left, right);
- swap(array[right], array[index]);
- return partition(array, left, right);
- }
- int random_select(int array[], int left, int right, int k)
- {
- if (k < 1 || k > (right - left + 1))
- return -1;
- int pos = random_partition(array, left, right);
- int m = pos - left + 1;
- if(m == k)
- return array[pos];
- else if (m > k)
- return random_select(array, left, pos - 1, k);
- else
- return random_select(array, pos + 1, right, k - m);
- }
- int main()
- {
- int array[numOfArray] = {7, 8, 9, 54, 6, 4, 2, 1, 12, 33};
- cout << random_select(array, 0, numOfArray - 1, 4) << endl;
- return 0;
- }
- #include
- #include
- using namespace std;
// Returns the k-th smallest value of a[low..high], using the first element
// of the current range as the pivot. The array is rearranged in place.
//
// Parameters:
//   a         array to search
//   low/high  inclusive bounds of the range
//   k         1-based absolute rank: the answer ends up at index k - 1
// Returns -1 when index k - 1 is outside [low, high]. The previous version
// recursed without end for such k.
//
// The narrowing step is a loop rather than recursion, so already-sorted
// input cannot exhaust the call stack. The running time on such input is
// still quadratic because the pivot is always the first element.
int kth_elem(int a[], int low, int high, int k)
{
    const int target = k - 1;
    if (low > high || target < low || target > high)
        return -1;

    for (;;)
    {
        int lo = low;
        int hi = high;
        const int pivot = a[lo];
        // "Hole" partition: the hole starts at lo and alternates sides.
        while (lo < hi)
        {
            while (lo < hi && a[hi] >= pivot)
                --hi;
            a[lo] = a[hi];
            while (lo < hi && a[lo] < pivot)
                ++lo;
            a[hi] = a[lo];
        }
        a[lo] = pivot;

        if (lo == target)
            return a[lo];
        if (lo > target)
            high = lo - 1;
        else
            low = lo + 1;
    }
}
- int main()
- {
- for (int num = 5000; num < 50000001; num *= 10)
- {
- int *array = new int[num];
- int j = num / 10;
- int acc = 0;
- for (int k = 1; k <= num; k += j)
- {
- srand(unsigned(time(0)));
- for(int i = 0; i < num; i++)
- array[i] = rand() * RAND_MAX + rand();
- clock_t start = clock();
- int data = kth_elem(array, 0, num - 1, k);
- clock_t end = clock();
- acc += (end - start);
- }
- cout << "The average time of searching a date in the array size of " << num << " is " << acc / 10 << endl;
- }
- return 0;
- }
The average time of searching a date in the array size of 5000 is 0
The average time of searching a date in the array size of 50000 is 1
The average time of searching a date in the array size of 500000 is 12
The average time of searching a date in the array size of 5000000 is 114
The average time of searching a date in the array size of 50000000 is 1159
Press any key to continue
哈哈,且看1、@well:根据上面的运行结果不能判断线性,如果人家是O(n^1.1) 也有可能啊,而且部分数据始终是拟合,还是要数学证明才可靠。2、@July:同时,随机数组中选取一个元素作为枢纽元!=> 随机数组中随机选取一个元素作为枢纽元(如果是随机选取随机数组中的一个元素作为主元,那就不同了,跟随机选取数组中一个元素作为枢纽元一样了)。3、@飞羽:正是因为数组本身是随机的,所以选择第一个元素和随机选择其它的数是等价的(由等概率产生保证),这第3点,我与飞羽有分歧,至于谁对谁错,待时间让我考证。
- 我们说输入一个数组的元素,不按其顺序输入:如,1,2,3,4,5,6,7,而是这样输入:5,7,6,4,3,1,2,这就叫随机输入,而这种情况就相当于上述程序主函数中所产生的随机数组。然而选取随机输入的数组或随机数组中第一个元素作为主元,我们不能称之为说是随机选取枢纽元。
- 因为,随机数产生器产生的数据是随机的,没错,但你要知道,你总是选取随机数组的第一个元素作为枢纽元,这不叫随机选取枢纽元。
- 所以,上述程序的主函数中随机产生的数组对这个程序的算法而言,没有任何意义,就是帮忙产生了一个随机数组,帮助我们完成了测试,且方便我们测试大数据量而已,就这么简单。
- 且一般来说,我们看一个程序的时间复杂度,是不考虑其输入情况的,即不考虑主函数。正如这个 kth number 的程序所见,你每次都是固定选取数组中第一个元素作为枢纽元,而并不是随机选择枢纽元,所以,做不到平均时间复杂度为O(N)。
- #include
- #include
- using namespace std;
// Uniform-ish pick from the closed interval [low, high] based on rand().
// The caller must ensure low <= high.
inline int my_rand(int low, int high)
{
    const int choices = (high - low) + 1;
    int picked = rand() % choices;
    picked += low;
    return picked;
}
// Partitions array[left..right] around the pivot array[right], scanning
// left to right. Elements <= pivot are packed at the front; the pivot is
// placed right after them. Returns the pivot's final index.
int partition(int array[], int left, int right)
{
    const int pivot = array[right];
    int store = left;                 // next slot for an element <= pivot
    int scan = left;
    while (scan < right)
    {
        if (array[scan] <= pivot)
        {
            std::swap(array[store], array[scan]);
            ++store;
        }
        ++scan;
    }
    std::swap(array[store], array[right]);
    return store;
}
- bool median_select(int array[], int left, int right, int k)
- {
- if (k-1 > right || k-1 < left)
- return false;
- int midIndex=(left+right)/2;
- if(array[left]
- swap(array[left],array[midIndex]);
- if(array[right]
- swap(array[right],array[midIndex]);
- if(array[right]
- swap(array[right],array[left]);
- swap(array[left], array[right]);
- int pos = partition(array, left, right);
- if (pos == k-1)
- return true;
- else if (pos > k-1)
- return median_select(array, left, pos-1, k);
- else return median_select(array, pos+1, right, k);
- }
- bool rand_select(int array[], int left, int right, int k)
- {
- if (k-1 > right || k-1 < left)
- return false;
- int Index = my_rand(left, right);
- swap(array[Index], array[right]);
- int pos = partition(array, left, right);
- if (pos == k-1)
- return true;
- else if (pos > k-1)
- return rand_select(array, left, pos-1, k);
- else return rand_select(array, pos+1, right, k);
- }
- bool kth_select(int array[], int left, int right, int k)
- {
- if (k-1 > right || k-1 < left)
- return false;
- int pos = partition(array, left, right);
- if(pos == k-1)
- return true;
- else if(pos > k-1)
- return kth_select(array, left, pos-1, k);
- else return kth_select(array, pos+1, right, k);
- }
- int main()
- {
- int array1[] = {7, 8, 9, 54, 6, 4, 11, 1, 2, 33};
- int array2[] = {7, 8, 9, 54, 6, 4, 11, 1, 2, 33};
- int array3[] = {7, 8, 9, 54, 6, 4, 11, 1, 2, 33};
- int numOfArray = sizeof(array1) / sizeof(int);
- for(int i=0; i
- printf("%d/t",array1[i]);
- int K = 9;
- bool flag1 = median_select(array1, 0, numOfArray-1, K);
- bool flag2 = rand_select(array2, 0, numOfArray-1, K);
- bool flag3 = kth_select(array3, 0, numOfArray-1, K);
- if(!flag1)
- return 1;
- for(i=0; i
- printf("%d/t",array1[i]);
- printf("/n");
- if(!flag2)
- return 1;
- for(i=0; i
- printf("%d/t",array2[i]);
- printf("/n");
- if(!flag3)
- return 1;
- for(i=0; i
- printf("%d/t",array3[i]);
- printf("/n");
- return 0;
- }
7 8 9 54 6 4 11 1 2 33
4 1 2 6 7 8 9 11 33
7 6 4 1 2 8 9 11 33
7 8 9 6 4 11 1 2 33
Press any key to continue
2、排序,选择排序。用选择或交换排序,即遍历n个数,先把最先遍历到的k个数存入大小为k的数组之中,对这k个数,利用选择或交换排序,找到k个数中的最小数kmin(kmin设为k个元素的数组中最小元素),用时O(k)(你应该知道,插入或选择排序查找操作需要O(k)的时间),后再继续遍历后n-k个数,x与kmin比较:如果x>kmin,则x代替kmin,并再次重新找出k个元素的数组中最小元素kmin'(多谢jiyeyuran 提醒修正);如果x<=kmin,则不更新数组。
4、按编程之美第141页上解法二的所述,类似快速排序的划分方法,N个数存储在数组S中,再从数组中随机选取一个数X,把数组划分为Sa和Sb俩部分,Sa>=X>=Sb,如果要查找的k个元素小于Sa的元素个数,则返回Sa中较大的k个元素,否则返回Sa中所有的元素+Sb中最大的k-|Sa|个元素。不断递归下去,把问题分解成更小的问题,平均时间复杂度为O(N)(编程之美所述的n*logk的复杂度有误,应为O(N),特此订正。其严格证明,请参考第三章:程序员面试题狂想曲:第三章、寻找最小的k个数、updated 10次)。
第三节、Top K 算法问题
第二步、借助堆这个数据结构,找出Top K,时间复杂度为N'logK。
即,借助堆结构,我们可以在log量级的时间内查找和调整/移动。因此,维护一个K(该题目中是10)大小的小根堆(K1>K2>....Kmin,Kmin设为堆顶元素),然后遍历300万的Query,分别和根元素Kmin进行对比比较(如上第2节思路3所述,若X>Kmin,则更新并调整堆,否则,不更新),我们最终的时间复杂度是:O(N) + N'*O(logK),(N为1000万,N’为300万)。ok,更多,详情,请参考原文。

- #include
- #include
- #include
- using namespace std;
// Number of buckets in the word hash table.
#define HASHLEN 2807303
// Size in bytes of the fixed word buffer, terminator included.
#define WORDLEN 30

typedef struct node_no_space *ptr_no_space;
typedef struct node_has_space *ptr_has_space;

// Bucket heads of the chained hash table.
ptr_no_space head[HASHLEN];

// Hash-table entry: the word is stored in separately allocated memory.
struct node_no_space
{
    char *word;
    int count;
    ptr_no_space next;
};

// Heap entry: the word is stored inline in a fixed-size buffer.
struct node_has_space
{
    char word[WORDLEN];
    int count;
    ptr_has_space next;
};

// Polynomial (base 31) string hash reduced modulo HASHLEN.
// Returns a bucket index in [0, HASHLEN).
//
// Fixes over the previous version:
//   - the terminator test used '/0', a multi-character literal that never
//     equals a char, so the loop ran past the end of the string;
//   - the reduction only happened when value > HASHLEN, which let the
//     result be exactly HASHLEN (one past the last bucket);
//   - bytes are read as unsigned char so non-ASCII input cannot produce a
//     negative index.
int hash_function(char const *p)
{
    int value = 0;
    for (; *p != '\0'; ++p)
    {
        const int byte = static_cast<unsigned char>(*p);
        // value < HASHLEN here, so value * 31 + byte stays well inside int.
        value = (value * 31 + byte) % HASHLEN;
    }
    return value;
}
- void append_word(char const *str)
- {
- int index = hash_function(str);
- ptr_no_space p = head[index];
- while (p != NULL)
- {
- if (strcmp(str, p->word) == 0)
- {
- (p->count)++;
- return;
- }
- p = p->next;
- }
- ptr_no_space q = new node_no_space;
- q->count = 1;
- q->word = new char [strlen(str)+1];
- strcpy(q->word, str);
- q->next = head[index];
- head[index] = q;
- }
- void write_to_file()
- {
- FILE *fp = fopen("result.txt", "w");
- assert(fp);
- int i = 0;
- while (i < HASHLEN)
- {
- for (ptr_no_space p = head[i]; p != NULL; p = p->next)
- fprintf(fp, "%s %d/n", p->word, p->count);
- i++;
- }
- fclose(fp);
- }
- void sift_down(node_has_space heap[], int i, int len)
- {
- int min_index = -1;
- int left = 2 * i;
- int right = 2 * i + 1;
- if (left <= len && heap[left].count < heap[i].count)
- min_index = left;
- else
- min_index = i;
- if (right <= len && heap[right].count < heap[min_index].count)
- min_index = right;
- if (min_index != i)
- {
- swap(heap[i].count, heap[min_index].count);
- char buffer[WORDLEN];
- strcpy(buffer, heap[i].word);
- strcpy(heap[i].word, heap[min_index].word);
- strcpy(heap[min_index].word, buffer);
- sift_down(heap, min_index, len);
- }
- }
- void build_min_heap(node_has_space heap[], int len)
- {
- if (heap == NULL)
- return;
- int index = len / 2;
- for (int i = index; i >= 1; i--)
- sift_down(heap, i, len);
- }
// True for ASCII letters and digits, the characters that make up a word.
static bool is_word_char(char c)
{
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

// Strips leading and trailing non-alphanumeric characters from `str` in
// place. `n` is the index of the last character (strlen(str) - 1).
// A string made only of symbols becomes the empty string.
//
// Fixes over the previous version:
//   - '/0' is a multi-character literal; stored into a char it wrote the
//     digit '0' instead of a terminator;
//   - the backward scan had no lower bound and could read str[-1];
//   - an all-symbol string made the second loop spin forever;
//   - leading symbols are removed with a single shift instead of one shift
//     per symbol.
void handle_symbol(char *str, int n)
{
    if (!str || n < 0)
        return;

    // Trim the tail.
    int last = n;
    while (last >= 0 && !is_word_char(str[last]))
        --last;
    if (last < n)
        str[last + 1] = '\0';

    // Trim the head.
    int first = 0;
    while (first <= last && !is_word_char(str[first]))
        ++first;
    if (first > 0)
    {
        const int kept = last - first + 1;
        for (int i = 0; i < kept; ++i)
            str[i] = str[first + i];
        str[kept] = '\0';
    }
}
- int main()
- {
- char str[WORDLEN];
- for (int i = 0; i < HASHLEN; i++)
- head[i] = NULL;
- FILE *fp_passage = fopen("string.txt", "r");
- assert(fp_passage);
- while (fscanf(fp_passage, "%s", str) != EOF)
- {
- int n = strlen(str) - 1;
- if (n > 0)
- handle_symbol(str, n);
- append_word(str);
- }
- fclose(fp_passage);
- write_to_file();
- int n = 10;
- ptr_has_space heap = new node_has_space [n+1];
- int c;
- FILE *fp_word = fopen("result.txt", "r");
- assert(fp_word);
- for (int j = 1; j <= n; j++)
- {
- fscanf(fp_word, "%s %d", &str, &c);
- heap[j].count = c;
- strcpy(heap[j].word, str);
- }
- build_min_heap(heap, n);
- while (fscanf(fp_word, "%s %d", &str, &c) != EOF)
- {
- if (c > heap[1].count)
- {
- heap[1].count = c;
- strcpy(heap[1].word, str);
- sift_down(heap, 1, n);
- }
- }
- fclose(fp_word);
- for (int k = 1; k <= n; k++)
- cout << heap[k].count << " " << heap[k].word << endl;
- return 0;
- }


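读者反馈from 杨忠胜:3.1节的代码第38行 hash_function(char const *p)有误吧,这样的话,不能修改p的值(但是函数需要修改指针的值),要想不修改*p指向的内容,应该是const char *p;(注:`char const *p` 与 `const char *p` 完全等价,二者都表示"指向常量字符的指针",指针p本身可以修改,因此该函数签名并无问题;只有 `char * const p` 才禁止修改指针本身。)此外,您程序中的/t, /n有误,C语言是\t,\n。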
同时,正好个人此前用c && c++ 语言实现过红黑树,那么,代码能借用就借用吧。
/* Index arithmetic for a 1-based binary heap stored in an array.
 * Each expansion is fully parenthesized so it can sit inside a larger
 * expression: RIGHT(1) * 2 used to expand to 2*(1)+1*2 = 4 instead of 6. */
#define PARENT(i) ((i) / 2)
#define LEFT(i) (2 * (i))
#define RIGHT(i) (2 * (i) + 1)
- #include
- #include
- #include
/* Node colour of the red-black tree. */
typedef enum rb_color { RED, BLACK } RB_COLOR;

/* Red-black tree node. In this program `key` is the number read from the
 * input and `data` is its occurrence count.
 * The closing "} RB_NODE;" had been lost from the listing, leaving the
 * type name used by every function below undefined. */
typedef struct rb_node
{
    int key;
    int data;
    RB_COLOR color;
    struct rb_node* left;
    struct rb_node* right;
    struct rb_node* parent;
} RB_NODE;
- RB_NODE* RB_CreatNode(int key, int data)
- {
- RB_NODE* node = (RB_NODE*)malloc(sizeof(RB_NODE));
- if (NULL == node)
- {
- printf("malloc error!");
- exit(-1);
- }
- node->key = key;
- node->data = data;
- node->color = RED;
- node->left = NULL;
- node->right = NULL;
- node->parent = NULL;
- return node;
- }
- RB_NODE* RB_RotateLeft(RB_NODE* node, RB_NODE* root)
- {
- RB_NODE* right = node->right;
- if ((node->right = right->left))
- right->left->parent = node;
- right->left = node;
- if ((right->parent = node->parent))
- {
- if (node == node->parent->right)
- node->parent->right = right;
- else
- node->parent->left = right;
- }
- else
- root = right;
- node->parent = right;
- return root;
- }
- RB_NODE* RB_RotateRight(RB_NODE* node, RB_NODE* root)
- {
- RB_NODE* left = node->left;
- if ((node->left = left->right))
- left->right->parent = node;
- left->right = node;
- if ((left->parent = node->parent))
- {
- if (node == node->parent->right)
- node->parent->right = left;
- else
- node->parent->left = left;
- }
- else
- root = left;
- node->parent = left;
- return root;
- }
- RB_NODE* RB_Insert_Rebalance(RB_NODE* node, RB_NODE* root)
- {
- RB_NODE *parent, *gparent, *uncle, *tmp;
- while ((parent = node->parent) && parent->color == RED)
- {
- gparent = parent->parent;
- if (parent == gparent->left)
- {
- uncle = gparent->right;
- if (uncle && uncle->color == RED)
- {
- uncle->color = BLACK;
- parent->color = BLACK;
- gparent->color = RED;
- node = gparent;
- }
- else
- {
- if (parent->right == node)
- {
- root = RB_RotateLeft(parent, root);
- tmp = parent;
- parent = node;
- node = tmp;
- }
- parent->color = BLACK;
- gparent->color = RED;
- root = RB_RotateRight(gparent, root);
- }
- }
- else
- {
- uncle = gparent->left;
- if (uncle && uncle->color == RED)
- {
- uncle->color = BLACK;
- parent->color = BLACK;
- gparent->color = RED;
- node = gparent;
- }
- else
- {
- if (parent->left == node)
- {
- root = RB_RotateRight(parent, root);
- tmp = parent;
- parent = node;
- node = tmp;
- }
- parent->color = BLACK;
- gparent->color = RED;
- root = RB_RotateLeft(gparent, root);
- }
- }
- }
- root->color = BLACK;
- return root;
- }
- RB_NODE* RB_SearchAuxiliary(int key, RB_NODE* root, RB_NODE** save)
- {
- RB_NODE* node = root;
- RB_NODE* parent = NULL;
- int ret;
- while (node)
- {
- parent = node;
- ret = node->key - key;
- if (0 < ret)
- node = node->left;
- else if (0 > ret)
- node = node->right;
- else
- return node;
- }
- if (save)
- *save = parent;
- return NULL;
- }
- RB_NODE* RB_Search(int key, RB_NODE* root)
- {
- return RB_SearchAuxiliary(key, root, NULL);
- }
- RB_NODE* RB_Insert(int key, int data, RB_NODE* root)
- {
- RB_NODE* parent = NULL;
- RB_NODE* node = NULL;
- parent = NULL;
- if ((node = RB_SearchAuxiliary(key, root, &parent)))
- {
- node->data++;
- return root;
- }
- node = RB_CreatNode(key, data);
- node->parent = parent;
- if (parent)
- {
- if (parent->key > key)
- parent->left = node;
- else
- parent->right = node;
- }
- else
- {
- root = node;
- }
- return RB_Insert_Rebalance(node, root);
- }
/* One slot of the Top-K min-heap: `key` is the number, `data` its count.
 * The closing "} RB_HEAP;" had been lost from the listing. */
typedef struct rb_heap
{
    int key;
    int data;
} RB_HEAP;

/* Number of entries kept (the K of Top-K). */
const int heapSize = 10;

/* 1-based heap storage; slot 0 is unused. */
RB_HEAP heap[heapSize+1];
- void MIN_HEAPIFY(RB_HEAP* A, const int& size, int i)
- {
- int l = LEFT(i);
- int r = RIGHT(i);
- int smallest = i;
- if (l <= size && A[l].data < A[i].data)
- smallest = l;
- if (r <= size && A[r].data < A[smallest].data)
- smallest = r;
- if (smallest != i)
- {
- RB_HEAP tmp = A[i];
- A[i] = A[smallest];
- A[smallest] = tmp;
- MIN_HEAPIFY(A, size, smallest);
- }
- }
- void BUILD_MINHEAP(RB_HEAP* A, const int& size)
- {
- for (int i = size/2; i >= 1; --i)
- MIN_HEAPIFY(A, size, i);
- }
- void InOrderTraverse(RB_NODE* node)
- {
- if (node == NULL)
- {
- return;
- }
- else
- {
- InOrderTraverse(node->left);
- if (node->data > heap[1].data)
- {
- heap[1].data = node->data;
- heap[1].key = node->key;
- MIN_HEAPIFY(heap, heapSize, 1);
- }
- InOrderTraverse(node->right);
- }
- }
- void RB_Destroy(RB_NODE* node)
- {
- if (NULL == node)
- {
- return;
- }
- else
- {
- RB_Destroy(node->left);
- RB_Destroy(node->right);
- free(node);
- node = NULL;
- }
- }
- int main()
- {
- RB_NODE* root = NULL;
- RB_NODE* node = NULL;
- for (int i = 1; i <= 10; ++i)
- {
- heap[i].key = i;
- heap[i].data = -i;
- }
- BUILD_MINHEAP(heap, heapSize);
- FILE* fp = fopen("data.txt", "r");
- int num;
- while (!feof(fp))
- {
- fscanf(fp, "%d", &num);
- root = RB_Insert(num, 1, root);
- }
- fclose(fp);
- InOrderTraverse(root);
- RB_Destroy(root);
- for (i = 1; i <= 10; ++i)
- {
- printf("%d/t%d/n", heap[i].key, heap[i].data);
- }
- return 0;
- }


- #define STACK_SIZE 1000
- typedef struct
- {
- RB_NODE** top;
- RB_NODE** base;
- }*PStack, Stack;
- bool InitStack(PStack& st)
- {
- st->base = (RB_NODE**)malloc(sizeof(RB_NODE*) * STACK_SIZE);
- if (!st->base)
- {
- printf("InitStack error!");
- exit(1);
- }
- st->top = st->base;
- return true;
- }
- bool Push(PStack& st, RB_NODE*& e)
- {
- if (st->top - st->base >= STACK_SIZE)
- return false;
- *st->top = e;
- st->top++;
- return true;
- }
- bool Pop(PStack& st, RB_NODE*& e)
- {
- if (st->top == st->base)
- {
- e = NULL;
- return false;
- }
- e = *--st->top;
- return true;
- }
- bool StackEmpty(PStack& st)
- {
- if (st->base == st->top)
- return true;
- else
- return false;
- }
- bool InOrderTraverse_Stack(RB_NODE*& T)
- {
- PStack S = (PStack)malloc(sizeof(Stack));
- RB_NODE* P = T;
- InitStack(S);
- while (P != NULL || !StackEmpty(S))
- {
- if (P != NULL)
- {
- Push(S, P);
- P = P->left;
- }
- else
- {
- Pop(S, P);
- if (P->data > heap[1].data)
- {
- heap[1].data = P->data;
- heap[1].key = P->key;
- MIN_HEAPIFY(heap, heapSize, 1);
- }
- P = P->right;
- }
- }
- free(S->base);
- S->base = NULL;
- free(S);
- S = NULL;
- return true;
- }
- bool PostOrderTraverse_Stack(RB_NODE*& T)
- {
- PStack S = (PStack)malloc(sizeof(Stack));
- RB_NODE* P = T;
- RB_NODE* Pre = NULL;
- InitStack(S);
- while (P != NULL || !StackEmpty(S))
- {
- if (P != NULL)
- {
- Push(S, P);
- P = P->left;
- }
- else
- {
- Pop(S, P);
- if (P->right == NULL || P->right == Pre)
- {
- free(P);
- Pre = P;
- P = NULL;
- }
- else
- {
- Push(S, P);
- P = P->right;
- }
- }
- }
- free(S->base);
- S->base = NULL;
- free(S);
- S = NULL;
- return true;
- }
- int main()
- {
- RB_NODE* root = NULL;
- RB_NODE* node = NULL;
- for (int i = 1; i <= 10; ++i)
- {
- heap[i].key = i;
- heap[i].data = -i;
- }
- BUILD_MINHEAP(heap, heapSize);
- FILE* fp = fopen("data.txt", "r");
- int num;
- while (!feof(fp))
- {
- fscanf(fp, "%d", &num);
- root = RB_Insert(num, 1, root);
- }
- fclose(fp);
- InOrderTraverse_Stack(root);
- PostOrderTraverse_Stack(root);
- for (i = 1; i <= 10; ++i)
- {
- printf("%d/t%d/n", heap[i].key, heap[i].data);
- }
- return 0;
- }
/* Index arithmetic for a 1-based binary heap. Fully parenthesized so the
 * macros are safe inside larger expressions (RIGHT(1) * 2 used to be 4). */
#define PARENT(i) ((i) / 2)
#define LEFT(i) (2 * (i))
#define RIGHT(i) (2 * (i) + 1)

#define HASHTABLESIZE 2807303   /* number of buckets allocated */
#define HEAPSIZE 10             /* K of Top-K */
#define A 0.6180339887          /* multiplier for multiplicative hashing */
#define M 16384 //m=2^14        /* hash_function() yields indices below M */

/* The header names were lost from the listing. These two provide printf,
 * fopen, fscanf, malloc, free and exit, which the code below uses. */
#include <stdio.h>
#include <stdlib.h>

/* Chained hash-table entry: `data` is the number, `count` how often it was
 * seen. The closing "} HASH_NODE;" had been lost from the listing. */
typedef struct hash_node
{
    int data;
    int count;
    struct hash_node* next;
} HASH_NODE;

/* Bucket heads. The functions below use this table, but its declaration
 * was missing from the listing. Static storage starts out all NULL. */
HASH_NODE* hash_table[HASHTABLESIZE];
- HASH_NODE* creat_node(int& data)
- {
- HASH_NODE* node = (HASH_NODE*)malloc(sizeof(HASH_NODE));
- if (NULL == node)
- {
- printf("malloc node failed!/n");
- }
- node->data = data;
- node->count = 1;
- node->next = NULL;
- return node;
- }
- int hash_function(int& key)
- {
- double result = A * key;
- return (int)(M * (result - (int)result));
- }
- void insert(int& data)
- {
- int index = hash_function(data);
- HASH_NODE* pnode = hash_table[index];
- while (NULL != pnode)
- {
- if (pnode->data == data)
- {
- pnode->count += 1;
- return;
- }
- pnode = pnode->next;
- }
- pnode = creat_node(data);
- pnode->next = hash_table[index];
- hash_table[index] = pnode;
- }
- void destroy_node()
- {
- HASH_NODE* tmp = NULL;
- for (int i = 0; i < HASHTABLESIZE; ++i)
- {
- p = hash_table[i];
- while (NULL != p)
- {
- tmp = p;
- p = p->next;
- free(tmp);
- tmp = NULL;
- }
- }
- }
- typedef struct min_heap
- {
- int count;
- int data;
- MIN_HEAP heap[HEAPSIZE + 1];
- void min_heapify(MIN_HEAP* H, const int& size, int i)
- {
- int l = LEFT(i);
- int r = RIGHT(i);
- int smallest = i;
- if (l <= size && H[l].count < H[i].count)
- smallest = l;
- if (r <= size && H[r].count < H[smallest].count)
- smallest = r;
- if (smallest != i)
- {
- MIN_HEAP tmp = H[i];
- H[i] = H[smallest];
- H[smallest] = tmp;
- min_heapify(H, size, smallest);
- }
- }
- void build_min_heap(MIN_HEAP* H, const int& size)
- {
- for (int i = size/2; i >= 1; --i)
- min_heapify(H, size, i);
- }
- void traverse_hashtale()
- {
- for (int i = 0; i < HASHTABLESIZE; ++i)
- {
- p = hash_table[i];
- while (NULL != p)
- {
- if (p->count > heap[1].count)
- {
- heap[1].count = p->count;
- heap[1].data = p->data;
- min_heapify(heap, HEAPSIZE, 1);
- }
- p = p->next;
- }
- }
- }
- int main()
- {
- for (int i = 1; i <= 10; ++i)
- {
- heap[i].count = -i;
- heap[i].data = i;
- }
- build_min_heap(heap, HEAPSIZE);
- FILE* fp = fopen("data.txt", "r");
- int num;
- while (!feof(fp))
- {
- fscanf(fp, "%d", &num);
- insert(num);
- }
- fclose(fp);
- traverse_hashtale();
- for (i = 1; i <= 10; ++i)
- {
- printf("%d/t%d/n", heap[i].data, heap[i].count);
- }
- return 0;
- }


关于海量数据处理的问题,一般有Bloom filter,Hashing,bit-map,堆,trie树等方法来处理。更详细的介绍,请查看此文:十道海量数据处理面试题与十个方法大总结。