//生成数据文件 void generateTestData(int len) { FILE* file = fopen(inputFileName, "w"); srand((unsigned int)time(0)); for (int i = 0; i < len; i++) { int val = rand(); fprintf(file, "%d ", val); } }
// Hash function.
//
// Scrambles `key` with a fixed sequence of shift/add/xor steps and returns
// the mixed value. All arithmetic is unsigned, so overflow wraps around.
// Every step is reversible modulo 2^N (odd multipliers and xor-shifts), so
// distinct keys map to distinct results.
// NOTE(review): the constants look like Thomas Wang's 32-bit integer mix -
// confirm the intended origin.
unsigned int hashFunction(unsigned int key)
{
    key += ~(key << 15);
    key ^= (key >> 10);
    key += (key << 3);
    key ^= (key >> 6);
    key += ~(key << 11);
    key ^= (key >> 16);
    return key;
}
文件分割函数(该函数效率很低!)
说明:根据数值内容生成100以内的HASH值,以HASH值为名创建文件。
若 hashFunction(key) % 100 = hashval,那么 key 就写入以 hashval 命名的文件中(文件名为 0 到 99)。
// Split the input file into bucket files by hash value.
//
// Reads whitespace-separated integers from the file named by inputFileName.
// Each value is hashed with hashFunction() and reduced modulo 100; the value
// is then appended (followed by a space) to the file whose name is that
// bucket number in decimal. Every bucket file name that was successfully
// opened is recorded in `filenames`.
//
// Reading stops at the first token that is not an integer or at end of file.
// If the input file cannot be opened nothing is done; if a bucket file cannot
// be opened the split stops early. Both cases are reported on stderr.
//
// The bucket file is still opened and closed once per value, which is slow
// for large inputs.
void seperate_file()
{
    FILE* inputFile = fopen(inputFileName.c_str(), "r");
    if (inputFile == NULL) {
        perror("seperate_file: cannot open input file");
        return;
    }

    const unsigned int kBucketCount = 100;
    char seperatedName[MAX_FILE_LEN];
    int val;

    // Loop on the fscanf result instead of feof(): feof() only becomes true
    // after a read has already failed, so the old loop processed the last
    // value twice when the file ends with trailing whitespace.
    while (fscanf(inputFile, "%d", &val) == 1) {
        const unsigned int hashKey =
            hashFunction(static_cast<unsigned int>(val)) % kBucketCount;
        snprintf(seperatedName, MAX_FILE_LEN, "%u", hashKey);

        FILE* splitFile = fopen(seperatedName, "a+");
        if (splitFile == NULL) {
            perror(seperatedName);
            break;
        }
        filenames.insert(seperatedName);
        fprintf(splitFile, "%d ", val);
        fclose(splitFile);
    }

    fclose(inputFile);
}
此处主要讨论TopK算法:
TopK
int* keys = new int[K];
int* vals = new int[K];
count = 0;
while (!eof())
    read key, val
    if count < K then
        keys[count] = key
        vals[count] = val
        if count = K - 1 then
            buildHeap(keys, vals, K) //根据val建最小堆,keys随vals调整
        end if
    else
        if val > vals[0] then //替换堆顶,并调整堆
            vals[0] = val
            keys[0] = key
            adjustHeap(keys, vals, 0, K-1)
        end if
    end if
    count++
end while
数值    出现次数
452     195
513     196
653     198
608     196
603     198
63      199
575     199
123     204
1005    203
23      198
注:文件切割的功能太简单,也太低效~ 需要更好的办法
附:建堆以及调整堆代码
/*
 * Sift the entry at index p down into the min-heap stored in vals[p..r].
 *
 * Precondition: every position in p+1..r already satisfies the min-heap
 * order; only position p may violate it. keys[] is permuted in lock-step
 * with vals[] so that keys[i] always belongs to vals[i].
 *
 * Heap layout used by this file: the parent of index i is i/2 and the
 * children of index i are 2*i and 2*i+1. Index 0 is therefore an extra root
 * whose only real child is index 1, and vals[0] holds the minimum.
 *
 * keys, vals: parallel arrays with at least r+1 elements.
 * p:          index of the entry to sift down.
 * r:          last valid index of the heap (inclusive).
 *
 * Does nothing when the range is empty or has a single element (p >= r) or
 * when p is negative.
 */
void adjustHeap(int* keys, int* vals, int p, int r)
{
    if (p < 0 || p >= r)
        return;

    const int val = vals[p];
    const int key = keys[p];
    int hole = p;  // position currently waiting to be filled

    for (;;) {
        // Index 0 has index 1 as its single child; any other index i has
        // children 2*i and 2*i+1.
        int child = (hole == 0) ? 1 : 2 * hole;
        if (child > r)
            break;
        if (hole != 0 && child + 1 <= r && vals[child + 1] < vals[child])
            child = child + 1;  // pick the smaller of the two children
        if (val <= vals[child])
            break;  // heap order restored

        // Pull the smaller child up and continue from its old position.
        vals[hole] = vals[child];
        keys[hole] = keys[child];
        hole = child;
    }

    vals[hole] = val;
    keys[hole] = key;
}

/*
 * Build a min-heap over the first heapSize entries of vals[], moving keys[]
 * along with it. Afterwards vals[0] is the smallest value and keys[0] its key.
 *
 * Zero or one element is already a heap, so nothing is touched in that case
 * (the arrays may even be null when heapSize <= 0).
 */
void buildHeap(int* keys, int* vals, int heapSize)
{
    if (heapSize <= 1)
        return;

    // Indices above heapSize/2 have no children, so start from there and
    // work back towards the root.
    for (int i = heapSize / 2; i >= 0; i--)
        adjustHeap(keys, vals, i, heapSize - 1);
}
http://m.oschina.net/blog/74147