题目:
有一个很大很大的输入流,大到没有存储器可以将其存储下来,而且只输入一次,如何从这个输入
流中随机取得m个记录
分析:
这题在soso面试的时候也见过,当时没有答出来。
现在也没想出好的办法。
网上看到一种方法:每次输入一个记录时,随机产生一个0到1之间的随机数,
用这些随机数维护一个大小为m的堆。但是这种方法是否会使后面输入的记录被选取的概率比前面的大,不太符合随机抽取的要求?
???后续完善。。。
继续完善,谢谢_chen_lin_ 提供的思路。
模拟数据流生成了1-1000000的整数序列,随机在里面抽取100个数。
实现如下:
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
using namespace std;

// Number of records kept in the random sample.
static const unsigned long long kSampleSize = 100;
// Largest integer written by getdata() when generating the simulated stream.
static const int kStreamMax = 1000000;

/*
 * Generates the sample data for the simulated stream: writes the integers
 * 1..kStreamMax to fp, separated by commas, with no trailing comma.
 *
 * Returns 0 on success, -1 if fp is NULL or a write fails.
 */
int getdata(FILE* fp)
{
    if (fp == NULL)
        return -1;

    for (int value = 1; value <= kStreamMax; ++value) {
        int written;
        if (value == kStreamMax)
            written = fprintf(fp, "%d", value);   // last record: no separator
        else
            written = fprintf(fp, "%d,", value);
        if (written < 0)
            return -1;
    }
    return 0;
}

/*
 * Returns a pseudo-random integer in [0, bound) built from rand().
 *
 * rand() is only guaranteed to deliver 15 bits (RAND_MAX >= 32767), so a
 * plain rand() % bound can never reach large indices on such platforms.
 * Enough 15-bit chunks are combined to cover bound, and draws that would
 * introduce modulo bias are rejected and retried.
 *
 * bound must not exceed 2^60; a bound of 0 or 1 yields 0.
 */
static unsigned long long random_below(unsigned long long bound)
{
    if (bound <= 1)
        return 0;

    // span = 2^(15 * chunks): the number of distinct values one draw can take.
    int chunks = 0;
    unsigned long long span = 1;
    while (span < bound && chunks < 4) {
        span <<= 15;
        ++chunks;
    }

    // Largest multiple of bound that fits in span; anything at or above it
    // would make the low residues slightly more likely.
    const unsigned long long usable = span - span % bound;

    for (;;) {
        unsigned long long value = 0;
        for (int c = 0; c < chunks; ++c)
            value = (value << 15) | (unsigned long long)(rand() & 0x7fff);
        if (value < usable)
            return value % bound;
    }
}

/*
 * Offers one record to the reservoir.
 *
 * seen is the number of records offered before this one. The first
 * kSampleSize records fill the reservoir; after that, record number seen + 1
 * replaces a uniformly chosen slot with probability kSampleSize / (seen + 1),
 * which keeps every record seen so far equally likely to be in the sample.
 */
static void offer(long reservoir[], unsigned long long seen, long value)
{
    if (seen < kSampleSize) {
        reservoir[seen] = value;
        return;
    }

    const unsigned long long slot = random_below(seen + 1);
    if (slot < kSampleSize)
        reservoir[slot] = value;
}

/*
 * Reads comma-separated integers from standard input and prints a random
 * sample of kSampleSize of them.
 *
 * The input is consumed one character at a time, so memory use does not grow
 * with the length of the stream (or of any single line). A field ends at a
 * comma, a newline, or end of input; an empty field is counted as the record
 * 0 only when it is terminated by a comma. Non-numeric fields parse as 0.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    /*
     * Statements used once to generate the simulated data stream:
     *
     *     FILE* fp = fopen("1.txt", "wb");
     *     if (fp != NULL) {
     *         getdata(fp);
     *         fclose(fp);
     *     }
     */

    // Seed the generator so that different runs draw different samples.
    srand((unsigned)time(NULL));

    long sample[kSampleSize];
    unsigned long long seen = 0;

    // Holds the text of the field currently being read. Characters beyond
    // the buffer capacity are dropped instead of overflowing it.
    char field[128];
    size_t length = 0;

    for (;;) {
        const int ch = cin.get();
        const bool at_end = (ch == EOF);

        if (ch == ',' || ch == '\n' || at_end) {
            // A comma always closes a record; a line or the stream ending
            // closes one only if some text was collected.
            if (ch == ',' || length > 0) {
                field[length] = '\0';
                offer(sample, seen, strtol(field, NULL, 10));
                ++seen;
            }
            length = 0;
            if (at_end)
                break;
        } else if (length + 1 < sizeof field) {
            field[length++] = (char)ch;
        }
    }

    if (seen >= kSampleSize) {
        cout << "rand m data:";
        for (unsigned long long k = 0; k < kSampleSize; ++k)
            cout << sample[k] << ",";
        cout << endl;
    } else {
        // Fewer records than the requested sample size.
        cout << "have no stream" << endl;
    }
    return 0;
}
输出结果:rand m data:214081,42675,910277,156113,549744,324942,296182,219732,318989,104930,698642,350230,862258,91276,667257,259708,599704,654231,511576,995314,656139,747846,886097,866713,330504,242010,147976,327057,386808,117999,478944,293433,203708,584959,502002,90630,809846,359830,25989,110210,514453,63067,467666,454933,28548,458088,461703,667961,466615,434741,54051,704393,216666,880643,615583,439624,620417,603399,830626,697033,914745,326997,74834,77412,487891,314260,139194,225634,162485,335521,911982,801962,766891,154600,819794,856387,427236,604588,265978,526451,994597,337788,674932,23974,292355,483701,671288,148098,96424,838102,301385,823148,742111,614669,353253,794833,681198,596458,611455,959873,