给定1亿个数,找出 最大的 100个
1. 用一个长度是 101 的数组,建立 小顶堆(0号元素不用,主要是为了使用堆的性质:父结点i,则,左右 子结点 是 2i 和 2i+1)
2. 用堆顶 和 每个 取得的数 进行比较。(a. 堆顶 >= 取得的数,则,忽略 取得的数 b. 否则,把堆顶 替换为 取得的数)
3. 新得到的堆, 堆顶 的左右子树 都是 完美堆。需要调整 堆顶(调整算法 就是 构建堆时的 调整算法)
其实,无论是1亿,还是 几亿个,都无所谓,因为,耗费的内存 就是 长度101 的数组。读取1亿个数,就像流式读取文件一样,并不会 完全读入内存中
另外,可以使用 linux命令来产生随机文件,然后,每次读取4个字节,这样,其实就是读取了一个整数,命令是这样的
// 当前目录下,产生 512M 随机数据 二进制文件 dd if=/dev/urandom of=random.dat bs=1M count=512
程序的完整代码如下
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.util.Arrays;

/**
 * Selects the n largest ints from an arbitrarily long stream while holding
 * only n + 1 ints in memory (a 1-based min-heap; slot 0 is unused).
 */
public class TopN {

    /**
     * Finds the {@code n} largest numbers supplied by {@code gi}.
     *
     * <p>The first {@code n} values fill the heap; every later value replaces
     * the heap root (the smallest value kept so far) when it is larger.
     *
     * @param n  how many of the largest values to keep; must not be negative
     * @param gi source of ints; note that each {@code hasNext()} call on it
     *           consumes one value from the underlying stream
     * @return a min-heap laid out from index 1; index 0 holds
     *         {@code Integer.MAX_VALUE} and is not part of the result. The
     *         array has {@code n + 1} slots, or fewer when the source yields
     *         fewer than {@code n} values (one slot per value read, plus slot 0).
     * @throws IllegalArgumentException if {@code n} is negative
     */
    public static int[] topN(int n, GenerateInt gi) {
        if (n < 0) {
            throw new IllegalArgumentException("n must not be negative: " + n);
        }

        // please note: 0 element is ignored
        int[] arrayTopN = new int[n + 1];
        arrayTopN[0] = Integer.MAX_VALUE;
        if (n == 0) {
            // Nothing to keep; there is no root slot to compare against.
            return arrayTopN;
        }

        // Check the free-slot condition BEFORE hasNext(): hasNext() reads a
        // value, so calling it when the heap is already full would drop that value.
        int filled = 0;
        while (filled < n && gi.hasNext()) {
            filled++;
            arrayTopN[filled] = gi.next();
        }

        if (filled < n) {
            // The source ran dry early: drop the never-filled slots so their
            // default zeros are not mistaken for input values.
            int[] trimmed = Arrays.copyOf(arrayTopN, filled + 1);
            Heap.constructSmallRootHeap(trimmed);
            return trimmed;
        }

        Heap.constructSmallRootHeap(arrayTopN);
        while (gi.hasNext()) {
            insertNewNumber(gi.next(), arrayTopN);
        }
        return arrayTopN;
    }

    /**
     * Offers one value to a full min-heap: values not larger than the root are
     * ignored, larger ones replace the root and are sunk into place.
     */
    private static void insertNewNumber(int newNumber, int[] smallRootHeap) {
        // too small, just ignore the newNumber
        if (smallRootHeap[1] >= newNumber) {
            return;
        }
        // newNumber > smallRootHeap[1]
        smallRootHeap[1] = newNumber;
        Heap.sinkRoot(1, smallRootHeap);
    }

    /**
     * Runs {@link #topN} over the binary file at {@code fullPath} (read as
     * consecutive 4-byte ints), then prints the resulting heap array and the
     * elapsed time.
     *
     * @param n        how many of the largest values to keep
     * @param fullPath path of the file to read
     * @throws Exception if the file cannot be opened or closed
     */
    public static void test_TopN(int n, String fullPath) throws Exception {
        long begin = System.currentTimeMillis();
        int[] arrayTopN;
        // try-with-resources closes the file even when topN throws
        try (FileInputStream fis = new FileInputStream(fullPath)) {
            arrayTopN = topN(n, new GenerateInt(fis));
        }
        System.out.println(Arrays.toString(arrayTopN));
        System.out.printf("TopN cost milliseconds: %d", (System.currentTimeMillis() - begin));
    }

    public static void main(String[] args) throws Exception {
        String fullPath = "/home/marvin/random.dat";
        test_TopN(10, fullPath);
    }
}

/**
 * Array-based min-heap helpers. Arrays are 1-based: element 0 is ignored, so
 * the children of node i are 2i and 2i + 1.
 */
class Heap {

    /**
     * Rearranges {@code orgnArray[1..length-1]} in place into a min-heap by
     * sinking every non-leaf node, from the last one back to the root.
     *
     * @param orgnArray array to heapify; element 0 is not included
     */
    public static void constructSmallRootHeap(int[] orgnArray) {
        // 0 element is not included
        int validElementCount = orgnArray.length - 1;
        for (int curIndex = validElementCount / 2; curIndex >= 1; curIndex--) {
            sinkRoot(curIndex, orgnArray);
        }
    }

    /**
     * Sinks the value at {@code rootIndex} until neither child is smaller.
     * Both subtrees below {@code rootIndex} are expected to be min-heaps already.
     *
     * @param rootIndex     1-based index of the node to sink
     * @param smallRootHeap heap array; element 0 is ignored
     */
    public static void sinkRoot(int rootIndex, int[] smallRootHeap) {
        int lastIndex = smallRootHeap.length - 1;
        int curIndex = rootIndex;
        int left = 2 * curIndex;

        while (left <= lastIndex) {
            int right = left + 1;

            // Pick the smaller child; the right child may not exist.
            int minValueIndex = left;
            if (right <= lastIndex && smallRootHeap[left] > smallRootHeap[right]) {
                minValueIndex = right;
            }

            // parent is smaller (or equal). good, we've done
            if (smallRootHeap[curIndex] <= smallRootHeap[minValueIndex]) {
                break;
            }

            // parent is bigger. should go down
            swap(curIndex, minValueIndex, smallRootHeap);
            curIndex = minValueIndex;
            left = 2 * curIndex;
        }
    }

    /** Exchanges the elements at positions {@code i} and {@code j}. */
    private static void swap(int i, int j, int[] array) {
        int tmp = array[i];
        array[i] = array[j];
        array[j] = tmp;
    }

    public static void main(String[] args) {
        int[] orgnArray = { Integer.MAX_VALUE, 49, 38, 65, 97, 76, 13, 27, 49 };
        System.out.printf("Before: %s\n", Arrays.toString(orgnArray));
        constructSmallRootHeap(orgnArray);
        System.out.printf("After : %s\n", Arrays.toString(orgnArray));
    }
}

/**
 * Reads consecutive big-endian 4-byte ints from a stream.
 *
 * <p>To generate a random file, you can use the following linux command:
 * {@code dd if=/dev/urandom of=random.dat bs=1M count=512}
 */
class GenerateInt {
    private final DataInputStream dis;
    // value fetched by the most recent successful hasNext()
    private int i;
    // number of ints read so far
    private long totalCount = 0;

    /**
     * Wraps a file stream. Kept for existing callers.
     *
     * @param fis stream to read ints from; it is buffered internally
     */
    public GenerateInt(FileInputStream fis) {
        this((InputStream) fis);
    }

    /**
     * Wraps any input stream.
     *
     * @param in stream to read ints from; it is buffered internally
     */
    public GenerateInt(InputStream in) {
        this.dis = new DataInputStream(new BufferedInputStream(in));
    }

    /**
     * Reads the next int from the stream and stores it for {@link #next()}.
     *
     * <p>This call advances the stream: calling it twice without calling
     * {@code next()} in between skips a value. When the end of the stream is
     * reached, the number of ints read is printed.
     *
     * @return {@code true} if an int was read, {@code false} at end of stream
     * @throws UncheckedIOException if reading fails for a reason other than
     *         end of stream
     */
    public boolean hasNext() {
        try {
            i = dis.readInt();
            totalCount++;
            return true;
        } catch (EOFException endOfData) {
            // Fewer than four bytes were left: the stream is exhausted.
            System.out.println("Generate int count: " + totalCount);
            return false;
        } catch (IOException e) {
            // A genuine I/O failure must not be mistaken for end of data.
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Returns the int fetched by the last successful {@link #hasNext()}.
     * Does not advance the stream.
     */
    public int next() {
        return i;
    }

    /** Reads every int in the file and prints how long the scan took. */
    private static void readIntFromFile(String fullPath) throws Exception {
        try (FileInputStream fis = new FileInputStream(fullPath)) {
            long begin = System.currentTimeMillis();
            GenerateInt gi = new GenerateInt(fis);
            while (gi.hasNext()) {
                // hasNext() itself consumes the value; nothing else to do
            }
            System.out.printf("Read int cost: %d", (System.currentTimeMillis() - begin));
        }
    }

    public static void main(String[] args) throws Exception {
        String randomFile = "/home/marvin/random.dat";
        readIntFromFile(randomFile);
    }
}
还有一种是找出 1亿个数中,出现次数最多的 100 个数 ,这种可以通过 trie树来查找