前边提到了单线程的实现,这里贴出多线程版,此处主要用多线程去处理hash后的小文件:
package com.kingdee.gmis.mass.data.ips; import static com.kingdee.gmis.mass.data.ips.MassIP.K10; import static com.kingdee.gmis.mass.data.ips.MassIP.getPartitionFile; import static com.kingdee.gmis.mass.data.ips.MassIP.partationCount; import static com.kingdee.gmis.mass.data.ips.MassIP.printResult; import static com.kingdee.gmis.mass.data.ips.MassIP.*; import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import com.kingdee.gmis.common.TopNHeap; import com.kingdee.gmis.mass.data.ips.MassIP.IPInfo; public class ConcurrentMassIP { private static int workerCnt = 3; /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { // generateMassIp("ip", "ips.txt", 100000000); // generatePartitionFile("ip", "ips.txt", 100); searchTopN(20); } private static AtomicInteger curIdx = new AtomicInteger(-1); static File[] smallFiles; static TopNHeap<IPInfo> destHeap; /** * 查找出现次数最多的K个ip地址 * * @param count * @throws Exception */ public static void searchTopN(int count) throws Exception { Thread.sleep(10000); long start = System.currentTimeMillis(); smallFiles = getPartitionFile("ip", partationCount); destHeap = new TopNHeap<MassIP.IPInfo>(count); int cnt = workerCnt; synchronized (lock) { for (int i = 0; i < cnt; i++) { new Thread(new Worker(i, count)).start(); } lock.wait(); printResult(destHeap); } System.out.println("Total spend " + (System.currentTimeMillis() - start) + " ms"); } static Object lock = new Object(); public static synchronized void mergeToResult(TopNHeap<IPInfo> srcHeap) { try { destHeap.mergeHeap(srcHeap); } finally { if (--workerCnt == 0) { synchronized (lock) { lock.notify(); } } } } static class Worker implements Runnable { private int topCount; private int id; public Worker(int id, int count) { this.id = id; this.topCount = count; } @Override public void run() { int curFileIdx; DataInputStream dis = null; Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>(); TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(topCount); int processCnt = 0; while ((curFileIdx = curIdx.addAndGet(1)) < partationCount) { processCnt++; ipCountMap.clear(); try { dis = new DataInputStream(new BufferedInputStream( new FileInputStream(smallFiles[curFileIdx]), K10)); while (dis.available() > 0) { int ip = dis.readInt(); Integer cnt = ipCountMap.get(ip); ipCountMap.put(ip, cnt == null ? 1 : cnt + 1); } searchMostCountIps(ipCountMap, heap); } catch (Exception e) { throw new RuntimeException(e); } finally { if (dis != null) { try { dis.close(); } catch (IOException e) { e.printStackTrace(); } } } } System.out.println("Thread " + this.id + " process " + processCnt + " files."); mergeToResult(heap); } } }
测试了下,3线程是120s左右,串行是320s左右,的确提高了很多。测试机子是4核cpu,如果启4个线程,机器都变得很卡,cpu居高不下。另jvm启动参数为:
-server -Xmx1024m -Xms1024m -Xmn600m -XX:+UseConcMarkSweepGC -XX:ParallelGCThreads=4
因为此处理过程中,大多数对象存活周期并不长,所以可以把新生代设置大一些。堆初始化设置大些,避免minor gc的时候才去扩展内存大小,因为可以预料到程序一旦启动,加载的内存的东西就会很多。
另外,此处垃圾收集器是用了cms,老生代内存回收就用了cms,新生代用了PurNew,并行收集的,设置gc线程数等于cpu内核数。当然这里也可以设置成-XX:+UseParallelGC、-XX:+UseParallelOldGC都可以,此处主要是新生代内存回收频繁,所以一定要把新生代设置成并发或并行版本的。
通过-server把虚拟机启动为server模式,这样运行时候会启用c2级别的JIT优化,能获得更高质量的编译代码。当然server模式下启动的jvm,默认使用的gc收集器跟-XX:UseParallelGC使用的一样