求集合内两两字符串的编辑距离,先使用暴力方法,之后再介绍用kd树的方法
主线程负责任务分发:把字符串依次提交到线程池的任务队列中,线程池内的线程负责计算这个字符串和其他所有字符串的编辑距离,然后把计算结果写入BlockingQueue中;另外再起一个写线程,把BlockingQueue中的结果写入磁盘。这里有一个技巧:当所有字符串都处理完时,写线程可能正在把队列中的结果写入磁盘,也可能阻塞在take函数处。主线程就在队列末尾放入一个结束标记(poison),写线程取到这个标记之后就跳出循环(例如抛出一个异常)并结束。
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

/**
 * Brute-force nearest-neighbour search by edit distance.
 *
 * <p>For every word of the input file, the distance to every other word is
 * computed on a fixed-size thread pool. Each worker puts one result line on a
 * blocking queue; a single writer thread drains that queue into the output
 * file and stops when it takes the end-of-stream marker.
 */
public class EditDistance {
    /** End-of-stream marker. Result lines always contain a tab, so they never equal it. */
    private static final String POISON = "!!!POISON";
    /** Maximum number of nearest words written per input word. */
    private static final int TOP_N = 20;
    /** Number of worker threads computing distances. */
    private static final int WORKER_THREADS = 7;
    /** Charset used for both the input and the output file. */
    private static final String CHARSET = "utf-8";

    ExecutorService service;
    List<String> ls;
    LinkedBlockingQueue<String> outque;

    /** A candidate word together with its edit distance to the word being processed. */
    class WordDist implements Comparable<WordDist> {
        public String word;
        public int dist;

        public WordDist(String wd, int d) {
            word = wd;
            dist = d;
        }

        /** Orders by ascending distance, so the closest word is at the head of a priority queue. */
        @Override
        public int compareTo(WordDist o) {
            return dist < o.dist ? -1 : (dist == o.dist ? 0 : 1);
        }

        @Override
        public String toString() {
            return word + " " + dist;
        }
    }

    /** Worker task: ranks all other words by distance to {@code name} and queues one result line. */
    class Process implements Runnable {
        String name;

        public Process(String s) {
            name = s;
        }

        @Override
        public void run() {
            PriorityQueue<WordDist> que = new PriorityQueue<WordDist>();
            for (String other : ls) {
                // Identical strings (the word itself and its duplicates) are skipped.
                if (!name.equals(other)) {
                    que.add(new WordDist(other, calcEditDist(name, other)));
                }
            }

            // Line format: name, a tab, then up to TOP_N "word dist " entries.
            StringBuilder sb = new StringBuilder();
            sb.append(name).append('\t');
            for (int count = 0; count < TOP_N && !que.isEmpty(); ++count) {
                sb.append(que.poll()).append(' ');
            }

            try {
                outque.put(sb.toString());
            } catch (InterruptedException e) {
                System.out.println(Thread.currentThread().getName() + " " + name);
                e.printStackTrace();
                Thread.currentThread().interrupt(); // keep the interrupt visible to the pool
            }
        }
    }

    /** Consumer task: writes queued result lines to the output file until the poison marker arrives. */
    class Writer implements Runnable {
        PrintWriter pw;

        /**
         * Opens the output file.
         *
         * @param fn path of the output file
         * @throws IOException if the file cannot be opened; failing here prevents
         *         a writer with a null stream from being started
         */
        public Writer(String fn) throws IOException {
            pw = new PrintWriter(fn, CHARSET);
        }

        @Override
        public void run() {
            try {
                while (true) {
                    String rst = outque.take();
                    if (POISON.equals(rst)) {
                        System.out.println("meet a poison. over");
                        break;
                    }
                    pw.println(rst);
                }
            } catch (InterruptedException e) {
                System.out.println(e);
                Thread.currentThread().interrupt();
            } finally {
                pw.close();
                // PrintWriter never throws on write errors; it only records them.
                if (pw.checkError()) {
                    System.out.println("Writer hit an I/O error, output may be incomplete");
                }
                System.out.println("Writer is over");
            }
        }
    }

    /**
     * Computes the edit distance between two strings, where an insertion, a
     * deletion and a replacement each cost 1.
     *
     * @param s1 first string
     * @param s2 second string
     * @return the minimum number of single-character edits turning {@code s1} into {@code s2}
     */
    public int calcEditDist(String s1, String s2) {
        int rows = s1.length();
        int cols = s2.length();
        // mat[i][j] = distance between the first i chars of s1 and the first j chars of s2.
        int[][] mat = new int[rows + 1][cols + 1];
        for (int i = 1; i <= rows; ++i) {
            mat[i][0] = i;
        }
        for (int j = 1; j <= cols; ++j) {
            mat[0][j] = j;
        }
        for (int i = 1; i <= rows; ++i) {
            for (int j = 1; j <= cols; ++j) {
                int repCost = (s1.charAt(i - 1) == s2.charAt(j - 1)) ? 0 : 1;
                mat[i][j] = min(mat[i - 1][j - 1] + repCost, mat[i - 1][j] + 1, mat[i][j - 1] + 1);
            }
        }
        return mat[rows][cols];
    }

    /** Returns the smallest of three ints. */
    private int min(int a, int b, int c) {
        int m = a;
        if (m > b) {
            m = b;
        }
        if (m > c) {
            m = c;
        }
        return m;
    }

    /**
     * Reads the words from {@code in}, computes for each word its nearest
     * words by edit distance, and writes one result line per word to
     * {@code out}. Returns after the writer thread has finished.
     *
     * @param in  input file; the first field of each line, delimited by
     *            {@code " ||| "}, is taken as the word
     * @param out output file
     */
    public void genCluster(String in, String out) {
        try {
            ls = readWords(in);
            outque = new LinkedBlockingQueue<String>();
            // Open the output before any work is queued so an unwritable path fails fast.
            Thread writer = new Thread(new Writer(out));
            service = Executors.newFixedThreadPool(WORKER_THREADS);
            writer.start();

            for (int i = 0; i < ls.size(); ++i) {
                service.submit(new Process(ls.get(i)));
            }
            // No new tasks are accepted after shutdown(); already submitted ones still run.
            service.shutdown();

            boolean interrupted = false;
            try {
                // awaitTermination returns false on timeout. A single bounded wait let
                // the poison be sent while workers were still producing, so keep
                // waiting until the pool has really terminated.
                while (!service.awaitTermination(1, TimeUnit.HOURS)) {
                    // workers still running
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
                System.out.println("main await error");
                service.shutdownNow(); // cancel the remaining work
                interrupted = true;
            }

            // The queue is unbounded, so offer() cannot fail and cannot be interrupted.
            outque.offer(POISON);
            try {
                writer.join();
            } catch (InterruptedException e) {
                interrupted = true;
            }
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Reads the first {@code " ||| "}-delimited field of every non-empty line of {@code in}. */
    private List<String> readWords(String in) throws IOException {
        List<String> words = new ArrayList<String>();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(in), CHARSET));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                Scanner scan = new Scanner(line);
                scan.useDelimiter(" \\|\\|\\| ");
                // An empty line has no token; next() would throw NoSuchElementException.
                if (scan.hasNext()) {
                    words.add(scan.next());
                }
                scan.close();
            }
        } finally {
            br.close();
        }
        return words;
    }

    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("in out");
            return;
        }
        EditDistance dist = new EditDistance();
        dist.genCluster(args[0], args[1]);
    }
}
service.awaitTermination(24, TimeUnit.HOURS);
这个函数出问题了:awaitTermination超时之后会直接返回false,而不是继续等待。另外,Java多线程和C有个不同之处在于,只要还有一个非后台(daemon)线程在跑着,即使main线程执行结束了(用jstack已经看不到main线程了),其他线程也会继续执行,进程不会退出。
嗯,用单次24小时的awaitTermination时,程序跑了不止24小时:main线程执行结束了,也用poison把写线程结束了,只剩下生产者线程还在跑,它们之后算出的结果再也没有线程写入磁盘了……