可以先下载一个词库,这里用的是搜狗的词库。
Segmentation.java
package cn.strong;
import java.util.HashMap;
/**
 * Dictionary-based word segmenter offering forward (FMM) and backward (BMM)
 * maximum matching.
 *
 * <p>Both methods return the input with {@code "/"} inserted between
 * segments. A single character that cannot be covered by a longer dictionary
 * word is always emitted as a segment of its own, so every input is fully
 * consumed and both loops are guaranteed to terminate.
 */
public class Segmentation {
    /** Word dictionary; only its keys (the words) are consulted. */
    private final HashMap<?, ?> mapDic;
    /** Keys are the word lengths (boxed integers) that occur in the dictionary. */
    private final HashMap<?, ?> len;

    /**
     * @param mapDic dictionary whose keys are the known words
     * @param len    map whose keys are the lengths of the known words; used as
     *               a cheap pre-filter before a candidate substring is built
     */
    public Segmentation(HashMap<?, ?> mapDic, HashMap<?, ?> len) {
        this.mapDic = mapDic;
        this.len = len;
    }

    /**
     * Forward maximum matching: starting at the left edge, repeatedly takes
     * the longest dictionary word that begins at the current position.
     *
     * @param source the text to segment
     * @return the segments joined by {@code "/"}; empty string for empty input
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public String Fmm(String source) {
        int total = source.length();
        StringBuilder result = new StringBuilder(total * 2);
        int start = 0;
        while (start < total) {
            // Shrink the candidate from the right until it is acceptable.
            // A one-character candidate is always accepted, so this stops.
            int end = total;
            while (!isSegment(source, start, end)) {
                end--;
            }
            if (start > 0) {
                result.append('/');
            }
            result.append(source, start, end);
            start = end;
        }
        return result.toString();
    }

    /**
     * Backward maximum matching: starting at the right edge, repeatedly takes
     * the longest dictionary word that ends at the current position.
     *
     * @param source the text to segment
     * @return the segments, in reading order, joined by {@code "/"}; empty
     *         string for empty input
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public String Bmm(String source) {
        int total = source.length();
        StringBuilder result = new StringBuilder(total * 2);
        int end = total;
        while (end > 0) {
            // Shrink the candidate from the left until it is acceptable.
            // A one-character candidate is always accepted, so this stops.
            int start = 0;
            while (!isSegment(source, start, end)) {
                start++;
            }
            // Segments are discovered right-to-left, so prepend each one.
            if (end < total) {
                result.insert(0, '/');
            }
            result.insert(0, source.substring(start, end));
            end = start;
        }
        return result.toString();
    }

    /**
     * Tells whether {@code source[start, end)} may be emitted as one segment:
     * either it is a single character, or its length occurs in {@code len}
     * and the substring itself is a key of {@code mapDic}.
     *
     * <p>The length test uses the candidate's own length ({@code end - start}),
     * not the absolute end index.
     */
    private boolean isSegment(String source, int start, int end) {
        int length = end - start;
        if (length == 1) {
            return true;
        }
        return len.containsKey(length) && mapDic.containsKey(source.substring(start, end));
    }
}
GenerateDictionary.java
package cn.strong;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
/**
 * Loads a line-oriented word list from a file into two hash maps.
 */
public class GenerateDictionary {
    /**
     * Reads {@code filename} line by line and records every line as one word.
     *
     * <p>For each line, {@code hm} receives {@code line -> line.length()} and
     * {@code len} receives {@code line.length() -> line}; a later word of the
     * same length overwrites the earlier entry in {@code len}, so {@code len}
     * is only useful for asking which lengths occur. Lines are stored exactly
     * as read (no trimming, empty lines included).
     *
     * <p>NOTE(review): {@link FileReader} decodes with the JVM's default
     * charset; confirm that matches the encoding of the dictionary file.
     *
     * @param filename path of the word list, one word per line
     * @param hm       map filled with word -> word length
     * @param len      map filled with word length -> a word of that length
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException           if reading or closing the file fails
     */
    // The raw map parameters are part of the existing public signature.
    @SuppressWarnings({"unchecked", "rawtypes"})
    public void GenHashDic(String filename, HashMap hm, HashMap len) throws FileNotFoundException, IOException {
        BufferedReader in = new BufferedReader(new FileReader(filename));
        try {
            String line;
            // Runs once per line of the word list.
            while ((line = in.readLine()) != null) {
                hm.put(line, line.length());
                len.put(line.length(), line);
            }
        } finally {
            // Release the file handle even when reading fails part-way.
            in.close();
        }
    }
}
Test.java(测试类)
package cn.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import cn.strong.GenerateDictionary;
import cn.strong.Segmentation;
/**
 * Console driver: loads the word list, reads one sentence from standard
 * input, segments it with both FMM and BMM, and prints timings and results.
 */
public class Test {
    /**
     * @param args unused
     * @throws IOException if the dictionary file or standard input cannot be read
     */
    public static void main(String[] args) throws IOException {
        String filename = "sougou.txt";
        HashMap hm = new HashMap();
        HashMap len = new HashMap();
        GenerateDictionary genDic = new GenerateDictionary();

        // Build the dictionary and report how long loading took (ms).
        long genStart = System.currentTimeMillis();
        genDic.GenHashDic(filename, hm, len);
        System.out.println("GenHashDic 消耗时间:"+(System.currentTimeMillis() - genStart));

        System.out.println("请输入您需要分解的语句:");
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        String data = br.readLine();
        if (data == null) {
            // readLine() returns null at end of input (e.g. closed/empty stdin);
            // treat that as an empty sentence instead of failing with an NPE.
            data = "";
        }

        Segmentation seg = new Segmentation(hm, len);

        long fmmStart = System.currentTimeMillis();
        String FmmTarget = seg.Fmm(data);
        System.out.println("FMM 算法共花费时间为:"+(System.currentTimeMillis()-fmmStart));

        long bmmStart = System.currentTimeMillis();
        String BmmTarget = seg.Bmm(data);
        System.out.println("BMM 算法共花费时间为:"+(System.currentTimeMillis()-bmmStart));

        System.out.println("FMM算法统计结果为: " + FmmTarget);
        System.out.println("BMM算法统计结果为: " + BmmTarget);
    }
}
GenHashDic 消耗时间:1337
请输入您需要分解的语句:
中国四川省成都市青羊区少城路9号人民公园
FMM 算法共花费时间为:0
BMM 算法共花费时间为:0
FMM算法统计结果为: 中国/四川省/成都市/青羊区/少/城/路/9/号/人民/公园
BMM算法统计结果为: 中国/四川省/成都市/青羊区/少/城/路/9/号/人民/公园