package com.qfang.util; import java.io.BufferedReader; import java.io.IOException; import java.io.PushbackReader; import java.io.Reader; import java.util.LinkedList; import java.util.Queue; import com.chenlb.mmseg4j.Chunk; import com.chenlb.mmseg4j.Seg; import com.chenlb.mmseg4j.Sentence; import com.chenlb.mmseg4j.Word; /** * 本类修改自{@link com.chenlb.mmseg4j.MMSeg}</br> * NOTE:<ul> * <li>1.<b>非线程安全</b></li> * <li>2.修改了{@link com.chenlb.mmseg4j.MMSeg}对于英文、数字、汉字混合关键字的处理逻辑,</br> * 现在可以混合使用中英数字文了,英文和数字不会变成断字符</li> * <li>3.不可将此类用在其他不需要混合中英文关键字的情况。请使用原类{@link com.chenlb.mmseg4j.MMSeg}</li> * <li>4.由于混合处理,生成的{@link com.chenlb.mmseg4j.Word}类的数字、英文、汉字的type都是</br>TYPE_WORD = "word", * 尚未在lucene查询索引使用MMSegAnalyser实验过,请自行测试</li> * </ul> * * @author yoara */ public class CnEnMMSeg { private PushbackReader reader; private Seg seg; private StringBuilder bufSentence = new StringBuilder(256); private Sentence currentSentence; private Queue<Word> bufWord; // word 缓存, 因为有 chunk 分析三个以上. public CnEnMMSeg(Reader input, Seg seg) { this.seg = seg; reset(input); } private int readedIdx = 0; public void reset(Reader input) { this.reader = new PushbackReader(new BufferedReader(input), 20); currentSentence = null; bufWord = new LinkedList<Word>(); bufSentence.setLength(0); readedIdx = -1; } private int readNext() throws IOException { int d = reader.read(); if(d > -1) { readedIdx++; //d = Character.toLowerCase(d); } return d; } private void pushBack(int data) throws IOException { readedIdx--; reader.unread(data); } public Word next() throws IOException { //先从缓存中取 Word word = bufWord.poll();; if(word == null) { bufSentence.setLength(0); int data = -1; boolean read = true; while(read && (data=readNext()) != -1) { read = false; //默认一次可以读出同一类字符,就可以分词内容 int type = Character.getType(data); String wordType = Word.TYPE_WORD; switch(type) { case Character.UPPERCASE_LETTER: case Character.LOWERCASE_LETTER: case Character.TITLECASE_LETTER: case Character.MODIFIER_LETTER: /* * 1. 0x410-0x44f -> А-я //俄文 * 2. 0x391-0x3a9 -> Α-Ω //希腊大写 * 3. 0x3b1-0x3c9 -> α-ω //希腊小写 */ data = toAscii(data); NationLetter nl = getNation(data); if(nl == NationLetter.UNKNOW) { read = true; break; } wordType = Word.TYPE_LETTER; bufSentence.appendCodePoint(data); switch(nl) { case EN: //字母后面的数字,如: VH049PA readChars(bufSentence, new ReadCharByAsciiOrDigitOrOther()); currentSentence = createSentence(bufSentence); break; case RA: readChars(bufSentence, new ReadCharByRussia()); bufWord.add(createWord(bufSentence, wordType)); break; case GE: readChars(bufSentence, new ReadCharByGreece()); bufWord.add(createWord(bufSentence, wordType)); break; } bufSentence.setLength(0); break; case Character.OTHER_LETTER: case Character.DECIMAL_DIGIT_NUMBER: /* * 1. 0x3041-0x30f6 -> ぁ-ヶ //日文(平|片)假名 * 2. 0x3105-0x3129 -> ㄅ-ㄩ //注意符号 * 3. 数字 */ bufSentence.appendCodePoint(data); readChars(bufSentence, new ReadCharByAsciiOrDigitOrOther()); currentSentence = createSentence(bufSentence); bufSentence.setLength(0); break; case Character.LETTER_NUMBER: // ⅠⅡⅢ 单分 bufSentence.appendCodePoint(data); readChars(bufSentence, new ReadCharByType(new int[]{Character.LETTER_NUMBER})); int startIdx = startIdx(bufSentence); for(int i=0; i<bufSentence.length(); i++) { bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER)); } bufSentence.setLength(0); //缓存的字符清除 break; case Character.OTHER_NUMBER: //①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇ 连着用 bufSentence.appendCodePoint(data); readChars(bufSentence, new ReadCharByType(new int[]{Character.OTHER_NUMBER})); bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER)); bufSentence.setLength(0); break; default : //其它认为无效字符 read = true; }//switch } // 中文分词 if(currentSentence != null) { do { Chunk chunk = seg.seg(currentSentence); for(int i=0; i<chunk.getCount(); i++) { bufWord.add(chunk.getWords()[i]); } } while (!currentSentence.isFinish()); currentSentence = null; } word = bufWord.poll(); } return word; } /** * 读取下一串指定类型字符. * * @author chenlb 2009-8-15下午09:09:50 */ private static abstract class ReadChar { /** * 这个字符是否读取, 不读取也不会读下一个字符. */ abstract boolean isRead(int codePoint); int transform(int codePoint) { return codePoint; } } /** * 读取下一串指定类型的字符放到 bufSentence 中. * @param bufSentence * @param readChar 判断字符的细节. * @return 返回读取的个数 * @throws IOException {@link #readNext()} 或 {@link #pushBack()} 抛出的. */ private int readChars(StringBuilder bufSentence, ReadChar readChar) throws IOException { int num = 0; int data = -1; while((data = readNext()) != -1) { int d = readChar.transform(data); if(readChar.isRead(d)) { bufSentence.appendCodePoint(d); num++; } else { //不是数字回压,要下一步操作 pushBack(data); break; } } return num; } /**读取数字*/ private static class ReadCharDigit extends ReadChar { boolean isRead(int codePoint) { int type = Character.getType(codePoint); return isDigit(type); } int transform(int codePoint) { return toAscii(codePoint); } } /**读取字母或数字*/ private static class ReadCharByAsciiOrDigit extends ReadCharDigit { boolean isRead(int codePoint) { boolean isRead = super.isRead(codePoint); return isAsciiLetter(codePoint) || isRead; } } /**读取字母或数字或其他字符*/ private static class ReadCharByAsciiOrDigitOrOther extends ReadCharByAsciiOrDigit { boolean isRead(int codePoint) { boolean isRead = super.isRead(codePoint); int type = Character.getType(codePoint); return isRead ||isCJK(type); } } /**读取字母*/ @SuppressWarnings("unused") private static class ReadCharByAscii extends ReadCharDigit { boolean isRead(int codePoint) { return isAsciiLetter(codePoint); } } /**读取俄语*/ private static class ReadCharByRussia extends ReadCharDigit { boolean isRead(int codePoint) { return isRussiaLetter(codePoint); } } /**读取希腊 */ private static class ReadCharByGreece extends ReadCharDigit { boolean isRead(int codePoint) { return isGreeceLetter(codePoint); } } /**读取指定类型组的字符*/ private static class ReadCharByType extends ReadChar { int[] charType; public ReadCharByType(int[] charType) { this.charType = charType; } boolean isRead(int codePoint) { int type = Character.getType(codePoint); boolean isRead = false; for(int cType:charType){ isRead = (type == cType); } return isRead; } } private Word createWord(StringBuilder bufSentence, String type) { return new Word(toChars(bufSentence), startIdx(bufSentence), type); } private Word createWord(StringBuilder bufSentence, int startIdx, String type) { return new Word(toChars(bufSentence), startIdx, type); } private Sentence createSentence(StringBuilder bufSentence) { return new Sentence(toChars(bufSentence), startIdx(bufSentence)); } /**取得 bufSentence 的第一个字符在整个文本中的位置*/ private int startIdx(StringBuilder bufSentence) { return readedIdx - bufSentence.length() + 1; } /**从 StringBuilder 里复制出 char[] */ private static char[] toChars(StringBuilder bufSentence) { char[] chs = new char[bufSentence.length()]; bufSentence.getChars(0, bufSentence.length(), chs, 0); return chs; } /** * 双角转单角 */ private static int toAscii(int codePoint) { if((codePoint>=65296 && codePoint<=65305) //0-9 || (codePoint>=65313 && codePoint<=65338) //A-Z || (codePoint>=65345 && codePoint<=65370) //a-z ) { codePoint -= 65248; } return codePoint; } private static boolean isAsciiLetter(int codePoint) { return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z'); } private static boolean isRussiaLetter(int codePoint) { return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё'; } private static boolean isGreeceLetter(int codePoint) { return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω'); } /** * EN -> 英语 * RA -> 俄语 * GE -> 希腊 * */ private static enum NationLetter {EN, RA, GE, UNKNOW}; private NationLetter getNation(int codePoint) { if(isAsciiLetter(codePoint)) { return NationLetter.EN; } if(isRussiaLetter(codePoint)) { return NationLetter.RA; } if(isGreeceLetter(codePoint)) { return NationLetter.GE; } return NationLetter.UNKNOW; } private static boolean isCJK(int type) { return type == Character.OTHER_LETTER; } private static boolean isDigit(int type) { return type == Character.DECIMAL_DIGIT_NUMBER; } @SuppressWarnings("unused") private static boolean isLetter(int type) { return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER; } }
使用方式:
/** 获得分词算法处理类 * @param text 输入文本段 * @param dicPath 文件路径,用于判断生成哪类词库 * **/ private static CnEnMMSeg makeMMSeg(String text, String dicPath) { //数据对象转换成流 StringReader input = new StringReader(text); Dictionary dic = getDictionary(dicPath); //选用正向最大匹配 Seg seg = new ComplexSeg(dic); CnEnMMSeg mmSeg = new CnEnMMSeg(input, seg); return mmSeg; }