mmseg4j支持单个字母、数字及组合搜索

原文地址:http://blog.csdn.net/july_2/article/details/24481935         
如题,看到这个题目也许会觉得这个功能有些多余:字母、数字连在一起时,分词结果也是连在一起的,不会单独分出来,这也符合一般的搜索需求。如输入:

        String txt = "IBM12二次修改123";        分词效果:

        i bm |123 | 二 | 次 | 修 | 改

        现在,有一个需求:需要对字母、数字都分词,分词效果要达到:

        i | b | m |  1 | 2 | 3 | 二 | 次 | 修 | 改

        类似在数据库中使用like加百分号双向查询效果,使用最初版本的mmseg4j无法满足需求,经过阅读mmseg4j部分源代码,稍微修改了一点点,即可满足需求(暂不考虑效率)。

  •  修改前,通过完整单词可以查询到结果,但通过单个字母查询不到,如下图:

         单词完全匹配搜索:

           字母模糊搜索:

  • 修改mmseg4j源代码MMSeg.java中的next部分代码,其实就是屏蔽了部分代码,很简单:

        

[plain]  view plain copy
  1. public Word next() throws IOException {  
  2.         //先从缓存中取  
  3.         Word word = bufWord.poll();  
  4.         if(word == null) {  
  5.             bufSentence.setLength(0);  
  6.   
  7.             int data = -1;  
  8.             boolean read = true;  
  9. //          while(read && (data=readNext()) != -1) {  
  10.             while((data=readNext()) != -1) {  
  11.                 read = false;   //默认一次可以读出同一类字符,就可以分词内容  
  12.                 int type = Character.getType(data);  
  13.                 String wordType = Word.TYPE_WORD;  
  14.                 switch(type) {  
  15.                 case Character.UPPERCASE_LETTER:  
  16.                 case Character.LOWERCASE_LETTER:  
  17.                 case Character.TITLECASE_LETTER:  
  18.                 case Character.MODIFIER_LETTER:  
  19.                     /*  
  20.                      * 1. 0x410-0x44f -> А-я //俄文  
  21.                      * 2. 0x391-0x3a9 -> Α-Ω //希腊大写  
  22.                      * 3. 0x3b1-0x3c9 -> α-ω //希腊小写  
  23.                      */  
  24.                     data = toAscii(data);  
  25.                     NationLetter nl = getNation(data);  
  26.                     if(nl == NationLetter.UNKNOW) {  
  27.                         read = true;  
  28.                         break;  
  29.                     }  
  30.                     wordType = Word.TYPE_LETTER;  
  31.                     bufSentence.appendCodePoint(data);  
  32.                     switch(nl) {  
  33.                     case EN:  
  34.                         //字母后面的数字,如: VH049PA  
  35. //                      ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();  
  36. //                      readChars(bufSentence, rcad);  
  37. //                      if(rcad.hasDigit()) {  
  38. //                          wordType = Word.TYPE_LETTER_OR_DIGIT;  
  39. //                      }  
  40.                         //only english  
  41.                         //readChars(bufSentence, new ReadCharByAscii());  
  42.                         break;  
  43.                     case RA:  
  44.                         readChars(bufSentence, new ReadCharByRussia());  
  45.                         break;  
  46.                     case GE:  
  47.                         readChars(bufSentence, new ReadCharByGreece());  
  48.                         break;  
  49.                     }  
  50.                     bufWord.add(createWord(bufSentence, wordType));  
  51.   
  52.                     bufSentence.setLength(0);  
  53.   
  54.                     break;  
  55.                 case Character.OTHER_LETTER:  
  56.                     /*  
  57.                      * 1. 0x3041-0x30f6 -> ぁ-ヶ   //日文(平|片)假名  
  58.                      * 2. 0x3105-0x3129 -> ㄅ-ㄩ   //注意符号  
  59.                      */  
  60.                     bufSentence.appendCodePoint(data);  
  61.                     readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));  
  62.   
  63.                     currentSentence = createSentence(bufSentence);  
  64.   
  65.                     bufSentence.setLength(0);  
  66.   
  67.                     break;  
  68.                 case Character.DECIMAL_DIGIT_NUMBER:  
  69.                     bufSentence.appendCodePoint(toAscii(data));  
  70. //                  readChars(bufSentence, new ReadCharDigit());    //读后面的数字, AsciiLetterOr  
  71.                     wordType = Word.TYPE_DIGIT;  
  72.                     int d = readNext();  
  73.                     if(d > -1) {  
  74.                         if(seg.isUnit(d)) { //单位,如时间  
  75.                             bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT)); //先把数字添加(独立)  
  76.   
  77.                             bufSentence.setLength(0);  
  78.   
  79.                             bufSentence.appendCodePoint(d);  
  80.                             wordType = Word.TYPE_WORD;  //单位是 word  
  81.                         } else {    //后面可能是字母和数字  
  82.                             pushBack(d);  
  83. //                          if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) {   //如果有字母或数字都会连在一起.  
  84. //                              wordType = Word.TYPE_DIGIT_OR_LETTER;  
  85. //                          }  
  86.                         }  
  87.                     }  
  88.   
  89.                     bufWord.add(createWord(bufSentence, wordType));  
  90.   
  91.   
  92.                     bufSentence.setLength(0);   //缓存的字符清除  
  93.   
  94.                     break;  
  95.                 case Character.LETTER_NUMBER:  
  96.                     // ⅠⅡⅢ 单分  
  97.                     bufSentence.appendCodePoint(data);  
  98.                     readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));  
  99.   
  100.                     int startIdx = startIdx(bufSentence);  
  101.                     for(int i=0; i
  102.                         bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));  
  103.                     }  
  104.   
  105.                     bufSentence.setLength(0);   //缓存的字符清除  
  106.   
  107.                     break;  
  108.                 case Character.OTHER_NUMBER:  
  109.                     //①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇ 连着用  
  110.                     bufSentence.appendCodePoint(data);  
  111.                     readChars(bufSentence, new ReadCharByType(Character.OTHER_NUMBER));  
  112.   
  113.                     bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));  
  114.                     bufSentence.setLength(0);  
  115.                     break;  
  116.                 default :  
  117.                     //其它认为无效字符  
  118.                     read = true;  
  119.                 }//switch  
  120.             }  
  121.                   
  122.             // 中文分词  
  123.             if(currentSentence != null) {  
  124.                 do {  
  125.                     Chunk chunk = seg.seg(currentSentence);  
  126.                     for(int i=0; i
  127.                         bufWord.add(chunk.getWords()[i]);  
  128.                     }  
  129.                 } while (!currentSentence.isFinish());  
  130.                   
  131.                 currentSentence = null;  
  132.             }  
  133.               
  134.             word = bufWord.poll();  
  135.         }  
  136.           
  137.         return word;  
  138.     }  
         主要是注释了一些代码,对字母、数字不要连续处理。

  • 再次搜索字母查询,效果如下:


         综上,这样就简单完成了数据库中类似like和百分号双向匹配需求。

你可能感兴趣的:(solr)