Lucene:如何写一个自定义的分词器

实现一个自定义分词器

实现一个简单的英文分词器,主要分为以下几个步骤:

1.建立自己的Attribute接口MyCharAttribute

 1 /**
 2 * MyCharAttribute
 3 *
 4 * @author limingcheng
 5 * @Date 2019/11/28
 6 */
 7 public interface MyCharAttribute extends Attribute {
 8 void setChars(char[] buffer, int length);
 9 
10 char[] getChars();
11 
12 int getLength();
13 
14 String getString();
15 }

2.建立自定义attribute接口MyCharAttribute的实现类MyCharAttributeImpl

  1 /**
  2 * MyCharAttributeImpl
  3 * 2.建立自定义attribute接口MyCharAttribute的实现类MyCharAttributeImpl
  4 * 注意:MyCharAttributeImpl一定要和MyCharAttribute放在一个包下,否则会出现没有MyCharAttribute的实现类,
  5 * 这是由org.apache.lucene.util.AttributeFactory.DefaultAttributeFactory.findImplClass(Class)这个方法决定的
  6 * @author limingcheng
  7 * @Date 2019/11/28
  8 */
  9 public class MyCharAttributeImpl extends AttributeImpl implements MyCharAttribute {
 10 
 11 private char[] chatTerm = new char[255];
 12 private int length = 0;
 13 
 14 @Override
 15 public void setChars(char[] buffer, int length) {
 16 this.length = length;
 17 if (length > 0) {
 18 System.arraycopy(buffer, 0, this.chatTerm, 0, length);
 19 }
 20 }
 21 
 22 @Override
 23 public char[] getChars() {
 24 return this.chatTerm;
 25 }
 26 
 27 @Override
 28 public int getLength() {
 29 return this.length;
 30 }
 31 
 32 @Override
 33 public String getString() {
 34 if (this.length > 0) {
 35 return new String(this.chatTerm, 0, length);
 36 }
 37 return null;
 38 }
 39 
 40 @Override
 41 public void clear() {
 42 this.length = 0;
 43 }
 44 
 45 @Override
 46 public void reflectWith(AttributeReflector reflector) {
 47 
 48 }
 49 
 50 @Override
 51 public void copyTo(AttributeImpl target) {
 52 
 53 }
 54 }
3.建立分词器MyWhitespaceTokenizer:实现对英文按空白字符进行分词
 56 /**
 57 * MyWhitespaceTokenizer
 58 *
 59 * 3. 建立分词器MyWhitespaceTokenizer:实现对英文按空白字符进行分词
 60 * @author limingcheng
 61 * @Date 2019/11/28
 62 */
 63 public class MyWhitespaceTokenizer extends Tokenizer {
 64 
 65 // 需要记录的属性
 66 //
 67 MyCharAttribute charAttr = this.addAttribute(MyCharAttribute.class);
 68 
 69 // 存词的出现位置
 70 
 71 // 存放词的偏移
 72 
 73 //
 74 char[] buffer = new char[255];
 75 int length = 0;
 76 int c;
 77 
 78 @Override
 79 public boolean incrementToken() throws IOException {
 80 // 清除所有的词项属性
 81 clearAttributes();
 82 length = 0;
 83 while (true) {
 84 c = this.input.read();
 85 
 86 if (c == -1) {
 87 if (length > 0) {
 88 // 复制到charAttr
 89 this.charAttr.setChars(buffer, length);
 90 return true;
 91 } else {
 92 return false;
 93 }
 94 }
 95 
 96 if (Character.isWhitespace(c)) {
 97 if (length > 0) {
 98 // 复制到charAttr
 99 this.charAttr.setChars(buffer, length);
100 return true;
101 }
102 }
103 
104 buffer[length++] = (char) c;
105 }
106 }
107 
108 }

4.建立分项过滤器:把大写字母转换为小写字母

 1 /**
 2 * MyLowerCaseTokenFilter
 3 *
 4 * 4.建立分项过滤器:把大写字母转换为小写字母
 5 * @author limingcheng
 6 * @Date 2019/11/28
 7 */
 8 public class MyLowerCaseTokenFilter extends TokenFilter {
 9 public MyLowerCaseTokenFilter(TokenStream input) {
10 super(input);
11 }
12 
13 MyCharAttribute charAttr = this.addAttribute(MyCharAttribute.class);
14 
15 @Override
16 public boolean incrementToken() throws IOException {
17 boolean res = this.input.incrementToken();
18 if (res) {
19 char[] chars = charAttr.getChars();
20 int length = charAttr.getLength();
21 if (length > 0) {
22 for (int i = 0; i < length; i++) {
23 chars[i] = Character.toLowerCase(chars[i]);
24 }
25 }
26 }
27 return res;
28 }
29 }

5.建立分析器

 1 /**
 2 * MyWhitespaceAnalyzer
 3 *
 4 * 5. 建立分析器
 5 * @author limingcheng
 6 * @Date 2019/11/28
 7 */
 8 public class MyWhitespaceAnalyzer extends Analyzer {
 9 @Override
10 protected TokenStreamComponents createComponents(String fieldName) {
11 Tokenizer source = new MyWhitespaceTokenizer();
12 TokenStream filter = new MyLowerCaseTokenFilter(source);
13 return new TokenStreamComponents(source, filter);
14 }
15 
16 public static void main(String[] args) {
17 
18 String text = "广州华为有限公司 An AttributeSource contains a list of different AttributeImpls, and methods to add and get them. ";
19 
20 try {
21 Analyzer ana = new MyWhitespaceAnalyzer();
22 TokenStream ts = ana.tokenStream("aa", text);
23 MyCharAttribute ca = ts.getAttribute(MyCharAttribute.class);
24 ts.reset();
25 while (ts.incrementToken()) {
26 System.out.print(ca.getString() + "|");
27 }
28 ts.end();
29 ana.close();
30 System.out.println();
31 } catch (IOException e) {
32 e.printStackTrace();
33 }
34 
35 }
36 }

一个简单的分词器可以这样实现,但是要实现一个可以对中文分词的分词器就需要算法方面的知识了。

本文参考:https://www.cnblogs.com/leeSmall/p/8993185.html

你可能感兴趣的:(Lucene:如何写一个自定义的分词器)