需要修改IKAnalyzer.java、IKTokenizer.java、IKTokenizerFactory.java。
1
import java.io.Reader;
2
import org.apache.lucene.analysis.Analyzer;
3
import org.apache.lucene.analysis.Tokenizer;
4
5
/**
6
* 实现Lucene Analyzer 基于IKTokenizer的中文分词器
7
*
8
*
@author
林良益
9
*
10
*/
11
public
final
class IKAnalyzer
extends Analyzer {
12
13
private
boolean isMaxWordLength =
false;
14
15
/**
16
* IK分词器Lucene Analyzer接口实现类 默认最细粒度切分算法
17
*/
18
public IKAnalyzer() {
19
this(
false);
20 }
21
22
/**
23
* IK分词器Lucene Analyzer接口实现类
24
*
25
*
@param
isMaxWordLength
26
* 当为true时,分词器进行最大词长切分
27
*/
28
public IKAnalyzer(
boolean isMaxWordLength) {
29
super();
30
this.setMaxWordLength(isMaxWordLength);
31 }
32
33 @Override
34
public TokenStreamComponents createComponents(String fieldName,
35 Reader reader) {
36 Tokenizer tokenizer =
new IKTokenizer(reader, isMaxWordLength());
37
return
new TokenStreamComponents(tokenizer,
null);
38 }
39
40
public
void setMaxWordLength(
boolean isMaxWordLength) {
41
this.isMaxWordLength = isMaxWordLength;
42 }
43
44
public
boolean isMaxWordLength() {
45
return isMaxWordLength;
46 }
47
48 }
1
import java.io.IOException;
2
import java.io.Reader;
3
4
import org.apache.lucene.analysis.Tokenizer;
5
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
6
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
7
import org.wltea.analyzer.IKSegmentation;
8
import org.wltea.analyzer.Lexeme;
9
10
/**
11
* IK Analyzer v3.2 Lucene4.x Tokenizer适配器类 它封装了IKSegmentation实现
12
*
13
*
@author
林良益
14
*
15
*/
16
public
final
class IKTokenizer
extends Tokenizer {
17
//
IK分词器实现
18
private IKSegmentation _IKImplement;
19
//
词元文本属性
20
private CharTermAttribute termAtt;
21
//
词元位移属性
22
private OffsetAttribute offsetAtt;
23
//
记录最后一个词元的结束位置
24
private
int finalOffset;
25
26
/**
27
* Lucene Tokenizer适配器类构造函数
28
*
29
*
@param
in
30
*
@param
isMaxWordLength
31
* 当为true时,分词器进行最大词长切分;当为false是,采用最细粒度切分
32
*/
33
public IKTokenizer(Reader in,
boolean isMaxWordLength) {
34
super(in);
35 offsetAtt = addAttribute(OffsetAttribute.
class);
36 termAtt = addAttribute(CharTermAttribute.
class);
37 _IKImplement =
new IKSegmentation(in, isMaxWordLength);
38 }
39
40 @Override
41
public
final
boolean incrementToken()
throws IOException {
42
//
清除所有的词元属性
43
clearAttributes();
44 Lexeme nextLexeme = _IKImplement.next();
45
if (nextLexeme !=
null) {
46
//
将Lexeme转成Attributes
47
//
设置词元文本
48
termAtt.setEmpty().append(nextLexeme.getLexemeText());
49
//
设置词元位移
50
offsetAtt.setOffset(nextLexeme.getBeginPosition(),
51 nextLexeme.getEndPosition());
52 offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
53 finalOffset = nextLexeme.getEndPosition();
54
//
返会true告知还有下个词元
55
return
true;
56 }
57
//
返会false告知词元输出完毕
58
return
false;
59 }
60
61
/*
62
* (non-Javadoc)
63
*
64
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
65
*/
66
public
void reset()
throws IOException {
67
super.reset();
68 _IKImplement.reset(input);
69 }
70
71 @Override
72
public
final
void end() {
73 offsetAtt.setOffset(finalOffset, finalOffset);
74 }
75 }
1
import java.io.Reader;
2
import java.util.Map;
3
4
import org.apache.lucene.analysis.Tokenizer;
5
import org.apache.lucene.analysis.util.TokenizerFactory;
6
import org.wltea.analyzer.lucene.IKTokenizer;
7
8
/**
9
* 实现Solr4.x分词器接口
10
* 基于IKTokenizer的实现
11
*
12
*
@author
林良益、李良杰
13
*
14
*/
15
public
final
class IKTokenizerFactory
extends TokenizerFactory{
16
17
private
boolean isMaxWordLength =
false;
18
19
/**
20
* IK分词器Solr TokenizerFactory接口实现类
21
* 默认最细粒度切分算法
22
*/
23
public IKTokenizerFactory(){
24 }
25
26
/*
27
* (non-Javadoc)
28
* @see org.apache.solr.analysis.BaseTokenizerFactory#init(java.util.Map)
29
*/
30
public
void init(Map<String,String> args){
31 String _arg = args.get("isMaxWordLength");
32 isMaxWordLength = Boolean.parseBoolean(_arg);
33 }
34
35
/*
36
* (non-Javadoc)
37
* @see org.apache.solr.analysis.TokenizerFactory#create(java.io.Reader)
38
*/
39
public Tokenizer create(Reader reader) {
40
return
new IKTokenizer(reader , isMaxWordLength());
41 }
42
43
public
void setMaxWordLength(
boolean isMaxWordLength) {
44
this.isMaxWordLength = isMaxWordLength;
45 }
46
47
public
boolean isMaxWordLength() {
48
return isMaxWordLength;
49 }
50 }