solr1.4 中文 庖丁 使用方法

由于 Solr 1.4 使用 Lucene 2.9.1,需要修改庖丁(Paoding)的源代码:net.paoding.analysis.analyzer.PaodingTokenizer。

需要注意的有两点

1. 继承关系由 TokenStream 调整为 Tokenizer,因此需要删除变量

// Field to delete in step 1: the rewritten class shown later assigns this.input
// without declaring the field itself.
private final Reader input;

删除对应的关闭方法

// Old close() override to delete together with the removed `input` field:
// it delegates to super.close() and then closes that field's reader.
public void close() throws IOException {
   super.close();
   input.close();
}

2. 由于底层实现发生变化,要让高亮显示功能正常工作,需要重写 reset(Reader) 方法。原先的 reset 只是替换 input,现在还需要把其余状态字段一并重置。

public void reset(Reader input) throws IOException {
   this.input = input;
   this.inputLength=0;
   this.offset=0;
   this.dissected=0;
   this.tokenIteractor=null;
   this.beef.set(0, 0);
}

 

调整后的整体代码如下

package net.paoding.analysis.analyzer;

/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.Beef;
import net.paoding.analysis.knife.Collector;
import net.paoding.analysis.knife.Knife;
import net.paoding.analysis.knife.Paoding;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;


/**
* PaodingTokenizer是基于“庖丁解牛”框架的TokenStream实现,为PaodingAnalyzer使用。
* <p>
*
* @author Zhiliang Wang [[email protected]]
* @see Beef
* @see Knife
* @see Paoding
* @see Tokenizer
* @see PaodingAnalyzer
*
* @see Collector
* @see TokenCollector
* @see MAxTokenCollector
* @see MostWordsTokenCollector
*
* @since 1.0
*/
public final class PaodingTokenizer extends Tokenizer implements Collector {

// -------------------------------------------------

/**
* 从input读入的总字符数
*/
private int inputLength;

/**
*
*/
private static final int bufferLength = 128;

/**
* 接收来自{@link #input}的文本字符
*
* @see #next()
*/
private final char[] buffer = new char[bufferLength];

/**
* {@link buffer}[0]在{@link #input}中的偏移
*
* @see #collect(String, int, int)
* @see #next()
*/
private int offset;

/**
*
*/
private final Beef beef = new Beef(buffer, 0, 0);

/**
*
*/
private int dissected;

/**
* 用于分解beef中的文本字符,由PaodingAnalyzer提供
*
* @see #next()
*/
private Knife knife;

/**
*
*/
private TokenCollector tokenCollector;

/**
* tokens迭代器,用于next()方法顺序读取tokens中的Token对象
*
* @see #tokens
* @see #next()
*/
private Iterator/* <Token> */ tokenIteractor;

// -------------------------------------------------

/**
*
* @param input
* @param knife
* @param tokenCollector
*/
public PaodingTokenizer(Reader input, Knife knife, TokenCollector tokenCollector) {
   this.input = input;
   this.knife = knife;
   this.tokenCollector = tokenCollector;
}

// -------------------------------------------------

public TokenCollector getTokenCollector() {
   return tokenCollector;
}

public void setTokenCollector(TokenCollector tokenCollector) {
   this.tokenCollector = tokenCollector;
}

// -------------------------------------------------


public void collect(String word, int offset, int end) {
   tokenCollector.collect(word, this.offset + offset, this.offset + end);
}

// -------------------------------------------------
public Token next() throws IOException {
   // 已经穷尽tokensIteractor的Token对象,则继续请求reader流入数据
   while (tokenIteractor == null || !tokenIteractor.hasNext()) {
    //System.out.println(dissected);
    int read = 0;
    int remainning = -1;//重新从reader读入字符前,buffer中还剩下的字符数,负数表示当前暂不需要从reader中读入字符
    if (dissected >= beef.length()) {
     remainning = 0;
    }
    else if (dissected < 0){
     remainning = bufferLength + dissected;
    }
    if (remainning >= 0) {
     if (remainning > 0) {
      System.arraycopy(buffer, -dissected, buffer, 0, remainning);
     }
     read = this.input.read(buffer, remainning, bufferLength - remainning);
     inputLength += read;
     int charCount = remainning + read;
     if (charCount < 0) {
      // reader已尽,按接口next()要求返回null.
      return null;
     }
     if (charCount < bufferLength) {
      buffer[charCount ++] = 0;
     }
     // 构造“牛”,并使用knife“解”之
     beef.set(0, charCount);
     offset += Math.abs(dissected);
     //offset -= remainning;
     dissected = 0;
    }
    dissected = knife.dissect((Collector)this, beef, dissected);
//    offset += read;// !!!
    tokenIteractor = tokenCollector.iterator();
   }
   // 返回tokensIteractor下一个Token对象
   return (Token) tokenIteractor.next();
}

public int getInputLength() {
   return inputLength;
}

//重新实现reset(input),切记需要抛出异常。
public void reset(Reader input) throws IOException {
   this.input = input;
   this.inputLength=0;
   this.offset=0;
   this.dissected=0;
   this.tokenIteractor=null;
   this.beef.set(0, 0);
}
}

 

 

另外,原先实现的中文分词工厂类(ChineseTokenizerFactory)中的 create 方法需要修改返回类型(下面的代码中为 Tokenizer)。代码如下

package net.paoding;

import java.io.Reader;
import java.util.Map;

import net.paoding.analysis.analyzer.PaodingTokenizer;
import net.paoding.analysis.analyzer.TokenCollector;
import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;


/**
* Solr 1.4 paoding tokenizer factory
*
*/
public class ChineseTokenizerFactory extends BaseTokenizerFactory {

/**
* 最多分词,默认.
*/
public static final String MOST_WORDS_MODE = "most-words";

/**
* 最长分词.
*/
public static final String MAX_WORD_LENGTH_MODE = "max-word-length";

private String mode = null;

public void setMode(String mode) {
   if (mode==null||MOST_WORDS_MODE.equalsIgnoreCase(mode)
     || "default".equalsIgnoreCase(mode)) {

    this.mode=MOST_WORDS_MODE;
   } else if (MAX_WORD_LENGTH_MODE.equalsIgnoreCase(mode)) {
    this.mode=MAX_WORD_LENGTH_MODE;
   } else {
    throw new IllegalArgumentException("不合法的分析器Mode参数设置:" + mode);
   }
}

    public void init(Map<String, String> args) {
        super.init(args);
        setMode(args.get("mode"));
    }

    public Tokenizer create(Reader input) {
    return new PaodingTokenizer(input, PaodingMaker.make(),createTokenCollector());
    }

    private TokenCollector createTokenCollector() {
        if( MOST_WORDS_MODE.equals(mode)) {
        return new MostWordsTokenCollector();
        }
        if( MAX_WORD_LENGTH_MODE.equals(mode)) {
        return new MaxWordLengthTokenCollector();
        }
        throw new Error("never happened");
    }
}

 

 

如需使用该分词器,则需要配置字段类型schema.xml 中的fieldType的tokenizer 为下面形式。具体class路径根据实际的ChineseTokenizerFactory位置填写。

<tokenizer class="net.paoding.ChineseTokenizerFactory" mode="most-words"/>

上面讲述的是solr 1.4配置庖丁的方式。如果您是从solr1.3升级,那么请务必注意下面这段文字:

如果是升级,则需要核对 schema.xml 中各 field 的 type 定义是否对应(原来是 int 类型的要特别注意:Solr 1.4 的 int 已改用 Trie 类型(tint)实现。如需兼容,要把原来的 int 类型改为 pint;long、float 等其它类型同样处理)。具体对应关系可将 Solr 1.3 与 Solr 1.4 的 fieldType 中各类型定义逐一对比。

 

文章出自幸福的小小仙 

[email protected][email protected]

你可能感兴趣的:(apache,.net,qq,Solr,Lucene)