package com.besttone.analyzer;

import java.io.Reader;
import java.util.Map;

import org.apache.solr.analysis.BaseTokenizerFactory;

/**
 * Solr tokenizer factory that hands out {@link CommaTokenizer} instances.
 *
 * <p>Reference this class from the {@code <tokenizer class="..."/>} element
 * of a field type's analyzer.
 */
public class CommaTokenizerFactory extends BaseTokenizerFactory {

    /**
     * Forwards the configured arguments to the base factory and then calls
     * {@code assureMatchVersion()}.
     *
     * @param args the factory arguments
     */
    @Override
    public void init(Map<String, String> args) {
        super.init(args);
        assureMatchVersion();
    }

    /**
     * Builds a tokenizer over the supplied input, using the inherited
     * {@code luceneMatchVersion}.
     *
     * @param input the character stream to tokenize
     * @return a new {@link CommaTokenizer} reading from {@code input}
     */
    public CommaTokenizer create(Reader input) {
        final CommaTokenizer tokenizer = new CommaTokenizer(luceneMatchVersion, input);
        return tokenizer;
    }
}
package com.besttone.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/**
 * A {@link CharTokenizer} whose only non-token character is the comma.
 *
 * <p>Every constructor simply delegates to the matching {@link CharTokenizer}
 * constructor; the class's sole customisation is {@link #isTokenChar(int)}.
 */
public class CommaTokenizer extends CharTokenizer {

    /**
     * Constructs a new CommaTokenizer.
     *
     * @param matchVersion the Lucene version to match
     * @param in the input to split up into tokens
     */
    public CommaTokenizer(Version matchVersion, Reader in) {
        super(matchVersion, in);
    }

    /**
     * Constructs a new CommaTokenizer using a given {@link AttributeSource}.
     *
     * @param matchVersion the Lucene version to match
     * @param source the attribute source to use for this tokenizer
     * @param in the input to split up into tokens
     */
    public CommaTokenizer(Version matchVersion, AttributeSource source, Reader in) {
        super(matchVersion, source, in);
    }

    /**
     * Constructs a new CommaTokenizer using a given
     * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
     *
     * @param matchVersion the Lucene version to match
     * @param factory the attribute factory to use for this tokenizer
     * @param in the input to split up into tokens
     */
    public CommaTokenizer(Version matchVersion, AttributeSource.AttributeFactory factory, Reader in) {
        super(matchVersion, factory, in);
    }

    /**
     * Constructs a new CommaTokenizer without an explicit match version.
     *
     * @param in the input to split up into tokens
     * @deprecated use {@link #CommaTokenizer(Version, Reader)} instead.
     */
    @Deprecated
    public CommaTokenizer(Reader in) {
        super(in);
    }

    /**
     * Constructs a new CommaTokenizer using a given {@link AttributeSource},
     * without an explicit match version.
     *
     * @param source the attribute source to use for this tokenizer
     * @param in the input to split up into tokens
     * @deprecated use {@link #CommaTokenizer(Version, AttributeSource, Reader)}
     *             instead.
     */
    @Deprecated
    public CommaTokenizer(AttributeSource source, Reader in) {
        super(source, in);
    }

    /**
     * Constructs a new CommaTokenizer using a given
     * {@link org.apache.lucene.util.AttributeSource.AttributeFactory},
     * without an explicit match version.
     *
     * @param factory the attribute factory to use for this tokenizer
     * @param in the input to split up into tokens
     * @deprecated use
     *             {@link #CommaTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
     *             instead.
     */
    @Deprecated
    public CommaTokenizer(AttributeSource.AttributeFactory factory, Reader in) {
        super(factory, in);
    }

    /**
     * Tells the tokenizer whether a code point belongs to a token.
     *
     * <p>Only the comma is rejected, so the input is split at commas and
     * everything else (including whitespace) stays inside the tokens.
     *
     * @param c the code point to test
     * @return {@code false} for a comma, {@code true} for any other code point
     */
    @Override
    protected boolean isTokenChar(int c) {
        // ',' is code point 44 (U+002C).
        return c != ',';
    }
}
其实很简单:只要继承 Lucene 提供的通用字符分词器 CharTokenizer(org.apache.lucene.analysis.CharTokenizer),然后实现自己的 isTokenChar 方法:
protected boolean isTokenChar(int c) { // return !Character.isWhitespace(c); // 44表示逗号 return !(c == 44); }
判断字符是否为逗号(ASCII 码值 44):如果是就返回 false,否则返回 true。返回 false 表示该字符不属于词元,分词器会在这里切分。44 是逗号的 ASCII 码值(例如 a 的 ASCII 码值为 97)。如果不知道某个字符对应的值是多少,可以这样查:
char[] c = new char[]{'a',',','b'};
Character.codePointAt(c, 1);
这样就能获得 char 数组里下标为 1 的字符(这里是逗号)的 ASCII 码值,结果为 44。
然后打包成 jar,放到 solr_home/lib 下面;放在其他位置也可以,但需要在 solrconfig.xml 里用 lib 指令配置该路径,或者在 solr.xml 里配置 sharedLib 的路径。总之,要保证 Solr 启动时能加载这个 jar 包。
在 schema.xml 中配置好使用该分词器的 fieldType(如下)之后,就可以在 Solr 管理控制台的 Analysis 页面测试分词效果了。
<!-- Text field type whose analyzer uses the custom comma tokenizer factory. -->
<fieldType name="text_comma" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="com.besttone.analyzer.CommaTokenizerFactory"/>
  </analyzer>
</fieldType>