代码主要来源: 《Collective Intelligence 实战》
Lucene版本: 4.6.1
原来的代码是基于2.2写的,很多东西已经变了。现在用4.6.1重新实现一遍
package impl;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

/**
 * Analyzer that tokenizes with {@link StandardTokenizer} and then applies, in order:
 * lower-casing, synonym expansion, stop-word removal and Porter stemming.
 */
public class PorterStemStopWordAnalyzer extends Analyzer {

    /** Custom stop words removed from the token stream. */
    private static final String[] STOP_WORDS = {"and", "of", "the", "to", "is", "their", "can", "all"};

    /** Stop-word set, built once and shared by every token stream this analyzer creates. */
    private final CharArraySet stopSet;

    /** Synonym table, built once and shared by every token stream this analyzer creates. */
    private final SynonymMap synonymMap;

    /**
     * Creates the analyzer and pre-builds its stop-word set and synonym table.
     *
     * @throws IllegalStateException if the synonym table cannot be built
     */
    public PorterStemStopWordAnalyzer() {
        this.stopSet = buildCharArraySetFromArry(STOP_WORDS);
        this.synonymMap = getSynonymMap();
    }

    /**
     * Builds the analysis chain for one field.
     *
     * @param fieldName name of the field being analyzed (not used by this chain)
     * @param reader    source of the text to tokenize
     * @return the tokenizer together with the last filter of the chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Source tokenizer.
        Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_46, reader);

        // Filter chain; each filter wraps the previous one.
        TokenFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_46, tokenizer);
        TokenFilter synonymFilter = new SynonymFilter(lowerCaseFilter, synonymMap, true);
        TokenFilter stopFilter = new StopFilter(Version.LUCENE_46, synonymFilter, stopSet);
        TokenFilter stemFilter = new PorterStemFilter(stopFilter);

        // TokenStreamComponents wraps the chain; in Lucene 2.2 a TokenStream was returned here.
        return new TokenStreamComponents(tokenizer, stemFilter);
    }

    /**
     * Converts a string array into a case-insensitive {@link CharArraySet}
     * (Lucene's counterpart of {@code java.util.Set}).
     *
     * @param array the words to put into the set
     * @return a set containing every element of {@code array}
     */
    private CharArraySet buildCharArraySetFromArry(String[] array) {
        CharArraySet set = new CharArraySet(Version.LUCENE_46, array.length, true);
        for (String value : array) {
            set.add(value);
        }
        return set;
    }

    /**
     * Builds the synonym table: "fast" -> "rapid" and "slow" -> "sluggish",
     * keeping the original token alongside the synonym.
     *
     * @return the built synonym map, never {@code null}
     * @throws IllegalStateException if building the map fails with an I/O error
     */
    private SynonymMap getSynonymMap() {
        String base1 = "fast";
        String syn1 = "rapid";
        String base2 = "slow";
        String syn2 = "sluggish";

        SynonymMap.Builder sb = new SynonymMap.Builder(true);
        sb.add(new CharsRef(base1), new CharsRef(syn1), true);
        sb.add(new CharsRef(base2), new CharsRef(syn2), true);

        try {
            return sb.build();
        } catch (IOException e) {
            // Fail here with the cause attached instead of handing a null map to SynonymFilter.
            throw new IllegalStateException("Failed to build synonym map", e);
        }
    }

    /**
     * Demo: analyzes a sample sentence and prints each resulting term on its own line.
     *
     * @throws IOException if the token stream fails while being consumed or closed
     */
    public static void testPorterStemmingAnalyzer() throws IOException {
        Analyzer analyzer = new PorterStemStopWordAnalyzer();
        String text = "Collective intelligence and Web2.0, fast and rapid";
        Reader reader = new StringReader(text);

        TokenStream ts = analyzer.tokenStream(null, reader);
        try {
            // Fetch the attribute once; the same instance is updated on every incrementToken().
            CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
            // reset() is mandatory before the first incrementToken().
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(ta.toString());
            }
            // Signals end-of-stream to the filter chain.
            ts.end();
        } finally {
            // Release the stream's resources even if consumption failed.
            ts.close();
        }
    }

    public static void main(String[] args) throws IOException {
        testPorterStemmingAnalyzer();
    }
}
注意:
(1) TokenStream在初始化之后需要reset一次,不然会抛出异常
(2) 将TokenStream 转成Token 常用的一个方法就是使用CharTermAttribute
除了CharTermAttribute 还有其他的Attribute: 比如FlagsAttribute ...
(3) 使用到的类库可以参考上一篇文章:http://rangerwolf.iteye.com/admin/blogs/2011535
(4) 在createComponents方法之中使用了一个同义词过滤器,在构造这个过滤器的时候是通过getSynonymMap方法进行的。在测试样本之中的 fast and rapid 解析完成之后的结果如下:
fast rapid rapid
相当于有两个rapid! 可能是因为synonymFilter在stopFilter之前运行。(其中第一个rapid是fast经同义词过滤器展开(includeOrig为true,保留原词)得到的,第二个rapid是原文中本来就有的。)
根据java doc 文档的描述,同义词过滤器应该尽早地运行。比如second rule.
做了另外的一个测试:
// Experiment: register the two-word phrase "Collective Intelligence" as a synonym of "CI".
// NOTE(review): this rule does not appear to match. Presumably SynonymMap.Builder expects the
// words of a multi-word input to be separated by U+0000 (see SynonymMap.Builder.join) rather
// than by a space; in addition, the analyzer lower-cases tokens before the synonym filter runs,
// while this input keeps its capital letters. Confirm against the Lucene 4.6 Javadoc.
String base3 = "Collective Intelligence"; String syn3 = "CI"; sb.add(new CharsRef(base3), new CharsRef(syn3), true);
即将Collective Intelligence 跟CI 同义
同样的样本的运行结果完全不变!
说明按这种写法无法对长度为2个词的词组进行同义词替换~