1.
2. The structure of a token (lexical unit); see the sketch right after this list
3. The design behind the synonym analyzer
4. Comparing and testing the analyzers
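On item 2 of the outline: a token produced by a TokenStream is not just a string. It carries several attributes: the term text, its character offsets in the input, a position increment relative to the previous token, and a type. The following minimal sketch (the class name TokenStructureDemo is made up for illustration) pulls those four attributes off a StandardAnalyzer stream, using the same Lucene 3.5 API as the rest of this post:

package org.lucene.test;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class TokenStructureDemo {
    public static void main(String[] args) throws Exception {
        Analyzer a = new StandardAnalyzer(Version.LUCENE_35);
        TokenStream stream = a.tokenStream("content", new StringReader("how are you"));
        // Each attribute is a live view onto the stream's current token:
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);   // the term text
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);     // character offsets
        PositionIncrementAttribute pos = stream.addAttribute(PositionIncrementAttribute.class); // gap to the previous token
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);           // e.g. <ALPHANUM>
        while (stream.incrementToken()) {
            System.out.println(pos.getPositionIncrement() + ":" + term
                    + "[" + offset.startOffset() + "-" + offset.endOffset()
                    + "-" + type.type() + "]");
        }
    }
}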
package org.lucene.test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.lucene.util.AnalyzerUtils;
import org.lucene.util.MySameAnalyzer;
import org.lucene.util.MyStopAnalyzer;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class TestAnalyzer {

    /**
     * Comparing several analyzers on English text.
     */
    @Test
    public void test01() {
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);   // standard analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);       // stop-word analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);     // simple analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35); // whitespace analyzer
        String txt = "this is my house,I am come from yunnang zhaotong,"
                + "My email is ynkonghao@gmail.com,My QQ is 707807876";
        AnalyzerUtils.displayToken(txt, a1);
        //[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail.com][my][qq][707807876]
        AnalyzerUtils.displayToken(txt, a2);
        //[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail][com][my][qq]
        AnalyzerUtils.displayToken(txt, a3);
        //[this][is][my][house][i][am][come][from][yunnang][zhaotong][my][email][is][ynkonghao][gmail][com][my][qq][is]
        AnalyzerUtils.displayToken(txt, a4);
        //[this][is][my][house,I][am][come][from][yunnang][zhaotong,My][email][is][ynkonghao@gmail.com,My][QQ][is][707807876]
    }

    /**
     * Comparing the same analyzers on Chinese text.
     */
    @Test
    public void test02() {
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "我来自云南昭通昭阳区师专";
        AnalyzerUtils.displayToken(txt, a1); //[我][来][自][云][南][昭][通][昭][阳][区][师][专]
        AnalyzerUtils.displayToken(txt, a2); //[我来自云南昭通昭阳区师专]
        AnalyzerUtils.displayToken(txt, a3); //[我来自云南昭通昭阳区师专]
        AnalyzerUtils.displayToken(txt, a4); //[我来自云南昭通昭阳区师专]
    }

    /**
     * Printing the full details of each token.
     */
    @Test
    public void test03() {
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "how are you thank you";
        AnalyzerUtils.displayAllToken(txt, a1);
        AnalyzerUtils.displayAllToken(txt, a2);
        AnalyzerUtils.displayAllToken(txt, a3);
        AnalyzerUtils.displayAllToken(txt, a4);
    }

    /**
     * Testing the custom stop-word analyzer.
     */
    @Test
    public void test04() {
        Analyzer a1 = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        String txt = "how are You thAnk's you I hate you";
        AnalyzerUtils.displayToken(txt, a1);
        AnalyzerUtils.displayToken(txt, a2);
    }

    /**
     * Chinese word segmentation with a dictionary-based analyzer;
     * the dictionary can be extended with your own entries.
     */
    @Test
    public void test05() {
        //Analyzer a1 = new MMSegAnalyzer(); // without the bundled dictionary:
        //[我][来][自][云][南][昭][通][昭][阳][区][师][专]

        // Point the analyzer at the bundled dictionary directory:
        Analyzer a1 = new MMSegAnalyzer(new File("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data"));
        //[我][来自][云南][昭][通][昭][阳][区][师专]
        // You can add your own entries to words-my.dic under the data directory;
        // after adding 昭通, for example, the result becomes:
        //[我][来自][云南][昭通][昭][阳][区][师专]
        String txt = "我来自云南昭通昭阳区师专";
        AnalyzerUtils.displayToken(txt, a1);
    }

    /**
     * Testing the synonym analyzer.
     */
    @Test
    public void test06() throws CorruptIndexException, IOException {
        Analyzer a1 = new MySameAnalyzer();
        String txt = "我来自中国云南昭通昭阳区师专";
        AnalyzerUtils.displayAllToken(txt, a1);

        String keyword = "俺";
        Directory dire = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dire,
                new IndexWriterConfig(Version.LUCENE_35, a1));
        Document doc = new Document();
        doc.add(new Field("content", txt, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc);
        indexWriter.close();

        // Searching for the synonym "俺" finds the document even though
        // the original text only contains "我".
        IndexSearcher search = new IndexSearcher(IndexReader.open(dire));
        TopDocs topDoc = search.search(new TermQuery(new Term("content", keyword)), 10);
        ScoreDoc[] scoreDoc = topDoc.scoreDocs;
        for (ScoreDoc score : scoreDoc) {
            Document doc1 = search.doc(score.doc);
            System.out.println(doc1.get("content"));
        }
    }
}
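To convince yourself that test06 works for every synonym and not just "俺", the hedged sketch below (the class name TestSynonymHits is invented here) rebuilds the same one-document index and queries each synonym in turn. Because MySameTokenFilter (shown later) stacks the synonyms at the original token's position, each term should report exactly one hit:

package org.lucene.test;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.lucene.util.MySameAnalyzer;

public class TestSynonymHits {
    @Test
    public void allSynonymsShouldHit() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_35, new MySameAnalyzer()));
        Document doc = new Document();
        doc.add(new Field("content", "我来自中国云南昭通昭阳区师专",
                Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();

        IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir));
        // "我" and "中国" were indexed normally; their synonyms were stacked
        // at the same positions, so every term below should find the document.
        for (String kw : new String[]{"我", "咱", "俺", "大陆", "天朝"}) {
            int hits = searcher.search(new TermQuery(new Term("content", kw)), 10).totalHits;
            System.out.println(kw + " -> " + hits + " hit(s)"); // expected: 1 each
        }
        searcher.close();
    }
}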
5. Extending the analyzer: a custom stop-word analyzer

package org.lucene.util;

import java.util.Set;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * A custom stop-word analyzer.
 */
public class MyStopAnalyzer extends Analyzer {

    private Set stops;

    public MyStopAnalyzer(String[] sws) {
        // makeStopSet converts the string array into a stop set
        stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
        // add the built-in English stop words as well
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    public MyStopAnalyzer() {
        // fall back to the built-in English stop words
        stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Build the chain: tokenize letters, lower-case them, then drop stop words.
        // The tokenizer must only be consumed once; consuming it here (e.g. to
        // print tokens for debugging) would exhaust the reader and leave the
        // returned stream empty.
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                stops);
    }
}
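Stop-word removal also leaves a visible trace in the position increments: with Lucene 3.5's default behaviour, the token that follows a removed word carries an increment of 2 instead of 1, so phrase queries still know a word was skipped. A small hedged sketch (hypothetical test class, relying on the displayAllToken helper shown at the end of this post):

package org.lucene.test;

import org.apache.lucene.analysis.Analyzer;
import org.junit.Test;
import org.lucene.util.AnalyzerUtils;
import org.lucene.util.MyStopAnalyzer;

public class TestStopPositionIncrement {
    @Test
    public void gapsShouldBeRecorded() {
        // "are" (built-in English stop word) and "hate" (custom) are removed;
        // the tokens right after them should print with a position increment of 2.
        Analyzer a = new MyStopAnalyzer(new String[]{"hate"});
        AnalyzerUtils.displayAllToken("how are you i hate you", a);
    }
}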
6. Extending the analyzer: a synonym analyzer

package org.lucene.util;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * An analyzer extension: the synonym analyzer.
 */
public class MySameAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Tokenize with mmseg4j (max-word mode), then pipe the tokens
        // through the synonym filter below.
        Dictionary dic = Dictionary.getInstance("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data");
        return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
package org.lucene.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * The synonym token filter.
 */
public class MySameTokenFilter extends TokenFilter {

    private CharTermAttribute cta = null;
    private PositionIncrementAttribute pia = null;
    private AttributeSource.State current = null;
    private Stack<String> sames = null;

    protected MySameTokenFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
    }

    /**
     * The idea:
     * Each synonym must become its own token in the CharTermAttribute. Simply
     * calling cta.append("大陆") would concatenate the synonym onto the current
     * term and yield a single token [中国大陆]; what we want is two tokens,
     * [中国][大陆].
     * So, when a token has synonyms, capture the current state and push the
     * synonyms onto a stack. On the next call, while the stack is not empty,
     * restore the captured state, clear the term with cta.setEmpty(), append
     * the synonym with cta.append("大陆"), and set the position increment to
     * zero with pia.setPositionIncrement(0). The zero increment marks the
     * token as a synonym occupying the same position, and that token is
     * then returned.
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (sames.size() > 0) {
            // pop a synonym off the stack
            String str = sames.pop();
            // restore the state captured at the original token
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // zero increment: same position as the original token
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) return false;
        if (getSameWords(cta.toString())) {
            // the token has synonyms: save the current state
            current = captureState();
        }
        return true;
    }

    /*
     * This naive version does not work: it *replaces* [中国] with [大陆]
     * instead of producing [中国][大陆].
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        if (cta.toString().equals("中国")) {
            cta.setEmpty();
            cta.append("大陆");
        }
        return true;
    }
    */

    private boolean getSameWords(String name) {
        Map<String, String[]> maps = new HashMap<String, String[]>();
        maps.put("中国", new String[]{"大陆", "天朝"});
        maps.put("我", new String[]{"咱", "俺"});
        String[] sws = maps.get(name);
        if (sws != null) {
            for (String s : sws) {
                sames.push(s);
            }
            return true;
        }
        return false;
    }
}
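Why must the increment be 0 rather than 1? Because phrase matching is computed from positions. Assuming mmseg4j segments the sentence as shown earlier ([我][来自][中国]...), the hedged sketch below (hypothetical test class) builds a phrase from the synonym "俺" plus the next real token; it can only match because "俺" shares the position of "我":

package org.lucene.test;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.lucene.util.MySameAnalyzer;

public class TestSynonymPhrase {
    @Test
    public void phraseWithSynonymShouldMatch() throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_35, new MySameAnalyzer()));
        Document doc = new Document();
        doc.add(new Field("content", "我来自中国云南昭通昭阳区师专",
                Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();

        IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir));
        PhraseQuery pq = new PhraseQuery();
        pq.add(new Term("content", "俺"));   // stacked at the same position as "我"
        pq.add(new Term("content", "来自")); // the next real token
        // With an increment of 1 instead of 0, this phrase would not match.
        System.out.println("phrase hits: " + searcher.search(pq, 10).totalHits); // expected: 1
        searcher.close();
    }
}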
package org.lucene.util;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * Prints token (lexical unit) information.
 */
public class AnalyzerUtils {

    public static void displayToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        /*
         * Think of the TokenStream as a conveyor and CharTermAttribute as a
         * bowl dropped into it: each time the stream advances, the bowl is
         * refilled with the next token's value. This is the attribute design
         * pattern: you register an attribute once, and it is updated as the
         * TokenStream moves forward.
         */
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
                //System.out.println(stream);
                // printing the stream itself calls its toString(), e.g.:
                //(来,startOffset=1,endOffset=2,positionIncrement=1,type=<IDEOGRAPHIC>)
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints each token with its full details.
     */
    public static void displayAllToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        // position increment
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        // character offsets
        OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
        // the term text
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        // the token type
        TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-"
                        + oa.endOffset() + "-" + ta.type() + "]");
                System.out.println();
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}