IKAnalyzer中文分词

阅读更多

/**
 * TokenStreamDemo.java, created on 2010-9-16 09:12:11 AM.
 *
 * Copyright (c) 2010 by beckham.
 */
package cn.com.test;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;

/**
 * Demonstrates the IK Chinese analyzer with Lucene in three steps: print the
 * tokens produced for a sample text, index that text as a single document,
 * then query the index and print the matching documents.
 *
 * @version 1.0
 */
public class TokenStreamDemo extends TestCase {

    /** Directory on disk that holds the index files. */
    // NOTE(review): hard-coded Windows-style path kept from the original demo;
    // consider a temporary directory if this must run on other platforms.
    private static final File INDEX_DIR = new File("d:\\index");

    /** IK analyzer, constructed with the same {@code false} flag as the original demo. */
    private final Analyzer analyzer = new IKAnalyzer(false);

    /**
     * Runs the whole demo: tokenize, build the index, search it.
     *
     * @throws Exception if analysis, indexing or searching fails
     */
    public void testTokenStream() throws Exception {
        printTokens("中华人民共和国");

        // Open the directory once and share it, so that it can be closed
        // reliably no matter which of the steps below fails.
        Directory directory = FSDirectory.open(INDEX_DIR);
        try {
            buildIndex(directory, "中华人民共和国");
            int hits = searchIndex(directory, "华人");
            // The index was just (re)created with exactly one document,
            // so the query must match that single document.
            assertEquals("unexpected number of matching documents", 1, hits);
        } finally {
            directory.close();
        }
    }

    /**
     * Tokenizes {@code text} with the analyzer and prints each term on its own line.
     *
     * @param text the text to analyze
     * @throws IOException if the analyzer fails while reading the text
     */
    private void printTokens(String text) throws IOException {
        TokenStream tokenStream = analyzer.reusableTokenStream("text", new StringReader(text));
        try {
            // addAttribute returns the existing attribute instance or registers
            // one; unlike getAttribute it never throws when the attribute is absent.
            TermAttribute term = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
            // Consumer workflow from the TokenStream docs: reset, iterate, end, close.
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(term.term());
            }
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
    }

    /**
     * Creates a fresh index in {@code directory} containing one document whose
     * {@code content} field holds {@code text}.
     *
     * @param directory where the index is written
     * @param text      value of the stored, analyzed {@code content} field
     * @throws IOException if the index cannot be created or written
     */
    private void buildIndex(Directory directory, String text) throws IOException {
        // Constructor arguments:
        //   directory - where the index files are stored
        //   analyzer  - analyzer used to tokenize field values
        //   true      - create a new index (false would append to an existing one)
        //   MaxFieldLength.LIMITED - caps how many terms are indexed per field
        //       (per the Lucene docs this limits the term count; it does NOT
        //       control how the text is split into tokens)
        IndexWriter writer = new IndexWriter(directory, analyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            Document document = new Document();
            // Field arguments: name ("content"), value, whether the original
            // value is stored, and how the value is indexed (analyzed here).
            document.add(new Field("content", text, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(document);
        } finally {
            // Always close the writer, even when adding the document fails.
            writer.close();
        }
    }

    /**
     * Searches the {@code content} field for {@code keyword}, prints the hit
     * count and every matching document, and returns the hit count.
     *
     * @param directory the index to search
     * @param keyword   the query text, parsed by the IK query parser
     * @return the total number of matching documents
     * @throws IOException if the query cannot be parsed or the index cannot be read
     */
    private int searchIndex(Directory directory, String keyword) throws IOException {
        // The Query object is built by IK's own parser.
        Query query = IKQueryParser.parse("content", keyword);
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            searcher.setSimilarity(new IKSimilarity());
            TopDocs docs = searcher.search(query, 10);
            System.out.println("匹配对象个数:" + docs.totalHits);
            ScoreDoc[] scoreDocs = docs.scoreDocs;
            for (int i = 0; i < scoreDocs.length; i++) {
                Document d = searcher.doc(scoreDocs[i].doc);
                System.out.println(d.toString());
            }
            return docs.totalHits;
        } finally {
            searcher.close();
        }
    }
}

结果如下:

中华人民共和国
中华人民
中华
华人
人民共和国
人民
共和国
共和
匹配对象个数:1
Document<stored,indexed,tokenized<content:中华人民共和国>>

你可能感兴趣的:(lucene,Apache,junit)