Paoding's Knives Chinese word segmentation offers extremely high efficiency and high extensibility.
It is built around a metaphor, uses a fully object-oriented design, and its overall conception is forward-looking.
High efficiency: on a PIII machine with 1 GB of RAM, it can accurately segment 1,000,000 Chinese characters per second.
It segments text using an unlimited number of dictionary files, which makes it possible to define vocabulary by category.
It can analyze unknown (out-of-dictionary) words in a reasonable way.
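As a quick illustration of the dictionary-driven segmentation described above, here is a minimal sketch that runs PaodingAnalyzer on a single sentence outside of Lucene. The "paoding.dic.home" system property and the dic directory are assumptions taken from the default Paoding distribution (the dictionary home can typically also be supplied through the PAODING_DIC_HOME environment variable); adjust them to your installation.

// Minimal standalone segmentation sketch; "paoding.dic.home" and "dic" are
// assumptions based on the default Paoding distribution.
import java.io.StringReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SegmentDemo {
    public static void main(String[] args) throws Exception {
        // Tell Paoding where its dictionary files live before creating the analyzer.
        System.setProperty("paoding.dic.home", "dic");
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("contents", new StringReader("中文分词具有极高的效率和扩展性"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // Lucene 4.x requires reset() before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.print(term.toString() + " ");  // print each segmented word
        }
        ts.end();
        ts.close();
    }
}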
Drawbacks: the official site (http://code.google.com/p/paoding/) has not been updated by the author for a long time, so the project has fallen behind newer Lucene releases. A fork that supports Lucene 4.8.* can be found at https://github.com/cslinmiso/paoding-analysis
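The two listings below show a complete round trip on Lucene 4.8: TestFileIndex builds an index over the text files in the data directory using PaodingAnalyzer, and TestFileSearcher segments a query string and searches the resulting index.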
package com.wugang.paoding.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TestFileIndex {

    public static void main(String[] args) throws Exception {
        String dataDir = "data";          // directory containing the text files to index
        String indexDir = "luceneindex";  // directory where the Lucene index will be written

        File[] files = new File(dataDir).listFiles();
        System.out.println(files.length);

        // Use Paoding as the analyzer for every field of every document.
        Analyzer analyzer = new PaodingAnalyzer();
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_48, analyzer));

        for (int i = 0; i < files.length; i++) {
            // Read the whole file as UTF-8 text.
            StringBuffer strBuffer = new StringBuffer();
            FileInputStream is = new FileInputStream(files[i].getCanonicalPath());
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, "utf8"));
            String line = reader.readLine();
            while (line != null) {
                strBuffer.append(line);
                strBuffer.append("\n");
                line = reader.readLine();
            }

            // One Lucene document per file: the file name and the full contents,
            // both stored and analyzed.
            Document doc = new Document();
            doc.add(new Field("fileName", files[i].getName(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("contents", strBuffer.toString(), Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);

            reader.close();
            is.close();
        }

        writer.commit();
        writer.close();
        dir.close();
        System.out.println("ok");
    }
}
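One note on the indexing code: the four-argument Field constructor and Field.Index used above are deprecated in Lucene 4.8. With the typed field classes (adding import org.apache.lucene.document.TextField), the two doc.add calls inside the loop would become:

doc.add(new TextField("fileName", files[i].getName(), Field.Store.YES));   // stored and analyzed
doc.add(new TextField("contents", strBuffer.toString(), Field.Store.YES));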
package com.wugang.paoding.index;

import java.io.File;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TestFileSearcher {

    public static void main(String[] args) throws Exception {
        String indexDir = "luceneindex";
        Analyzer analyzer = new PaodingAnalyzer();
        String search_text = "六小龄童的眼睛和耳朵变成小和尚";

        // Show how Paoding segments the query string. The first argument of
        // tokenStream() is the field name, not the text to analyze.
        TokenStream ts = analyzer.tokenStream("contents", new StringReader(search_text));
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();   // required before the first incrementToken() in Lucene 4.x
        while (ts.incrementToken()) {
            System.out.print(ta.toString() + " ");
        }
        ts.end();
        ts.close();

        // Open the index built by TestFileIndex and run the query against "contents".
        Directory dir = FSDirectory.open(new File(indexDir));
        DirectoryReader dr = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(dr);
        QueryParser parser = new QueryParser(Version.LUCENE_48, "contents", analyzer);
        Query query = parser.parse(search_text);
        // Term term = new Term("contents", search_text);
        // TermQuery query = new TermQuery(term);
        System.out.println("\n" + query.toString());

        TopDocs docs = searcher.search(query, 1000);
        ScoreDoc[] hits = docs.scoreDocs;
        System.out.println(hits.length);
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.print(doc.get("fileName") + "--:\n");
            System.out.println(doc.get("contents") + "\n");
        }

        dr.close();   // IndexSearcher has no close() in Lucene 4.x; close the reader instead
        dir.close();
    }
}
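To try the searcher, build the index with TestFileIndex first and make sure the Paoding dictionary home is configured as described above. The program prints the Paoding segmentation of the query string, then the parsed Lucene query, the number of hits, and the fileName and contents fields of every matching document.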