裴东辉-使用lucene集成IKAnalyzer实现分词并建立索引

 

 
基本环境:

 ext_stopword.dic和IKAnalyzer.cfg.xml放到classpath下面

 引入jar包:IKAnalyzer3.2.5Stable.jar   lucene-core-3.5.0.jar

 基本程序:

1、中文分词建立索引

package testlucene.index;
 
import java.io.File;
 
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
 
import testlucene.util.FileUtil;
 
//import org.apache.lucene.search.IndexSearcher;
 
public class TxtFileIndexer {
 
public static void main(String args[]) throws Exception {
 
//索引位置
   File indexDir = new File("E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/index"); 
   //数据位置
   File dataDir  = new File("E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/data"); 
   
   
   //使用IKAnalyzer中文分词工具
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35,new IKAnalyzer());
FSDirectory directory = FSDirectory.open(indexDir);
IndexWriter indexWriter = new IndexWriter(directory, iwc);
   
        //接着程序遍历了目录下面的所有文本文档,并为每一个文本文档创建了一个 Document 对象。
        //然后把文本文档的两个属性:路径和内容加入到了两个 Field 对象中,接着在把这两个 Field 对象加入到 Document 对象中
        //最后把这个文档用 IndexWriter 类的 add 方法加入到索引中去。
        //这样我们便完成了索引的创建。接下来我们进入在建立好的索引上进行搜索的部分。
   File[] dataFiles  = dataDir.listFiles(); 
   String name="",path="",content="";
for(File file:dataFiles){
if(file.isFile() && file.getName().endsWith(".txt")){
System.out.println("Indexing file " + file.getCanonicalPath()); 
 
/*Step 1. Prepare the data for indexing. Extract the data. */
name =file.getName();
                path=file.getCanonicalPath();
content=FileUtil.parsefiletostring(file);
 
                 /*Step 2. Wrap the data in the Fields and add them to a Document */
Document doc = new Document(); doc.add(new Field("name",name,Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new Field("path",path,Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new Field("content",content,Field.Store.NO,Field.Index.ANALYZED));
 
                 /*Step 3: Add this document to Lucene Index.*/
indexWriter.addDocument(doc);
}
}
indexWriter.close();
}
}
 
 2、中文分词 搜索

  
package testlucene.search;
 
import java.io.File;
 
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
@SuppressWarnings("resource")
public class TxtFileSearcher {
 
public static void main(String[] args) {
try{
//index索引位置生成Reader流
   File indexDir = new File("E:/eclipse_research/fetchnews/ILucene-3.5.0/src/lucenesource/index"); 
   FSDirectory directory = FSDirectory.open(indexDir);
IndexReader reader = IndexReader.open(directory);
//创建Searcher
IndexSearcher indexSearcher = new IndexSearcher(reader);
indexSearcher.setSimilarity(new IKSimilarity());
//查询关键词
String keyWords = "是一个开放源代码的全文检索引擎工具包 ";
//IKAnalyzer中文分词生成查询
Query query = IKQueryParser.parse("content", keyWords);
TopDocs topDocs = indexSearcher.search(query, Integer.MAX_VALUE);
System.out.println(topDocs.totalHits);
//对获取到的文档进行解析输出
       ScoreDoc[] scoreDosArray = topDocs.scoreDocs;
       for(ScoreDoc scoredoc: scoreDosArray){
         Document doc = indexSearcher.doc(scoredoc.doc);
         System.out.println("name: "+doc.getFieldable("name").stringValue());
         System.out.println("path: "+doc.getFieldable("path").stringValue());
       }
}catch(Exception e){
e.printStackTrace();
}
}
}


3、工具类

 
package testlucene.util;
 
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
 
public class FileUtil {

    /**
     * Reads an entire text file and returns its contents as one string.
     * Line separators are dropped — lines are concatenated directly, which is
     * the behaviour the indexing code relies on.
     *
     * NOTE(review): FileReader uses the platform default charset — confirm the
     * data files are encoded in that charset (e.g. GBK on a Chinese Windows).
     *
     * @param file the text file to read
     * @return the concatenated lines, or an empty string if reading fails
     */
    public static String parsefiletostring(File file) {
        // StringBuilder avoids the O(n^2) cost of String += in a loop.
        StringBuilder result = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            while ((line = br.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            // Best-effort read: log and fall through to return what we have.
            e.printStackTrace();
        } finally {
            if (br != null) {
                try { br.close(); } catch (IOException e) { e.printStackTrace(); }
            }
        }
        // Dropped the original debug println that dumped the whole file to stdout.
        return result.toString();
    }

}

你可能感兴趣的:(IKAnalyzer)