Lucene3.0实例

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Pre-processing step: cuts one large text file into a series of numbered
 * small text files.
 *
 * @author ht
 */
public class FilePreprocess {
 /** A chunk is flushed to disk once its encoded size reaches this many bytes. */
 private static final int MAX_SIZE = 10240;

 /** Base name of every generated file; the chunk number and ".txt" are appended. */
 private static final String OUTPUT_PREFIX = "output";

 public static void main(String[] arg) {
  String outputpath = "E:\\lucenetest\\small\\";// directory that receives the small files
  String filename = "E:\\lucenetest\\三国演义.txt";// path of the original large file
  File outputDir = new File(outputpath);
  if (!outputDir.exists()) {
   outputDir.mkdirs();
  }
  splitToSmallFiles(new File(filename), outputpath);
 }

 /**
  * Splits a large text file into small files named
  * {@code output0.txt}, {@code output1.txt}, ... inside {@code outputpath}.
  *
  * <p>Lines are read one at a time and re-terminated with {@code "\r\n"}.
  * Whenever the accumulated text reaches {@link #MAX_SIZE} bytes (measured in
  * the platform default charset) it is written out as the next chunk. Whatever
  * remains after the last line is always written as a final chunk, even when
  * it is empty.
  *
  * <p>I/O failures are not propagated: they are reported on the console and
  * the method returns.
  *
  * @param file       the large text file to split
  * @param outputpath directory prefix for the generated files; it is
  *                   concatenated directly with the file name, so it must end
  *                   with a path separator
  */
 public static void splitToSmallFiles(File file, String outputpath) {
  int filePointer = 0;
  BufferedReader reader = null;
  try {
   reader = new BufferedReader(new FileReader(file));
   StringBuilder buffer = new StringBuilder();
   // Running byte size of the buffer, so the whole buffer does not have to
   // be re-encoded after every line. Summing per-line sizes matches encoding
   // the whole buffer for stateless charsets (UTF-8, GBK, ...).
   int bufferedBytes = 0;
   String line;
   while ((line = reader.readLine()) != null) {
    String terminated = line + "\r\n";
    buffer.append(terminated);
    bufferedBytes += terminated.getBytes().length;
    if (bufferedBytes >= MAX_SIZE) {
     writeChunk(outputpath, filePointer, buffer.toString());
     filePointer++;
     buffer.setLength(0);
     bufferedBytes = 0;
    }
   }
   // Flush the remainder (possibly empty) as the last chunk.
   writeChunk(outputpath, filePointer, buffer.toString());
   System.out.println("The file hava splited to small files !");
  } catch (FileNotFoundException e) {
   System.out.println("file not found !");
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  } finally {
   // Always release the input file handle, including on failure.
   if (reader != null) {
    try {
     reader.close();
    } catch (IOException e) {
     e.printStackTrace();
    }
   }
  }
 }

 /**
  * Writes one chunk to {@code outputpath + "output" + index + ".txt"} and
  * closes the file even if the write fails.
  */
 private static void writeChunk(String outputpath, int index, String content)
   throws IOException {
  BufferedWriter writer = new BufferedWriter(new FileWriter(outputpath
    + OUTPUT_PREFIX + index + ".txt"));
  try {
   writer.write(content);
  } finally {
   writer.close();
  }
 }
}

package com.uphenan.lucene.test;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**  
* @author ht  
* 索引生成  
*  
*/  
public class Indexer {  
   private static String INDEX_DIR = "E:\\lucenetest\\index";//索引存放目录  
   private static String DATA_DIR = "E:\\lucenetest\\small\\";//小文件存放的目录  
      
  public static void main(String[] args) throws Exception {  
    
    long start = new Date().getTime();  
    int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));//调用index方法  
    long end = new Date().getTime();  
    System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");  
  }  
  
  /**索引dataDir下的.txt文件,并储存在indexDir下,返回索引的文件数量  
* @param indexDir  
* @param dataDir  
* @return int  
* @throws IOException  
*/  
public static int index(File indexDir, File dataDir) throws IOException {  
  
    if (!dataDir.exists() || !dataDir.isDirectory()) {  
      throw new IOException(dataDir + " does not exist or is not a directory");  
    }  
  
    IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), new StandardAnalyzer(Version.LUCENE_CURRENT), true,    
IndexWriter.MaxFieldLength.LIMITED);//有变化的地方  
      
    indexDirectory(writer, dataDir);  
    int numIndexed = writer.numDocs();  
    writer.optimize();  
    writer.close();  
    return numIndexed;  
  }  
  
  /**循环遍历目录下的所有.txt文件并进行索引  
* @param writer  
* @param dir  
* @throws IOException  
*/  
private static void indexDirectory(IndexWriter writer, File dir)  
    throws IOException {  
  
    File[] files = dir.listFiles();  
  
    for (int i = 0; i < files.length; i++) {  
      File f = files[i];  
      if (f.isDirectory()) {  
        indexDirectory(writer, f);  // recurse  
      } else if (f.getName().endsWith(".txt")) {  
        indexFile(writer, f);  
      }  
    }  
  }  
  
  /**对单个txt文件进行索引  
* @param writer  
* @param f  
* @throws IOException  
*/  
private static void indexFile(IndexWriter writer, File f)  
    throws IOException {  
      
    if (f.isHidden() || !f.exists() || !f.canRead()) {  
      return;  
    }  
  
    System.out.println("Indexing " + f.getCanonicalPath());  
    Document doc = new Document();  
    doc.add(new Field("contents",new FileReader(f)));//有变化的地方  
    doc.add(new Field("filename",f.getCanonicalPath(),Field.Store.YES, Field.Index.ANALYZED));//有变化的地方  
    
    writer.addDocument(doc);  
  }  
}  


package com.uphenan.lucene.test;

import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**  
* @author ht  
* 查询  
*  
*/  
public class Searcher {  
   private static String INDEX_DIR = "E:\\lucenetest\\index\\";//索引所在的路径  
   private static String KEYWORD = "诸葛亮";//关键词  
   private static int TOP_NUM = 100;//显示前100条结果  
      
  public static void main(String[] args) throws Exception {  
    File indexDir = new File(INDEX_DIR);  
    if (!indexDir.exists() || !indexDir.isDirectory()) {  
      throw new Exception(indexDir +  
        " does not exist or is not a directory.");  
    }  
    search(indexDir, KEYWORD);//调用search方法进行查询  
  }  
/**查询  
* @param indexDir  
* @param q  
* @throws Exception  
*/  
  public static void search(File indexDir, String q) throws Exception {  
    IndexSearcher is = new  IndexSearcher(FSDirectory.open(indexDir),true);//read-only  
    String field = "contents";  
      
    QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, new StandardAnalyzer(Version.LUCENE_CURRENT));//有变化的地方  
    Query query = parser.parse(q);  
  
    TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM , false);//有变化的地方  
      
    long start = new Date().getTime();// start time  
      
    is.search(query, collector);  
    ScoreDoc[] hits = collector.topDocs().scoreDocs;  
  
    System.out.println(hits.length);  
    for (int i = 0; i < hits.length; i++) {  
        Document doc = is.doc(hits[i].doc);//new method is.doc()  
        System.out.println(doc.getField("filename")+"   "+hits[i].toString()+"  ");  
    }  
    long end = new Date().getTime();//end time  
  
    System.out.println("Found " + collector.getTotalHits() +  
              " document(s) (in " + (end - start) +  
              " milliseconds) that matched query '" +  
                q + "':");  
  }  
}  

Lucene Results:
43
stored,indexed,tokenized   doc=95 score=0.20325354  
stored,indexed,tokenized   doc=93 score=0.18696608  
stored,indexed,tokenized   doc=83 score=0.17826515  
stored,indexed,tokenized   doc=91 score=0.1527987  
stored,indexed,tokenized   doc=85 score=0.14914733  
stored,indexed,tokenized   doc=90 score=0.13808359  
stored,indexed,tokenized   doc=41 score=0.1260525  
stored,indexed,tokenized   doc=48 score=0.1260525  
stored,indexed,tokenized   doc=50 score=0.1260525  
stored,indexed,tokenized   doc=82 score=0.1260525  
stored,indexed,tokenized   doc=86 score=0.1260525  
stored,indexed,tokenized   doc=88 score=0.1260525  
stored,indexed,tokenized   doc=89 score=0.1260525  
stored,indexed,tokenized   doc=92 score=0.1260525  
stored,indexed,tokenized   doc=37 score=0.11274478  
stored,indexed,tokenized   doc=38 score=0.11274478  
stored,indexed,tokenized   doc=49 score=0.11274478  
stored,indexed,tokenized   doc=96 score=0.11274478  
stored,indexed,tokenized   doc=42 score=0.097639844  
stored,indexed,tokenized   doc=55 score=0.097639844  
stored,indexed,tokenized   doc=64 score=0.097639844  
stored,indexed,tokenized   doc=94 score=0.09663838  
stored,indexed,tokenized   doc=35 score=0.0797226  
stored,indexed,tokenized   doc=46 score=0.0797226  
stored,indexed,tokenized   doc=52 score=0.0797226  
stored,indexed,tokenized   doc=53 score=0.0797226  
stored,indexed,tokenized   doc=61 score=0.0797226  
stored,indexed,tokenized   doc=71 score=0.0797226  
stored,indexed,tokenized   doc=79 score=0.0797226  
stored,indexed,tokenized   doc=84 score=0.0797226  
stored,indexed,tokenized   doc=99 score=0.0797226  
stored,indexed,tokenized   doc=102 score=0.0797226  
stored,indexed,tokenized   doc=36 score=0.05637239  
stored,indexed,tokenized   doc=45 score=0.05637239  
stored,indexed,tokenized   doc=47 score=0.05637239  
stored,indexed,tokenized   doc=56 score=0.05637239  
stored,indexed,tokenized   doc=59 score=0.05637239  
stored,indexed,tokenized   doc=62 score=0.05637239  
stored,indexed,tokenized   doc=67 score=0.05637239  
stored,indexed,tokenized   doc=75 score=0.05637239  
stored,indexed,tokenized   doc=100 score=0.05637239  
stored,indexed,tokenized   doc=101 score=0.05637239  
stored,indexed,tokenized   doc=103 score=0.05637239  
Found 43 document(s) (in 47 milliseconds) that matched query '诸葛亮':

 

你可能感兴趣的:(apache,F#,Lucene)