Lucene3.0.0 入门实例

      lucene3.0已于2009-11-25发布,但网上的入门实例大多是针对lucene3.0以前的版本编写的,而3.0相对于以前的版本,API貌似改动不小。 
      本人从头开始学习lucene,现在用的是《lucene in action中文版》,结合lucene3.0文档写了个入门实例,可供像我一样直接从lucene3.0开始学习的新手参考!
 

 

入门实例: 
lucene3.0.0的jar包和《三国演义》电子书的下载网址:lucene3.0.0的学习资料
1.预处理:先把网上下载的一个《三国演义》电子书“三国演义.txt”(可用其他代替,呵呵)切割成多个小文件。
 

package com.taobao.zhujiadun.basic;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Preprocessing step: splits one large text file into a series of small
 * files so that each small file can later be indexed as its own document.
 *
 * @author ht
 */
public class FilePreprocess {

    /** A chunk is flushed to disk once its encoded size reaches this many bytes. */
    private static final int MAX_SIZE = 10240;

    /** Base name of every generated chunk file ({@code output0.txt}, {@code output1.txt}, ...). */
    private static final String OUTPUT_PREFIX = "output";

    /** Line terminator appended after every line copied into a chunk. */
    private static final String LINE_SEPARATOR = "\r\n";

    public static void main(String[] arg) {
        String outputpath = "D://test//small//"; // directory that receives the small files
        String filename = "D://test//三国演义.txt"; // path of the original large file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits {@code file} into consecutive chunk files of roughly
     * {@link #MAX_SIZE} bytes each, always cutting on a line boundary.
     *
     * <p>Chunks are written to {@code outputpath + "output" + N + ".txt"} with
     * N counting up from 0. Text is read and written with the platform default
     * charset. I/O failures are reported on the console instead of being
     * propagated, so callers never see a checked exception.
     *
     * @param file       the large text file to split
     * @param outputpath target directory prefix; it is concatenated directly
     *                   with the chunk file name, so it must end with a path
     *                   separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        int filePointer = 0;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder buffer = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                buffer.append(line).append(LINE_SEPARATOR);
                // The limit is measured in encoded bytes, not characters,
                // because multi-byte text (e.g. Chinese) is the expected input.
                if (buffer.toString().getBytes().length >= MAX_SIZE) {
                    writeChunk(outputpath, filePointer, buffer.toString());
                    filePointer++;
                    buffer.setLength(0);
                }
            }
            // Only flush a remainder that actually holds text; this avoids an
            // empty trailing file when the input ends exactly on a chunk boundary.
            if (buffer.length() > 0) {
                writeChunk(outputpath, filePointer, buffer.toString());
            }
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Writes one chunk to its numbered file and always closes the writer. */
    private static void writeChunk(String outputpath, int index, String content) throws IOException {
        BufferedWriter writer =
                new BufferedWriter(new FileWriter(outputpath + OUTPUT_PREFIX + index + ".txt"));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }
}

 

2.用lucene3.0生成索引类:用lucene3.0对生成的多个小文件进行索引,中文分词用的是lucene3.0自带的StandardAnalyzer. 

package com.taobao.zhujiadun.basic;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index over the {@code .txt} files found under a data
 * directory.
 *
 * @author ht
 */
public class Indexer {

    private static final String INDEX_DIR = "D://test//index"; // where the index is stored
    private static final String DATA_DIR = "D://test//small//"; // where the small files live

    public static void main(String[] args) throws Exception {
        long start = new Date().getTime();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));
        long end = new Date().getTime();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Indexes every {@code .txt} file under {@code dataDir} (recursively) into
     * a newly created index at {@code indexDir}.
     *
     * @param indexDir directory that receives the index; the writer is opened
     *                 with {@code create = true}
     * @param dataDir  directory containing the files to index
     * @return the number of documents in the index after indexing
     * @throws IOException if {@code dataDir} is missing or not a directory, or
     *                     if reading the files or writing the index fails
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }
        // Constructor form used with Lucene 3.0: Directory + Analyzer + create flag + MaxFieldLength.
        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
                new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs();
            writer.optimize();
            return numIndexed;
        } finally {
            // Close even when indexing fails so the writer's resources are released.
            writer.close();
        }
    }

    /**
     * Walks {@code dir} recursively and indexes every file whose name ends
     * with {@code .txt}.
     *
     * @param writer the open index writer
     * @param dir    directory to scan
     * @throws IOException if indexing a file fails
     */
    private static void indexDirectory(IndexWriter writer, File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read;
            // skip it, mirroring how unreadable files are skipped.
            return;
        }
        for (File f : files) {
            if (f.isDirectory()) {
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Adds a single text file to the index as one document with a
     * {@code contents} field (read from the file) and a stored
     * {@code filename} field (the canonical path).
     *
     * @param writer the open index writer
     * @param f      file to index; hidden, missing or unreadable files are skipped
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    private static void indexFile(IndexWriter writer, File f) throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        System.out.println("Indexing " + f.getCanonicalPath());
        FileReader contents = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(new Field("contents", contents)); // Reader-based field constructor
            doc.add(new Field("filename", f.getCanonicalPath(),
                    Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Guarantees the file handle is released even if addDocument fails
            // before the reader has been consumed.
            contents.close();
        }
    }
}

 

3.查询类:查询“玄德”! 

package com.taobao.zhujiadun.basic;

import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a keyword query against an existing Lucene index and prints the
 * matching documents.
 *
 * @author ht
 */
public class Searcher {

    private static final String INDEX_DIR = "D://test//index//"; // location of the index
    private static final String KEYWORD = "玄德"; // search keyword
    private static final int TOP_NUM = 100; // collect at most the top 100 results

    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir + " does not exist or is not a directory.");
        }
        search(indexDir, KEYWORD);
    }

    /**
     * Searches the {@code contents} field of the index at {@code indexDir}
     * for {@code q} and prints the number of collected hits, each hit's
     * {@code filename} field and score information, and finally the total hit
     * count with the elapsed time.
     *
     * @param indexDir directory containing the index
     * @param q        query string, parsed by {@link QueryParser}
     * @throws Exception if the index cannot be opened or the query cannot be
     *                   parsed or executed
     */
    public static void search(File indexDir, String q) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            String field = "contents";
            // The parser takes a Version argument with Lucene 3.0.
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
                    new StandardAnalyzer(Version.LUCENE_CURRENT));
            Query query = parser.parse(q);
            // Results are gathered through a collector created by the factory method.
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

            long start = new Date().getTime(); // start time
            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            System.out.println(hits.length);
            for (int i = 0; i < hits.length; i++) {
                Document doc = is.doc(hits[i].doc); // fetch the stored document by id
                System.out.println(doc.getField("filename") + " " + hits[i].toString() + " ");
            }
            long end = new Date().getTime(); // end time

            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");
        } finally {
            // Close the searcher on every path, including when parsing or searching throws.
            is.close();
        }
    }
}

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

你可能感兴趣的:(搜索引擎)