搜索引擎Lucene之目录索引

/*
 * Created on 2004-11-20
 *
 * index a dir file
 */
package demo;

/**使用Lucene生成目录索引
 *
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Command-line tool that walks a source directory recursively and adds every
 * file whose name ends in {@code .txt} to a Lucene index.
 *
 * <p>Recognised arguments:
 * <ul>
 *   <li>{@code -s <dir>} source directory to crawl (required)</li>
 *   <li>{@code -d <dir>} directory holding the index (required)</li>
 *   <li>{@code -v} print progress information</li>
 *   <li>{@code -a} append to an existing index instead of recreating it</li>
 * </ul>
 * Unrecognised arguments are ignored.
 */
public class DirSpider {
    /** Directory of the Lucene index, taken from {@code -d}. */
    private String indexDir;
    /** Root directory to crawl, taken from {@code -s}. */
    private String sSourceDir;
    /** Whether progress information is printed ({@code -v}). */
    private boolean verbose;
    /** Whether documents are appended to an existing index ({@code -a}). */
    private boolean incremental;

    /** Writer for the index; only open while {@link #go()} is running. */
    private IndexWriter index;

    /**
     * Entry point: builds a spider from the command line and runs it.
     * Any failure is reported with a stack trace and a non-zero exit status.
     */
    public static void main(String[] args) {
        try {
            DirSpider s = new DirSpider(args);
            s.go();
        } catch (Exception e) {
            e.printStackTrace();
            // Indexing failures used to call System.exit(-1) deep inside the
            // crawl; the exit now happens here, after the index was closed.
            System.exit(-1);
        }
    }

    /**
     * Parses the command-line arguments.
     *
     * @param argv command-line arguments, see the class description
     * @throws IllegalArgumentException if {@code -s} or {@code -d} is absent
     *         or is given without a value
     */
    public DirSpider(String argv[]) throws Exception {
        verbose = false;
        incremental = false;

        // One else-if chain: a value consumed by -s / -d must never be
        // re-examined as if it were an option itself.
        for (int i = 0; i < argv.length; i++) {
            String arg = argv[i];
            if (arg.equals("-s")) {
                sSourceDir = optionValue(argv, ++i,
                        "Missing required argument: -s [SourceDir dir]");
            } else if (arg.equals("-d")) {
                indexDir = optionValue(argv, ++i,
                        "Missing required argument: -d [index dir]");
            } else if (arg.equals("-v")) {
                verbose = true;
            } else if (arg.equals("-a")) {
                incremental = true;
            }
        }

        if (sSourceDir == null)
            throw new IllegalArgumentException("Missing required argument: -s [SourceDir dir]");

        if (indexDir == null)
            throw new IllegalArgumentException("Missing required argument: -d [index dir]");
    }

    /**
     * Returns {@code argv[pos]}, or fails with {@code message} when the option
     * was the last argument and therefore has no value.
     */
    private static String optionValue(String[] argv, int pos, String message) {
        if (pos >= argv.length) {
            throw new IllegalArgumentException(message);
        }
        return argv[pos];
    }

    /**
     * Creates (or, in incremental mode, opens) the index, indexes the source
     * tree, optimizes the index and closes it.
     *
     * @throws IllegalArgumentException if the source path is not a directory
     * @throws Exception if the index cannot be opened, written or closed
     */
    public void go() throws Exception {
        long start = System.currentTimeMillis();

        // Validate the source before touching the index: opening the writer in
        // non-incremental mode recreates the index, so a bad -s must fail first.
        File dir = new File(sSourceDir);
        if (!dir.isDirectory()) {
            throw new IllegalArgumentException(
                    "Source directory does not exist or is not a directory: " + sSourceDir);
        }

        // create the index directory -- or append to existing
        if (verbose) {
            System.out.println("Creating index in: " + indexDir);
            if (incremental) System.out.println("    - using incremental mode");
        }
        index = new IndexWriter(new File(indexDir), new StandardAnalyzer(),
                                !incremental);
        try {
            indexDir(dir);
            index.optimize();
        } finally {
            // Always release the writer (and its lock), even when the crawl fails.
            index.close();
        }

        if (verbose)
            System.out.println("index complete in :" + (System.currentTimeMillis() - start) / 1000);
    }

    /**
     * Recursively indexes every {@code .txt} file below {@code dir}.
     *
     * @throws IOException if a document cannot be added to the index
     */
    private void indexDir(File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() yields null when the directory cannot be read.
            System.err.println("Skipping unreadable directory: " + dir);
            return;
        }

        for (int i = 0; i < files.length; i++) {
            File f = files[i];
            if (f.isDirectory()) {
                indexDir(f);  // recurse
            } else if (f.getName().endsWith(".txt")) { // only plain-text files are indexed for now
                indexFile(f);
            }
        }
    }

    /**
     * Loads one file and adds it to the index as a document with the fields
     * {@code url}, {@code title} and {@code content}. A file that could not be
     * loaded is reported and skipped.
     *
     * @throws IOException if the document cannot be added to the index
     */
    private void indexFile(File item) throws IOException {
        if (verbose) System.out.println("Adding FILE: " + item);

        News news = loadFile(item);
        if (news == null || news.body == null) {
            System.out.println("索引数据为空!");
            return;
        }

        Document doc = new Document();
        // The URL is stored verbatim so it can be returned with search hits.
        doc.add(new Field("url", news.URL,
                Field.Store.YES, Field.Index.UN_TOKENIZED,
                Field.TermVector.NO));
        doc.add(new Field("title", news.title,
                Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(new Field("content", news.body.toString(),
                Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        System.out.println(news);

        // A failure here propagates to go(), which closes the writer before
        // the error reaches the caller.
        index.addDocument(doc);
    }

    /**
     * Reads a text file into a {@link News}: the first line becomes the title,
     * every following line is appended to the body with a trailing newline.
     * The file is decoded with the platform default charset.
     *
     * @return the loaded item, or {@code null} if the file could not be read
     */
    private static News loadFile(File sSourceFile) {
        News news = new News();
        // The URL is a fixed web prefix followed by the file path as given.
        //news.URL = "http://www.lietu.com/segtest/"+sSourceFile;
        news.URL = "http://localhost:8080/Chapter2WebPart/" + sSourceFile;

        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(sSourceFile));
            String s = br.readLine();
            if (s != null) {
                news.title = s;
                System.out.println(s);
                while ((s = br.readLine()) != null) {
                    news.body.append(s);
                    news.body.append('\n');
                }
            }
        } catch (Exception e) {
            // Best effort: one unreadable file must not abort the whole crawl,
            // but a half-read file must not be indexed either.
            e.printStackTrace();
            return null;
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }

        return news;
    }
}
/**
 * Simple mutable holder for one indexed text item: its URL, its title and its
 * body text. All three start out empty.
 */
class News {
    /** Address under which the item is published. */
    public String URL = "";
    /** Title of the item. */
    public String title = "";
    /** Body text of the item. */
    public StringBuffer body = new StringBuffer();

    /** Creates an item with empty URL, title and body. */
    public News() {
    }

    /** Renders the item as {@code URL :<url> title :<title> body :<body>}. */
    public String toString() {
        StringBuffer text = new StringBuffer();
        text.append("URL :").append(URL);
        text.append(" title :").append(title);
        text.append(" body :").append(body.toString());
        return text.toString();
    }
}
package com.lietu.web.test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import com.lietu.web.bean.example.Article;

/**
 * Searches the Lucene index stored in {@code C:/index} by document title.
 */
public class MySearche {

    public MySearche() {
        super();
    }

    /** Runs a sample query against the index and prints the hits. */
    public static void main(String[] args) {
        new MySearche().indexSeaches("中");
    }

    /**
     * Parses {@code str} as a query on the {@code title} field and returns the
     * matching documents.
     *
     * @param str query text, in Lucene query-parser syntax
     * @return a collection of {@link Article} objects carrying the stored
     *         {@code title} and {@code content} of each hit; empty when nothing
     *         matches or when the query or the index cannot be processed
     */
    public Collection indexSeaches(String str) {
        System.out.println("str:" + str);
        Collection collections = new ArrayList();
        IndexSearcher indexSearcher = null;
        try {
            // Open the index for searching.
            indexSearcher = new IndexSearcher("C:/index");
            // Parse the user input against the "title" field with the
            // StandardAnalyzer.
            QueryParser queryparser = new QueryParser("title",
                    new StandardAnalyzer());
            Query query = queryparser.parse(str);
            // Run the search.
            Hits hit = indexSearcher.search(query);
            System.out.println("hit.length():" + hit.length());
            for (int i = 0; i < hit.length(); i++) {
                // Read each stored field once and reuse it for the result and
                // for the console output.
                String title = hit.doc(i).get("title");
                String content = hit.doc(i).get("content");

                Article article = new Article();
                article.setTitle(title);
                article.setContent(content);
                collections.add(article);

                System.out.println("标题:" + title);
                System.out.println("内容:" + content);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (CorruptIndexException e) {
            // A corrupt index used to be ignored silently; report it like the
            // other failures.
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the index files whether or not the search succeeded.
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return collections;
    }
}


你可能感兴趣的:(搜索引擎Lucene之目录索引)