lucene全文检索学习记录,附带源码——三种实现,超全超细致

Lucene学习记录

———三种实现,超全超细致

下载lucene3.6.0.zip http://download.csdn.net/detail/leilovegege/6800405 ,解压,将里边的lucene-core-3.6.0.jar等包拷贝到工程lib中。还需要中文分词器IKAnalyzer3.2.8.jar,以及Oracle数据库驱动ojdbc14.jar(Test3用到)。

至此环境搭配完成。

 

下面开始实现,只用java类,没连接web页面,所以在测试时执行java文件,在控制台进行测试。

工程原文件lucene36 http://download.csdn.net/detail/leilovegege/6804669

Test1检索绝对路径上的文件

package test1;  

 

import java.io.BufferedReader;  

import java.io.File;  

import java.io.FileInputStream;  

import java.io.FileReader;  

import java.io.IOException;  

import java.io.InputStreamReader;  

import java.util.Date;  

 

import org.apache.lucene.analysis.Analyzer;  

import org.apache.lucene.analysis.standard.StandardAnalyzer;  

import org.apache.lucene.document.Document;  

import org.apache.lucene.document.Field;  

import org.apache.lucene.document.FieldSelectorResult;  

import org.apache.lucene.document.NumericField;  

import org.apache.lucene.index.IndexWriter;  

import org.apache.lucene.index.IndexWriterConfig;  

import org.apache.lucene.index.IndexWriterConfig.OpenMode;  

import org.apache.lucene.store.Directory;  

import org.apache.lucene.store.FSDirectory;  

import org.apache.lucene.util.Version;  

 

public class TestFileIndexer {  

    public   static   void  main(String[] args)  throws  Exception  {             

        /*  指明要索引文件夹的位置,这里是C盘的source文件夹下  */          

        File fileDir  =   new  File( "E:\\Documents and Settings\\Administrator\\Workspaces\\MyEclipse 8.6\\lucene36\\source" );    

        /*  这里放索引文件的位置  */         

        File indexDir  =   new  File( ".\\index\\test1" );            

        Directory dir=FSDirectory.open(indexDir);//将索引存放在磁盘上  

        Analyzer lucenAnalyzer=new StandardAnalyzer(Version.LUCENE_36);//分析器  

        IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_36,lucenAnalyzer);  

        iwc.setOpenMode(OpenMode.CREATE);//创建新的索引文件create 表示创建或追加到已有索引库  

        IndexWriter indexWriter=new IndexWriter(dir,iwc);//把文档写入到索引库  

        File[] textFiles=fileDir.listFiles();//得到索引文件夹下所有文件  

        long startTime=new Date().getTime();  

        //增加document到检索去  

        for (int i = 0; i < textFiles.length; i++) {  

//          if (textFiles[i].isFile()&& textFiles[i].getName().endsWith(".txt")) {  

                System.out.println(":;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;");  

                System.out.println("File"+textFiles[i].getCanonicalPath()+"正在被索引...");  

                String temp=FileReaderAll(textFiles[i].getCanonicalPath(),"GBK");  

                System.out.println(temp);  

                Document document=new Document();  

                Field FieldPath=new Field("path",textFiles[i].getPath(),Field.Store.YES,Field.Index.NO);  

                Field FieldBody=new Field("body",temp,Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);  

                NumericField modifiField=new NumericField("modified");//所以keymodified  

                modifiField.setLongValue(fileDir.lastModified());  

                document.add(FieldPath);  

                document.add(FieldBody);  

                document.add(modifiField);  

                indexWriter.addDocument(document);  

                  

//          }  

        }  

        indexWriter.close();  

        //计算一下索引的时间  

        long endTime=new Date().getTime();  

        System.out.println("花了"+(endTime-startTime)+"毫秒把文档添加到索引里面去"+fileDir.getPath());  

    }  

    public static String FileReaderAll(String FileName,String charset)throws IOException{  

        BufferedReader reader=new BufferedReader(new InputStreamReader(new FileInputStream(FileName),charset));  

        String line=new String();   

        String temp=new String();  

        while ((line=reader.readLine())!=null) {  

            temp+=line;  

        }  

        reader.close();  

        return temp;  

    }     

 

package test1;

import java.io.BufferedReader;

import java.io.File;  

import java.io.IOException;  

import java.io.InputStreamReader;

import java.util.Scanner;

 

import org.apache.lucene.analysis.Analyzer;  

import org.apache.lucene.analysis.standard.StandardAnalyzer;  

import org.apache.lucene.document.Document;  

import org.apache.lucene.index.IndexReader;  

import org.apache.lucene.queryParser.ParseException;  

import org.apache.lucene.queryParser.QueryParser;  

import org.apache.lucene.search.IndexSearcher;  

import org.apache.lucene.search.Query;   

import org.apache.lucene.search.ScoreDoc;  

import org.apache.lucene.search.TopDocs;  

import org.apache.lucene.store.FSDirectory;  

import org.apache.lucene.util.Version;  

 

public class TestQuery {  

 

 

    public static void main(String[] args) throws ParseException, IOException {  

        String index="./index/test1";//搜索的索引路径  

        IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));  

        IndexSearcher searcher=new IndexSearcher(reader);//检索工具  

        ScoreDoc[] hits=null;

//        BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

//        String queryString=reader1.readLine().toString();  //搜索关键字 

//        Scanner sca=new Scanner(System.in);

//        String queryString=sca.next().toString();

        String queryString="测试";

        Query query=null;  

        Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_36);  

        try {  

            QueryParser qp=new QueryParser(Version.LUCENE_36,"body",analyzer);//用于解析用户输入的工具  

            query=qp.parse(queryString);  

        } catch (Exception e) {  

            // TODO: handle exception  

       }  

        if (searcher!=null) {  

            TopDocs results=searcher.search(query, 10);//只取排名前十的搜索结果  

            hits=results.scoreDocs;  

            Document document=null;

            if (hits.length>0) {  

                System.out.println("找到"+hits.length+"条结果");

                for (int i = 0; i < hits.length; i++) {  

                  document=searcher.doc(hits[i].doc);  

                  String body=document.get("body");  

                  String path=document.get("path");  

                  String modifiedtime=document.get("modifiField");  

                  System.out.print(body+"        ");   

                  System.out.println(path);   

              }  

            }else

              System.out.println("没查到结果");

            searcher.close();  

            reader.close();  

        }else

         System.out.println("没查找到索引");  

 

    }  

      

}

 

 

Test2检索相对路径上的文件

package test2;

 

import java.io.*;

import java.util.HashMap;

import java.util.Iterator;

import java.util.Set;

 

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopScoreDocCollector;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

import org.wltea.analyzer.lucene.IKQueryParser;

 

public class MyLucene {

         private static final File INDEX_PATH = new File(".\\index\\test2");             // 索引文件位置, 当前路径下的index文件

         private static final  String filePath = ".\\luceneDataSource\\test.txt";// 索引数据源文件位置,当前路径下的luceneDataSource\test.txt文件

         private static final Analyzer ANALYZER = new IKAnalyzer();                   // 中文分词器

        

         public static void main(String[] args){

                   /**

                    * 创建索引

                    */

                   File readFile = new File(filePath);                                                                             // 获取数据源文件

                   HashMap<String, String> words = readFile(readFile);

 

                   Document doc = null;

                   if (words != null) {

                                     try {

                                                        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, ANALYZER);

                                                        iwc.setOpenMode(OpenMode.CREATE);//创建新的索引文件create 表示创建或追加到已有索引库,没有这句话索引库会有重复的

                                                        IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_PATH), iwc);

                                                        Set<String> keys = words.keySet();

                  

                                                        for (Iterator<String> it = keys.iterator(); it.hasNext();) {

                                                                 String key = it.next();

                                                                 doc = new Document();

                                                                 Field index = new Field("index", key, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);

                                                                 Field contents = new Field("contents", words.get(key),Field.Store.YES, Field.Index.NO);

                                                                 doc.add(index);

                                                                 doc.add(contents);

                                                                 writer.addDocument(doc);

                                                        }

                                                        writer.close();      // 这里不关闭,建立索引会失败

                                     } catch (Exception e) {

                                                        e.printStackTrace();

                                     }

                   }

                   else

                                     System.out.println("文件读取错误");

                  

                  

         }

        

        

 

         /**

          * 判断索引库是已否创建

          */

         public boolean noIndex() {

                            File[] indexs = INDEX_PATH.listFiles();

                            if (indexs.length == 0) {

                                     return true;

                            } else {

                                     return false;

                            }

         }

        

         /**

          * 读取文件

          * @param file

          */

         public static HashMap<String, String> readFile(File file) {

                   InputStream in = null;

                   InputStreamReader inR = null;

                   BufferedReader br = null;

                   HashMap<String, String> wordsMap = new HashMap<String, String>();

                   try {

                                     in = new FileInputStream(file);

                                     inR = new InputStreamReader(in, "GBK");   //utf-8

                                     br = new BufferedReader(inR);

                                     String line;

                                     while ((line = br.readLine()) != null) {

                                               System.out.println(line);

                                               wordsMap.put(line.trim(), line.trim());

                                     }

                                     return wordsMap;

 

                   } catch (Exception e) {

                                     e.printStackTrace();

                                     return null;

                   } finally {

                            try {

                                     if (in != null)

                                               in.close();

                                     if (inR != null)

                                               inR.close();

                                     if (br != null)

                                               br.close();

                            } catch (Exception e) {

                                     e.printStackTrace();

                                     return null;

                            }

                   }

         }

        

         /**

          * 检索

          * @param queryStr

          * @param hitsPerPage

          */

         public void search(String queryStr) {

                  

                   try {

                            IndexReader reader = IndexReader.open(FSDirectory.open(INDEX_PATH));// 得到索引的目录

                            IndexSearcher searcher = new IndexSearcher(reader);

 

                            Query query = IKQueryParser.parse("index", queryStr);

                            TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);

                            searcher.search(query, collector);

                            ScoreDoc[] hits = collector.topDocs().scoreDocs;

 

                            if(hits.length > 0){

                                     System.out.println("检索词:"+queryStr+"\t共找到 "+hits.length+"条记录");

                                     for (int i = 0; i < hits.length; i++) {

                                               Document result = searcher.doc(hits[i].doc);

                                               System.out.println((i+1) +")" + "\n  index:" + result.get("index") + "\n  contents:" + result.get("contents"));

                                     }

                            }else{

                                     System.out.println("未找到结果");

                            }

                   } catch (Exception e) {

                            System.out.println("Exception");

                   }

         }

}

 

package test2;

 

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

 

public class TestMyLucene {

         public static void main(String[] args) throws IOException {

                   MyLucene myLucene = new MyLucene();

                   // 索引库是已否创建,如果没有则创建

                   if(myLucene.noIndex()){              

                            System.out.println("索引库还没有创建");

                   }else{

                             BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

                        String queryString=reader1.readLine().toString();  //搜索关键字

                        myLucene.search(queryString);

                   }

         }

}

 

 

Test3检索数据库中的数据(本例为oracle)

package test3;

 

import java.io.File;

import java.io.IOException;

import java.sql.Connection;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

import java.text.DateFormat;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.List;

 

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

 

public class IndexCreateUtill {

 

    private List<NewsItem> list;

 

    public void createIndexForMynews() throws IOException, ClassNotFoundException{

        //存放索引的文件夹

        File indxeFile = new File(".\\index\\test3");

        //创建Directory对象

        Directory directory =FSDirectory.open(indxeFile);

        //使用IKAnalyzer分词器

        Analyzer analyzer = new IKAnalyzer();

        //创建IndexWriterConfig

        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);

        //创建IndexWriter

        indexWriterConfig.setOpenMode(OpenMode.CREATE);

        IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);

        //从数据库中读取出所有的新闻记录以便进行索引的创建

        try {

//            DBSource dbSource =DBSource.getInstance();

//            Connection conn = dbSource.getConnection();

           Connection conn=Utils.getConnection();

            Statement stmt = null;

            ResultSet rs = null;

            String sql = "select * from t_newsitem";

            stmt = conn.createStatement();

            rs = stmt.executeQuery(sql);

 

            list = new ArrayList<NewsItem>();

            while(rs.next()){

                NewsItem newsItem = new NewsItem();

                newsItem.setId(rs.getInt("id"));

                newsItem.setNewsTitle(rs.getString("newsTitle"));

                newsItem.setNewsContent(rs.getString("newsContent"));

                newsItem.setPublishTime(rs.getTimestamp("publishTime"));

                newsItem.setResource(rs.getString("resourcer"));

                newsItem.setT_newsType_id(rs.getInt("t_newsType_id"));

                newsItem.setEditor(rs.getString("editor"));

                list.add(newsItem);

            }

            DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd HHmmss");

            for (int i=0;i<list.size();i++) {

                //建立一个lucene文档

                Document doc = new Document();

                //得到新闻标题

                String newsTitle = list.get(i).getNewsTitle();

                //得到新闻内容

                String newsContent = list.get(i).getNewsContent();

                //得到新闻事件

                String publishDate = dateFormat.format(list.get(i).getPublishTime());

                //得到新闻主键id

                String id = list.get(i).getId() + "";

                //将新闻标题加入文档,因为要搜索和高亮,所以indextokennizedTermVectorWITH_POSITIONS_OFFSETS

                doc.add(new Field("title" , newsTitle , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));

                //添加新闻内容至文档,与标题相似

                doc.add(new Field("content" , newsContent , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));

                //添加时间至文档,因为要按照此字段降序排列排序,所以tokenzied,不用高亮所以TermVectorno就行了

                doc.add(new Field("date" , publishDate , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.NO));

                //添加主键至文档,不分词,不高亮。

                doc.add(new Field("id" , id , Field.Store.YES , Field.Index.NO , Field.TermVector.NO));

                indexWriter.addDocument(doc);

            }

            indexWriter.close();

            Utils.closeAll(rs, stmt, conn);

        } catch (SQLException e) {

            // TODO Auto-generated catch block

            e.printStackTrace();

        }

    }

    public static void main(String[] args) throws Exception {

        IndexCreateUtill util  = new IndexCreateUtill();

        util.createIndexForMynews();

    }

}

 

package test3;

 

import java.io.Serializable;

import java.util.Date;

 

/**
 * Mutable bean holding one news record: id, title, content, publish time, source,
 * news-type id and editor.
 */
public class NewsItem implements Serializable {

    private static final long serialVersionUID = 1L;

    private Integer id;
    private String newsTitle;
    private String newsContent;
    private Date publishTime;
    private String resource;
    private Integer t_newsType_id;
    private String editor;

    /** Creates an item with every property unset ({@code null}). */
    public NewsItem() {
        this(null, null, null, null, null, null, null);
    }

    /**
     * Creates an item with every property supplied.
     *
     * @param id            record id
     * @param newsTitle     title text
     * @param newsContent   body text
     * @param publishTime   publish time; the reference is stored as given, not copied
     * @param resource      source of the news
     * @param t_newsType_id id of the news type
     * @param editor        editor name
     */
    public NewsItem(Integer id, String newsTitle, String newsContent,
            Date publishTime, String resource, Integer t_newsType_id, String editor) {
        this.id = id;
        this.newsTitle = newsTitle;
        this.newsContent = newsContent;
        this.publishTime = publishTime;
        this.resource = resource;
        this.t_newsType_id = t_newsType_id;
        this.editor = editor;
    }

    /** @return the record id */
    public Integer getId() {
        return this.id;
    }

    /** @param id the record id */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the title text */
    public String getNewsTitle() {
        return this.newsTitle;
    }

    /** @param newsTitle the title text */
    public void setNewsTitle(String newsTitle) {
        this.newsTitle = newsTitle;
    }

    /** @return the body text */
    public String getNewsContent() {
        return this.newsContent;
    }

    /** @param newsContent the body text */
    public void setNewsContent(String newsContent) {
        this.newsContent = newsContent;
    }

    /** @return the publish time (the stored reference itself) */
    public Date getPublishTime() {
        return this.publishTime;
    }

    /** @param publishTime the publish time; stored as given, not copied */
    public void setPublishTime(Date publishTime) {
        this.publishTime = publishTime;
    }

    /** @return the source of the news */
    public String getResource() {
        return this.resource;
    }

    /** @param resource the source of the news */
    public void setResource(String resource) {
        this.resource = resource;
    }

    /** @return the id of the news type */
    public Integer getT_newsType_id() {
        return this.t_newsType_id;
    }

    /** @param t_newsType_id the id of the news type */
    public void setT_newsType_id(Integer t_newsType_id) {
        this.t_newsType_id = t_newsType_id;
    }

    /** @return the editor name */
    public String getEditor() {
        return this.editor;
    }

    /** @param editor the editor name */
    public void setEditor(String editor) {
        this.editor = editor;
    }
}

 

package test3;

import java.io.File;  

import java.io.IOException;  

import java.util.Scanner;

 

import org.apache.lucene.analysis.Analyzer;  

import org.apache.lucene.analysis.standard.StandardAnalyzer;  

import org.apache.lucene.document.Document;  

import org.apache.lucene.index.IndexReader;  

import org.apache.lucene.queryParser.ParseException;  

import org.apache.lucene.queryParser.QueryParser;  

import org.apache.lucene.search.IndexSearcher;  

import org.apache.lucene.search.Query;  

import org.apache.lucene.search.ScoreDoc;  

import org.apache.lucene.search.TopDocs;  

import org.apache.lucene.store.FSDirectory;  

import org.apache.lucene.util.Version;  

 

public class TestQuery {  

 

 

    public static void main(String[] args) throws ParseException, IOException {  

        String index=".\\index\\test3";//搜索的索引路径  

        IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));  

        IndexSearcher searcher=new IndexSearcher(reader);//检索工具  

        ScoreDoc[] hits=null;

//        BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

//        String queryString=reader1.readLine().toString();  //搜索关键字 

        Scanner sca=new Scanner(System.in);

        String queryString=sca.next().toString();

        System.out.print("搜索关键词为"+queryString+",");

        Query query=null;  

        Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_36);  

        try {  

            QueryParser qp=new QueryParser(Version.LUCENE_36,"content",analyzer);//用于解析用户输入的工具  

            query=qp.parse(queryString);  

        } catch (Exception e) {  

            // TODO: handle exception  

       }  

        if (searcher!=null) {  

            TopDocs results=searcher.search(query, 10);//只取排名前十的搜索结果  

            hits=results.scoreDocs;  

            Document document=null;

            if (hits.length>0) {  

                System.out.println("找到"+hits.length+"条结果");

                for (int i = 0; i < hits.length; i++) {  

                     document=searcher.doc(hits[i].doc);  

                     String title=document.get("title");

                     String content=document.get("content");

                     String date=document.get("date"); 

                     String id=document.get("id");

                     System.out.println("标题:"+title);

                     System.out.println("内容:"+content);

                     System.out.println("日期:"+date);

                     System.out.println("ID:"+id);   

                }  

            }else

                System.out.println("没查到结果");

            searcher.close();  

            reader.close();  

        }else

           System.out.println("没查找到索引");  

 

    }  

      

}

 

package test3;

 

import java.sql.Connection;

import java.sql.DriverManager;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

 

/**
 * JDBC helpers: opens the Oracle connection used by the indexer and closes JDBC resources
 * without propagating close failures.
 */
public class Utils {

    private static final String DRIVER_CLASS = "oracle.jdbc.driver.OracleDriver";

    // host:port:SID -- the host must not be followed by a space.
    private static final String JDBC_URL = "jdbc:oracle:thin:@localhost:1521:orcl";

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    private static final String DB_USER = "hr";
    private static final String DB_PASSWORD = "orcl";

    /**
     * Opens a connection to the local Oracle instance.
     *
     * @return the open connection, or {@code null} if the driver class is missing or the
     *         connection attempt fails (the stack trace is printed in both cases)
     */
    public static Connection getConnection() {
        Connection con = null;
        try {
            Class.forName(DRIVER_CLASS);
            con = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return con;
    }

    /**
     * Closes the result set, then the statement, then the connection. Each argument may be
     * {@code null}; a failure closing one does not prevent closing the others.
     *
     * @param rs   result set to close, or {@code null}
     * @param ps   statement to close, or {@code null}
     * @param conn connection to close, or {@code null}
     * @throws SQLException declared for callers; close failures are caught and printed
     *         by the individual helpers
     */
    public static void closeAll(ResultSet rs, Statement ps, Connection conn) throws SQLException {
        closeResultSet(rs);
        closeStatement(ps);
        closeConnection(conn);
    }

    /**
     * Closes a connection, printing (not rethrowing) any {@link SQLException}.
     *
     * @param con connection to close, or {@code null} for a no-op
     */
    public static void closeConnection(Connection con) {
        if (con == null) {
            return;
        }
        try {
            con.close();
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Closes a statement, printing (not rethrowing) any {@link SQLException}.
     *
     * @param st statement to close, or {@code null} for a no-op
     */
    public static void closeStatement(Statement st) {
        if (st == null) {
            return;
        }
        try {
            st.close();
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Closes a result set, printing (not rethrowing) any {@link SQLException}.
     *
     * @param rs result set to close, or {@code null} for a no-op
     */
    public static void closeResultSet(ResultSet rs) {
        if (rs == null) {
            return;
        }
        try {
            rs.close();
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }
}


你可能感兴趣的:(Lucene,全文检索,lucene源码,lucene学习记录)