Lucene learning notes
———Three implementations, complete and detailed
Download lucene3.6.0.zip from http://download.csdn.net/detail/leilovegege/6800405 , unzip it, and copy lucene-core-3.6.0.jar and the other jars into the project's lib folder. You also need the Chinese analyzer IKAnalyzer3.2.8.jar and the Oracle driver ojdbc14.jar.
With that, the environment setup is complete.
The implementations below are plain Java classes with no web front end, so each test is run as a Java program and exercised from the console.
Project source (lucene36): http://download.csdn.net/detail/leilovegege/6804669
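Before trying the three tests, a minimal in-memory smoke test can confirm that the Lucene jars are actually on the classpath. This is only a sketch; the package and class names below are made up and are not part of the lucene36 project.
package test0;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
public class SmokeTest {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory(); // in-memory index, nothing is written to disk
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
        Document doc = new Document();
        doc.add(new Field("body", "hello lucene", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
        IndexReader reader = IndexReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs hits = searcher.search(new TermQuery(new Term("body", "hello")), 10);
        System.out.println("hits: " + hits.totalHits); // should print "hits: 1" if everything is wired up
        searcher.close();
        reader.close();
    }
}
If this prints "hits: 1", the environment is ready for the three tests below.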
Test1: indexing and searching files at an absolute path
package test1;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class TestFileIndexer {
public static void main(String[] args) throws Exception {
/* Directory containing the files to be indexed (the source folder under the project workspace) */
File fileDir = new File( "E:\\Documents and Settings\\Administrator\\Workspaces\\MyEclipse 8.6\\lucene36\\source" );
/* Directory where the index files will be stored */
File indexDir = new File( ".\\index\\test1" );
Directory dir=FSDirectory.open(indexDir);// store the index on disk
Analyzer lucenAnalyzer=new StandardAnalyzer(Version.LUCENE_36);// analyzer
IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_36,lucenAnalyzer);
iwc.setOpenMode(OpenMode.CREATE);// OpenMode.CREATE rebuilds the index from scratch; use CREATE_OR_APPEND to add to an existing index
IndexWriter indexWriter=new IndexWriter(dir,iwc);// writes documents into the index
File[] textFiles=fileDir.listFiles();// all files directly under the source folder
long startTime=new Date().getTime();
// add a Document to the index for each file
for (int i = 0; i < textFiles.length; i++) {
// if (textFiles[i].isFile()&& textFiles[i].getName().endsWith(".txt")) {
System.out.println(":;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;");
System.out.println("File"+textFiles[i].getCanonicalPath()+"正在被索引...");
String temp=FileReaderAll(textFiles[i].getCanonicalPath(),"GBK");
System.out.println(temp);
Document document=new Document();
Field FieldPath=new Field("path",textFiles[i].getPath(),Field.Store.YES,Field.Index.NO);
Field FieldBody=new Field("body",temp,Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);
NumericField modifiField=new NumericField("modified",Field.Store.YES,true);// store and index the last-modified time so it can be retrieved and range-queried at search time
modifiField.setLongValue(textFiles[i].lastModified());// last-modified time of the file being indexed, not of the folder
document.add(FieldPath);
document.add(FieldBody);
document.add(modifiField);
indexWriter.addDocument(document);
// }
}
indexWriter.close();
// report how long indexing took
long endTime=new Date().getTime();
System.out.println("Adding the documents under "+fileDir.getPath()+" to the index took "+(endTime-startTime)+" ms");
}
public static String FileReaderAll(String FileName,String charset)throws IOException{
BufferedReader reader=new BufferedReader(new InputStreamReader(new FileInputStream(FileName),charset));
String line=new String();
String temp=new String();
while ((line=reader.readLine())!=null) {
temp+=line;
}
reader.close();
return temp;
}
}
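TestFileIndexer only picks up files lying directly under the source folder, and the commented-out check shows the optional .txt filter. If sub-folders should be indexed as well, a recursive helper along the following lines could be added to the class. This is a sketch: it reuses FileReaderAll and the same three fields as above, and the .txt filter and GBK encoding are assumptions carried over from the original code.
// Sketch: recurse into sub-folders and index every .txt file with the same fields as in main().
public static void indexDirectory(IndexWriter writer, File dir) throws IOException {
    File[] entries = dir.listFiles();
    if (entries == null) return; // not a directory, or not readable
    for (File entry : entries) {
        if (entry.isDirectory()) {
            indexDirectory(writer, entry); // descend into the sub-folder
        } else if (entry.getName().toLowerCase().endsWith(".txt")) {
            Document document = new Document();
            document.add(new Field("path", entry.getPath(), Field.Store.YES, Field.Index.NO));
            document.add(new Field("body", FileReaderAll(entry.getCanonicalPath(), "GBK"),
                    Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            NumericField modified = new NumericField("modified", Field.Store.YES, true);
            modified.setLongValue(entry.lastModified());
            document.add(modified);
            writer.addDocument(document);
        }
    }
}
Calling indexDirectory(indexWriter, fileDir) in main() would then replace the flat loop.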
package test1;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Scanner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class TestQuery {
public static void main(String[] args) throws ParseException, IOException {
String index="./index/test1";//搜索的索引路径
IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));
IndexSearcher searcher=new IndexSearcher(reader);// search helper
ScoreDoc[] hits=null;
// BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));
// String queryString=reader1.readLine().toString(); // read the search keyword from the console
// Scanner sca=new Scanner(System.in);
// String queryString=sca.next().toString();
String queryString="测试";// hard-coded search keyword ("test")
Query query=null;
Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_36);
try {
QueryParser qp=new QueryParser(Version.LUCENE_36,"body",analyzer);// parses the user's query string
query=qp.parse(queryString);
} catch (Exception e) {
e.printStackTrace();
}
if (searcher!=null&&query!=null) {
TopDocs results=searcher.search(query, 10);// keep only the top 10 results
hits=results.scoreDocs;
Document document=null;
if (hits.length>0) {
System.out.println("找到"+hits.length+"条结果");
for (int i = 0; i < hits.length; i++) {
document=searcher.doc(hits[i].doc);
String body=document.get("body");
String path=document.get("path");
String modifiedtime=document.get("modified");// must match the field name used at index time
System.out.print(body+" ");
System.out.println(path+" (modified: "+modifiedtime+")");
}
}else
System.out.println("没查到结果");
searcher.close();
reader.close();
}else
System.out.println("没查找到索引");
}
}
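Because the last-modified time is stored in an indexed NumericField, the search side can also restrict hits by modification time with a NumericRangeQuery. The fragment below is a sketch that would slot into TestQuery's main after the keyword query has been parsed; the one-day cutoff is an arbitrary example.
// Requires: import org.apache.lucene.search.BooleanClause;
//           import org.apache.lucene.search.BooleanQuery;
//           import org.apache.lucene.search.NumericRangeQuery;
long dayAgo = System.currentTimeMillis() - 24L * 60 * 60 * 1000;
Query recent = NumericRangeQuery.newLongRange("modified", dayAgo, null, true, true); // open-ended upper bound
BooleanQuery combined = new BooleanQuery();
combined.add(query, BooleanClause.Occur.MUST);  // the keyword query parsed above
combined.add(recent, BooleanClause.Occur.MUST); // AND: only files modified within the last day
TopDocs recentResults = searcher.search(combined, 10);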
Test2: indexing and searching files at a relative path
package test2;
import java.io.*;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
public class MyLucene {
private static final File INDEX_PATH = new File(".\\index\\test2"); // index location: the index\test2 folder under the current path
private static final String filePath = ".\\luceneDataSource\\test.txt";// data source: the luceneDataSource\test.txt file under the current path
private static final Analyzer ANALYZER = new IKAnalyzer(); // Chinese analyzer
public static void main(String[] args){
/**
 * Create the index
 */
File readFile = new File(filePath); // data source file
HashMap<String, String> words = readFile(readFile);
Document doc = null;
if (words != null) {
try {
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, ANALYZER);
iwc.setOpenMode(OpenMode.CREATE);// OpenMode.CREATE rebuilds the index from scratch; without it, re-running this class keeps appending duplicate documents to the index
IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_PATH), iwc);
Set<String> keys = words.keySet();
for (Iterator<String> it = keys.iterator(); it.hasNext();) {
String key = it.next();
doc = new Document();
Field index = new Field("index", key, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);
Field contents = new Field("contents", words.get(key),Field.Store.YES, Field.Index.NO);
doc.add(index);
doc.add(contents);
writer.addDocument(doc);
}
writer.close(); // the writer must be closed here, otherwise the index is never committed and index creation fails
} catch (Exception e) {
e.printStackTrace();
}
}
else
System.out.println("文件读取错误");
}
/**
 * Check whether the index has been created yet
 */
public boolean noIndex() {
File[] indexs = INDEX_PATH.listFiles();
// listFiles() returns null when the index directory does not exist yet
return indexs == null || indexs.length == 0;
}
/**
 * Read the data source file line by line
 * @param file the data source file
 */
public static HashMap<String, String> readFile(File file) {
InputStream in = null;
InputStreamReader inR = null;
BufferedReader br = null;
HashMap<String, String> wordsMap = new HashMap<String, String>();
try {
in = new FileInputStream(file);
inR = new InputStreamReader(in, "GBK"); // use "UTF-8" here if the data source file is UTF-8 encoded
br = new BufferedReader(inR);
String line;
while ((line = br.readLine()) != null) {
System.out.println(line);
wordsMap.put(line.trim(), line.trim());
}
return wordsMap;
} catch (Exception e) {
e.printStackTrace();
return null;
} finally {
try {
if (in != null)
in.close();
if (inR != null)
inR.close();
if (br != null)
br.close();
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
}
/**
 * Search the index
 * @param queryStr the query keyword
 */
public void search(String queryStr) {
try {
IndexReader reader = IndexReader.open(FSDirectory.open(INDEX_PATH));// open the index directory
IndexSearcher searcher = new IndexSearcher(reader);
Query query = IKQueryParser.parse("index", queryStr);
TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
searcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
if(hits.length > 0){
System.out.println("检索词:"+queryStr+"\t共找到 "+hits.length+"条记录");
for (int i = 0; i < hits.length; i++) {
Document result = searcher.doc(hits[i].doc);
System.out.println((i+1) +")" + "\n index:" + result.get("index") + "\n contents:" + result.get("contents"));
}
}else{
System.out.println("No results found");
}
searcher.close();
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
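noIndex() above only checks whether the index folder contains any files. Lucene can also check whether those files actually form an index: IndexReader.indexExists(Directory) returns true only when real index files are present. A sketch of an alternative check for the same class (the method name hasIndex is made up; the imports already present in MyLucene are enough):
public boolean hasIndex() {
    try {
        return IndexReader.indexExists(FSDirectory.open(INDEX_PATH)); // true only if a usable Lucene index exists
    } catch (IOException e) {
        return false; // treat a missing or unreadable directory as "no index"
    }
}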
package test2;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
public class TestMyLucene {
public static void main(String[] args) throws IOException {
MyLucene myLucene = new MyLucene();
// check whether the index has been created; if not, MyLucene.main must be run first to build it
if(myLucene.noIndex()){
System.out.println("The index has not been created yet, run MyLucene first");
}else{
BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));
String queryString=reader1.readLine().toString(); // read the search keyword from the console
myLucene.search(queryString);
}
}
}
Test3: indexing and searching data from a database (Oracle in this example)
package test3;
import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class IndexCreateUtill {
private List<NewsItem> list;
public void createIndexForMynews() throws IOException, ClassNotFoundException{
// folder where the index is stored
File indexFile = new File(".\\index\\test3");
// create the Directory object
Directory directory =FSDirectory.open(indexFile);
// use the IKAnalyzer Chinese analyzer
Analyzer analyzer = new IKAnalyzer();
// create the IndexWriterConfig; OpenMode.CREATE rebuilds the index from scratch
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
indexWriterConfig.setOpenMode(OpenMode.CREATE);
// create the IndexWriter
IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
// read all news records from the database so they can be indexed
try {
// DBSource dbSource =DBSource.getInstance();
// Connection conn = dbSource.getConnection();
Connection conn=Utils.getConnection();
Statement stmt = null;
ResultSet rs = null;
String sql = "select * from t_newsitem";
stmt = conn.createStatement();
rs = stmt.executeQuery(sql);
list = new ArrayList<NewsItem>();
while(rs.next()){
NewsItem newsItem = new NewsItem();
newsItem.setId(rs.getInt("id"));
newsItem.setNewsTitle(rs.getString("newsTitle"));
newsItem.setNewsContent(rs.getString("newsContent"));
newsItem.setPublishTime(rs.getTimestamp("publishTime"));
newsItem.setResource(rs.getString("resourcer"));
newsItem.setT_newsType_id(rs.getInt("t_newsType_id"));
newsItem.setEditor(rs.getString("editor"));
list.add(newsItem);
}
DateFormat dateFormat = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒");
for (int i=0;i<list.size();i++) {
// build a Lucene document
Document doc = new Document();
// news title
String newsTitle = list.get(i).getNewsTitle();
// news content
String newsContent = list.get(i).getNewsContent();
// news publish time
String publishDate = dateFormat.format(list.get(i).getPublishTime());
// news primary key id
String id = list.get(i).getId() + "";
// add the title; it will be searched and highlighted, so it is ANALYZED with TermVector.WITH_POSITIONS_OFFSETS
doc.add(new Field("title" , newsTitle , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));
// add the content, with the same settings as the title
doc.add(new Field("content" , newsContent , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));
// add the publish time; it is only used for sorting in descending order, so it must be a single untokenized term (NOT_ANALYZED) and needs no term vector
doc.add(new Field("date" , publishDate , Field.Store.YES , Field.Index.NOT_ANALYZED , Field.TermVector.NO));
// add the primary key; it is only stored for retrieval, neither indexed nor highlighted
doc.add(new Field("id" , id , Field.Store.YES , Field.Index.NO , Field.TermVector.NO));
indexWriter.addDocument(doc);
}
indexWriter.close();
Utils.closeAll(rs, stmt, conn);
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
IndexCreateUtill util = new IndexCreateUtill();
util.createIndexForMynews();
}
}
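The date field is indexed as a single untokenized term precisely so that results can be ordered by publish time. Lucene 3.6's IndexSearcher accepts a Sort for this; the sketch below shows how the TestQuery class further down could ask for the newest hits first. searcher and query are the variables already used there, and the date pattern above sorts correctly as a plain string because every component is zero-padded.
// Requires: import org.apache.lucene.search.Sort;
//           import org.apache.lucene.search.SortField;
Sort byDateDesc = new Sort(new SortField("date", SortField.STRING, true)); // true = reverse order, i.e. newest first
TopDocs sorted = searcher.search(query, null, 10, byDateDesc);             // top 10 hits ordered by publish time, descending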
package test3;
import java.io.Serializable;
import java.util.Date;
public class NewsItem implements Serializable{
private static final long serialVersionUID = 1L;
private Integer id ;
private String newsTitle ;
private String newsContent;
private Date publishTime;
private String resource;
private Integer t_newsType_id;
private String editor;
public NewsItem() {
}
public NewsItem(Integer id, String newsTitle, String newsContent,
Date publishTime, String resource, Integer t_newsType_id, String editor) {
super();
this.id = id;
this.newsTitle = newsTitle;
this.newsContent = newsContent;
this.publishTime = publishTime;
this.resource = resource;
this.t_newsType_id = t_newsType_id;
this.editor = editor;
}
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public String getNewsTitle() {
return newsTitle;
}
public void setNewsTitle(String newsTitle) {
this.newsTitle = newsTitle;
}
public String getNewsContent() {
return newsContent;
}
public void setNewsContent(String newsContent) {
this.newsContent = newsContent;
}
public Date getPublishTime() {
return publishTime;
}
public void setPublishTime(Date publishTime) {
this.publishTime = publishTime;
}
public String getResource() {
return resource;
}
public void setResource(String resource) {
this.resource = resource;
}
public Integer getT_newsType_id() {
return t_newsType_id;
}
public void setT_newsType_id(Integer t_newsType_id) {
this.t_newsType_id = t_newsType_id;
}
public String getEditor() {
return editor;
}
public void setEditor(String editor) {
this.editor = editor;
}
}
package test3;
import java.io.File;
import java.io.IOException;
import java.util.Scanner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class TestQuery {
public static void main(String[] args) throws ParseException, IOException {
String index=".\\index\\test3";//搜索的索引路径
IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));
IndexSearcher searcher=new IndexSearcher(reader);// search helper
ScoreDoc[] hits=null;
// BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));
// String queryString=reader1.readLine().toString(); // read the search keyword from the console
Scanner sca=new Scanner(System.in);
String queryString=sca.next().toString();
System.out.print("Search keyword: "+queryString+", ");
Query query=null;
Analyzer analyzer=new IKAnalyzer();// use the same analyzer that built the index; mixing IKAnalyzer at index time with StandardAnalyzer at query time makes Chinese terms miss
try {
QueryParser qp=new QueryParser(Version.LUCENE_36,"content",analyzer);// parses the user's query string
query=qp.parse(queryString);
} catch (Exception e) {
e.printStackTrace();
}
if (searcher!=null&&query!=null) {
TopDocs results=searcher.search(query, 10);// keep only the top 10 results
hits=results.scoreDocs;
Document document=null;
if (hits.length>0) {
System.out.println("找到"+hits.length+"条结果");
for (int i = 0; i < hits.length; i++) {
document=searcher.doc(hits[i].doc);
String title=document.get("title");
String content=document.get("content");
String date=document.get("date");
String id=document.get("id");
System.out.println("标题:"+title);
System.out.println("内容:"+content);
System.out.println("日期:"+date);
System.out.println("ID:"+id);
}
}else
System.out.println("没查到结果");
searcher.close();
reader.close();
}else
System.out.println("没查找到索引");
}
}
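The indexer stores term vectors with positions and offsets on title and content so that hits can be highlighted. Doing so needs lucene-highlighter-3.6.0.jar (from the contrib folder of the same download) on the classpath, which is an extra assumption beyond the jars listed at the top. A sketch of what could go inside the result loop of TestQuery, right after content is read from the document:
// Requires lucene-highlighter-3.6.0.jar plus:
//   import org.apache.lucene.search.highlight.Highlighter;
//   import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
//   import org.apache.lucene.search.highlight.QueryScorer;
//   import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));
try {
    String fragment = highlighter.getBestFragment(analyzer, "content", content); // best-matching snippet, query terms wrapped in <b> tags
    System.out.println("Snippet: " + (fragment != null ? fragment : content));   // fall back to the full content when nothing matched
} catch (InvalidTokenOffsetsException e) {
    e.printStackTrace();
}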
package test3;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
public class Utils {
public static Connection getConnection() {
Connection con = null;
try {
Class.forName("oracle.jdbc.driver.OracleDriver");
con = DriverManager.getConnection("jdbc:oracle:thin:@localhost:1521:orcl", "hr", "orcl");// no space is allowed in the host part of the JDBC URL
}catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return con;
}
public static void closeAll(ResultSet rs, Statement ps,Connection conn) throws SQLException{
closeResultSet(rs);
closeStatement(ps);
closeConnection(conn);
}
public static void closeConnection(Connection con) {
try {
if (con != null) {
con.close();
}
}catch (SQLException ex) {
ex.printStackTrace();
}
}
public static void closeStatement(Statement st) {
try {
if (st != null) {
st.close();
}
}catch (SQLException ex) {
ex.printStackTrace();
}
}
public static void closeResultSet(ResultSet rs) {
try {
if (rs != null) {
rs.close();
}
}catch (SQLException ex) {
ex.printStackTrace();
}
}
}
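Since every hit stores the primary key id, the full news record can be reloaded from Oracle after a search instead of relying on the copy kept in the index. The method below is a sketch that could be added to Utils; it assumes the same t_newsitem table and column names used in IndexCreateUtill, and needs java.sql.PreparedStatement added to the imports.
// Sketch: reload the full row for one Lucene hit via its stored primary key.
public static NewsItem loadNewsById(int id) throws SQLException {
    Connection conn = getConnection();
    PreparedStatement ps = null;
    ResultSet rs = null;
    try {
        ps = conn.prepareStatement("select * from t_newsitem where id = ?");
        ps.setInt(1, id);
        rs = ps.executeQuery();
        if (rs.next()) {
            NewsItem item = new NewsItem();
            item.setId(rs.getInt("id"));
            item.setNewsTitle(rs.getString("newsTitle"));
            item.setNewsContent(rs.getString("newsContent"));
            item.setPublishTime(rs.getTimestamp("publishTime"));
            return item;
        }
        return null; // no row with that id
    } finally {
        closeAll(rs, ps, conn);
    }
}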