Reference: http://footman265.iteye.com/blog/849744
After a day and a half I finally have a working Lucene demo of everything I wanted: building an incremental index over a database table, deleting index entries by id, single-field queries, multi-field queries, multi-condition queries, and highlighting the matched keywords in the results. I tidied the code up later today. Since it looks like I won't be leaving the office any time soon, here are the results of the demo, listed one by one...
Theory reference: http://lianj-lee.iteye.com/category/69005?show_full=true
Indexing a database with Lucene 3.0: http://269181927.iteye.com/blog/789779
1. Required files (see the attachments)
Dependencies:
lucene-core-2.4.0.jar - the Lucene core library
lucene-highlighter-2.4.0.jar - the highlighting module
IKAnalyzer2.0.2OBF.jar - the IK analyzer (dictionary-based Chinese segmentation)
mysql-connector-java-5.0.3-bin - the MySQL JDBC driver
Database table:
pd_ugc.sql (in the lucenetest database)
Class files:
In the attachments index.rar and test.rar; unzip them into the src directory of your Java project.
2. Building an incremental index over the database
Reference: http://www.blogjava.net/laoding/articles/279230.html
package index;

//--------------------- Change Logs----------------------
// <p>@author zhiqiang.zhang Initial Created at 2010-12-23<p>
//-------------------------------------------------------
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

// Incremental indexing
/*
 * Idea: on the first run, query every row of the table, index each record, and write the id of the
 * last record to storeId.txt. When new rows are inserted later, there is no need to re-index everything:
 * use the id stored in storeId.txt to select only the newly inserted rows, index just those,
 * and append the new documents to the existing index files.
 */
public class IncrementIndex {

    public static void main(String[] args) {
        try {
            String path = "E:\\workspace2\\Test\\lucene_test\\poiIdext";            // where the index files live
            String storeIdPath = "E:\\workspace2\\Test\\lucene_test\\storeId.txt";  // where the last indexed id is stored
            Date date1 = new Date();
            String storeId = getStoreId(storeIdPath);
            ResultSet rs = getResult(storeId);
            System.out.println("Building the index...");
            indexBuilding(path, storeIdPath, rs);
            Date date2 = new Date();
            System.out.println("Elapsed: " + (date2.getTime() - date1.getTime()) + "ms");
            storeId = getStoreId(storeIdPath);
            System.out.println(storeId); // print the id stored by this run
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void buildIndex(String indexFile, String storeIdFile) {
        try {
            String storeId = getStoreId(storeIdFile);
            ResultSet rs = getResult(storeId);
            indexBuilding(indexFile, storeIdFile, rs);
            storeId = getStoreId(storeIdFile);
            System.out.println(storeId); // print the id stored by this run
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static ResultSet getResult(String storeId) throws Exception {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        String url = "jdbc:mysql://localhost:3306/lucenetest";
        String userName = "root";
        String password = "****";
        Connection conn = DriverManager.getConnection(url, userName, password);
        Statement stmt = conn.createStatement();
        // Only fetch rows whose id is greater than the id stored by the previous run.
        // (The connection is left open on purpose: the caller still has to consume the ResultSet.)
        String sql = "select * from pd_ugc where id > '" + storeId + "' order by id";
        return stmt.executeQuery(sql);
    }

    public static boolean indexBuilding(String path, String storeIdPath, ResultSet rs) {
        try {
            Analyzer luceneAnalyzer = new StandardAnalyzer();
            // Check the stored id to decide between a fresh index and an incremental one
            boolean isEmpty = true;
            try {
                File file = new File(storeIdPath);
                if (!file.exists()) {
                    file.createNewFile();
                }
                FileReader fr = new FileReader(storeIdPath);
                BufferedReader br = new BufferedReader(fr);
                if (br.readLine() != null) {
                    isEmpty = false;
                }
                br.close();
                fr.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            // isEmpty == false means append to the existing index (incremental run)
            IndexWriter writer = new IndexWriter(path, luceneAnalyzer, isEmpty);
            String storeId = "";
            boolean indexFlag = false;
            String id;
            String name;
            String address;
            String citycode;
            while (rs.next()) {
                id = rs.getInt("id") + "";
                name = rs.getString("name");
                address = rs.getString("address");
                citycode = rs.getString("citycode");
                writer.addDocument(Document(id, name, address, citycode));
                storeId = id; // keep overwriting storeId with the current id; crude, but good enough here
                indexFlag = true;
            }
            writer.optimize();
            writer.close();
            if (indexFlag) {
                // persist the id of the last indexed row
                writeStoreId(storeIdPath, storeId);
            }
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Error: " + e.getClass() + "\n message: " + e.getMessage());
            return false;
        }
    }

    public static Document Document(String id, String name, String address, String citycode) {
        Document doc = new Document();
        doc.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));         // query field
        doc.add(new Field("address", address, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("citycode", citycode, Field.Store.YES, Field.Index.TOKENIZED)); // query field
        return doc;
    }

    // Read the id stored on disk
    public static String getStoreId(String path) {
        String storeId = "";
        try {
            File file = new File(path);
            if (!file.exists()) {
                file.createNewFile();
            }
            FileReader fr = new FileReader(path);
            BufferedReader br = new BufferedReader(fr);
            storeId = br.readLine();
            if (storeId == null || "".equals(storeId)) {
                storeId = "0";
            }
            br.close();
            fr.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return storeId;
    }

    // Write the id to the file on disk
    public static boolean writeStoreId(String path, String storeId) {
        boolean b = false;
        try {
            File file = new File(path);
            if (!file.exists()) {
                file.createNewFile();
            }
            FileWriter fw = new FileWriter(path);
            PrintWriter out = new PrintWriter(fw);
            out.write(storeId);
            out.close();
            fw.close();
            b = true;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return b;
    }
}
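A note on the indexBuilding loop above: it keeps overwriting storeId with the id of every row, which the comment itself calls a crude shortcut. A minimal alternative sketch, using plain JDBC with the same pd_ugc table and connection settings (the class name IncrementIndexJdbcSketch is made up for illustration): let MySQL report the high-water mark and bind the previous id with a PreparedStatement instead of string concatenation.

package index;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;

// Hypothetical helper, not part of the original demo.
public class IncrementIndexJdbcSketch {

    // Rows inserted since lastId, fetched with a PreparedStatement instead of
    // string concatenation. Table/column names (pd_ugc, id) come from IncrementIndex.
    public static ResultSet newRowsSince(Connection conn, long lastId) throws Exception {
        PreparedStatement ps = conn.prepareStatement(
                "select * from pd_ugc where id > ? order by id");
        ps.setLong(1, lastId);
        return ps.executeQuery();
    }

    // Read the current high-water mark straight from MySQL, so the indexing
    // loop no longer has to remember the id of the last row it saw.
    public static long currentMaxId(Connection conn) throws Exception {
        Statement stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery("select max(id) from pd_ugc");
        return rs.next() ? rs.getLong(1) : 0L;
    }

    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/lucenetest", "root", "****");
        String storeIdPath = "E:\\workspace2\\Test\\lucene_test\\storeId.txt";
        long lastId = Long.parseLong(IncrementIndex.getStoreId(storeIdPath));
        ResultSet rs = newRowsSince(conn, lastId);
        // ... feed rs into IncrementIndex.indexBuilding(...) as before ...
        IncrementIndex.writeStoreId(storeIdPath, String.valueOf(currentMaxId(conn)));
        conn.close();
    }
}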
3. Index operations
package index;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.mira.lucene.analysis.IK_CAnalyzer;

public class IndexUtils {

    // 0. Build the incremental index
    public static void buildIndex(String indexFile, String storeIdFile) {
        IncrementIndex.buildIndex(indexFile, storeIdFile);
    }

    // 1. Single-field query
    @SuppressWarnings("deprecation")
    public static List<IndexResult> queryByOneKey(IndexSearcher indexSearcher, String field, String key) {
        try {
            Date date1 = new Date();
            QueryParser queryParser = new QueryParser(field, new StandardAnalyzer());
            Query query = queryParser.parse(key);
            Hits hits = indexSearcher.search(query);
            Date date2 = new Date();
            System.out.println("Elapsed: " + (date2.getTime() - date1.getTime()) + "ms");
            List<IndexResult> list = new ArrayList<IndexResult>();
            for (int i = 0; i < hits.length(); i++) {
                list.add(getIndexResult(hits.doc(i)));
            }
            return list;
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    // 2. Multi-condition query (AND semantics).
    // Note: every queried field must be indexed, i.e.
    // doc.add(new Field("pid", rs.getString("pid"), Field.Store.YES, Field.Index.TOKENIZED));
    @SuppressWarnings("deprecation")
    public static List<IndexResult> queryByMultiKeys(IndexSearcher indexSearcher, String[] fields, String[] keys) {
        try {
            BooleanQuery booleanQuery = new BooleanQuery();
            if (keys != null && keys.length > 0) {
                for (int i = 0; i < keys.length; i++) {
                    QueryParser queryParser = new QueryParser(fields[i], new StandardAnalyzer());
                    Query query = queryParser.parse(keys[i]);
                    booleanQuery.add(query, BooleanClause.Occur.MUST); // AND
                }
                Hits hits = indexSearcher.search(booleanQuery);
                List<IndexResult> list = new ArrayList<IndexResult>();
                for (int i = 0; i < hits.length(); i++) {
                    list.add(getIndexResult(hits.doc(i)));
                }
                return list;
            }
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    // 3. Highlighting, implemented here for a single-condition query on "name";
    // it can be adapted to multi-condition queries (see the sketch after this class).
    public static List<IndexResult> highlight(IndexSearcher indexSearcher, String key) {
        try {
            QueryParser queryParser = new QueryParser("name", new StandardAnalyzer());
            Query query = queryParser.parse(key);
            TopDocCollector collector = new TopDocCollector(800);
            indexSearcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(200));
            List<IndexResult> list = new ArrayList<IndexResult>();
            Document doc;
            for (int i = 0; i < hits.length; i++) {
                doc = indexSearcher.doc(hits[i].doc);
                TokenStream tokenStream = new StandardAnalyzer().tokenStream("name", new StringReader(doc.get("name")));
                IndexResult ir = getIndexResult(doc);
                ir.setName(highlighter.getBestFragment(tokenStream, doc.get("name")));
                list.add(ir);
            }
            return list;
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    // 4. Multi-field query
    @SuppressWarnings("deprecation")
    public static List<IndexResult> queryByMultiFileds(IndexSearcher indexSearcher, String[] fields, String key) {
        try {
            MultiFieldQueryParser mfq = new MultiFieldQueryParser(fields, new StandardAnalyzer());
            Query query = mfq.parse(key);
            Hits hits = indexSearcher.search(query);
            List<IndexResult> list = new ArrayList<IndexResult>();
            for (int i = 0; i < hits.length(); i++) {
                list.add(getIndexResult(hits.doc(i)));
            }
            return list;
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    // 5. Delete from the index by id
    public static void deleteIndex(String indexFile, String id) throws CorruptIndexException, IOException {
        IndexReader indexReader = IndexReader.open(indexFile);
        indexReader.deleteDocuments(new Term("id", id));
        indexReader.close();
    }

    // 6. Unigram analysis (one character per token)
    @SuppressWarnings("deprecation")
    public static String Standard_Analyzer(String str) {
        Analyzer analyzer = new StandardAnalyzer();
        Reader r = new StringReader(str);
        StopFilter sf = (StopFilter) analyzer.tokenStream("", r);
        System.out.println("=====StandardAnalyzer====");
        System.out.println("Analysis: by default Chinese is split into single characters (unigrams), not words");
        Token t;
        String results = "";
        try {
            while ((t = sf.next()) != null) {
                System.out.println(t.termText());
                results = results + " " + t.termText();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return results;
    }

    // 7. Dictionary-based analysis
    @SuppressWarnings("deprecation")
    public static String ik_CAnalyzer(String str) {
        Analyzer analyzer = new IK_CAnalyzer();
        Reader r = new StringReader(str);
        TokenStream ts = analyzer.tokenStream("", r);
        System.out.println("=====IK_CAnalyzer====");
        System.out.println("Analysis: dictionary-based segmentation with bidirectional matching");
        Token t;
        String results = "";
        try {
            while ((t = ts.next()) != null) {
                System.out.println(t.termText());
                results = results + " " + t.termText();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return results;
    }

    // Search within previous results (not implemented yet)
    public static void queryFromResults() {
    }

    // Assemble the result bean from a Document
    public static IndexResult getIndexResult(Document doc) {
        IndexResult ir = new IndexResult();
        ir.setId(doc.get("id"));
        ir.setName(doc.get("name"));
        ir.setAddress(doc.get("address"));
        ir.setCitycode(doc.get("citycode"));
        return ir;
    }
}
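As noted in the comment on method 3, the same Highlighter works for multi-condition queries, because QueryScorer accepts any Query, including a BooleanQuery. A hedged sketch of such a method; it would slot into IndexUtils above, which already has all the required imports, and highlightMultiKeys is a name I made up:

// Sketch: highlighting on top of a multi-condition (AND) query,
// reusing the same Lucene 2.4 highlighter classes as IndexUtils.highlight.
public static List<IndexResult> highlightMultiKeys(IndexSearcher indexSearcher,
        String[] fields, String[] keys) throws Exception {
    BooleanQuery booleanQuery = new BooleanQuery();
    for (int i = 0; i < keys.length; i++) {
        QueryParser parser = new QueryParser(fields[i], new StandardAnalyzer());
        booleanQuery.add(parser.parse(keys[i]), BooleanClause.Occur.MUST); // AND semantics
    }

    TopDocCollector collector = new TopDocCollector(800);
    indexSearcher.search(booleanQuery, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    // The scorer is built from the whole BooleanQuery, so every matched term gets highlighted.
    Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter("<font color='red'>", "</font>"),
            new QueryScorer(booleanQuery));
    highlighter.setTextFragmenter(new SimpleFragmenter(200));

    List<IndexResult> list = new ArrayList<IndexResult>();
    for (int i = 0; i < hits.length; i++) {
        Document doc = indexSearcher.doc(hits[i].doc);
        TokenStream tokenStream = new StandardAnalyzer()
                .tokenStream("name", new StringReader(doc.get("name")));
        IndexResult ir = getIndexResult(doc);
        String fragment = highlighter.getBestFragment(tokenStream, doc.get("name"));
        ir.setName(fragment != null ? fragment : doc.get("name")); // fall back if nothing to highlight
        list.add(ir);
    }
    return list;
}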
The query result bean: IndexResult
package index;

public class IndexResult {
    private String id;
    private String name;
    private String address;
    private String citycode;

    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public String getAddress() {
        return address;
    }
    public void setAddress(String address) {
        this.address = address;
    }
    public String getCitycode() {
        return citycode;
    }
    public void setCitycode(String citycode) {
        this.citycode = citycode;
    }
}
4. Test class
package test;

/**
 * $Id$
 * Copyright 2009-2010 Oak Pacific Interactive. All rights reserved.
 */
import index.IndexResult;
import index.IndexUtils;

import java.util.Date;
import java.util.List;

import org.apache.lucene.search.IndexSearcher;

public class Test {

    // where the index files live
    private static String indexFile = "E:\\workspace2\\Test\\lucene_test\\poiIdext";
    // where the last indexed id is stored
    private static String storeIdFile = "E:\\workspace2\\Test\\lucene_test\\storeId.txt";

    public static void main(String[] args) throws Exception {
        // 0. Build the incremental index
        IndexUtils.buildIndex(indexFile, storeIdFile);
        IndexSearcher indexSearcher = new IndexSearcher(indexFile);
        String key = IndexUtils.ik_CAnalyzer("静安中心");

        // 1. Single-field query
        Date date1 = new Date();
        List<IndexResult> list = IndexUtils.queryByOneKey(indexSearcher, "name", key);
        Date date2 = new Date();
        System.out.println("Elapsed: " + (date2.getTime() - date1.getTime()) + "ms\n" + list.size()
                + " hits =======================================single-field query");
        // printResults(list);

        // 2. Multi-condition query
        String[] fields = { "name", "citycode" };
        String[] keys = { IndexUtils.ik_CAnalyzer("静安中心"), "0000" };
        date1 = new Date();
        list = IndexUtils.queryByMultiKeys(indexSearcher, fields, keys);
        date2 = new Date();
        System.out.println("Elapsed: " + (date2.getTime() - date1.getTime()) + "ms\n" + list.size()
                + " hits\n===============================multi-condition query");
        printResults(list);

        // 3. Highlighting (single-field query)
        System.out.println("\n\n");
        date1 = new Date();
        list = IndexUtils.highlight(indexSearcher, key);
        date2 = new Date();
        System.out.println("Elapsed: " + (date2.getTime() - date1.getTime()) + "ms\n" + list.size()
                + " hits\n======================================highlighting");
        // printResults(list);

        // 4. Multi-field query
        date1 = new Date();
        list = IndexUtils.queryByMultiFileds(indexSearcher, fields, key);
        date2 = new Date();
        System.out.println("Elapsed: " + (date2.getTime() - date1.getTime()) + "ms\n" + list.size()
                + " hits\n=====================================multi-field query");
        // printResults(list);

        // 5. Delete a document from the index by id
        IndexUtils.deleteIndex(indexFile, "123");
    }

    // Print the results
    public static void printResults(List<IndexResult> list) {
        if (list != null && list.size() > 0) {
            for (int i = 0; i < list.size(); i++) {
                System.out.println(list.get(i).getId() + "," + list.get(i).getName() + ","
                        + list.get(i).getAddress() + "," + list.get(i).getCitycode() + "--->" + i);
            }
        }
    }
}
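One caveat about the test above: the IndexSearcher opened at the start is a point-in-time view of the index, so it will not see the deletion made in step 5 (nor documents added after it was opened). A small sketch of what I understand to be the usual fix, appended after step 5 in main: close the searcher and open a fresh one once the index has been modified.

// Sketch (after step 5): reopen the searcher so the deletion becomes visible.
indexSearcher.close();
indexSearcher = new IndexSearcher(indexFile);   // fresh point-in-time view of the index
list = IndexUtils.queryByOneKey(indexSearcher, "id", "123");
System.out.println((list == null ? 0 : list.size()) + " hits for the deleted id");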
5. Miscellaneous
Full-text search notes:
As things stand, a search for hello matches "hello world" and "hi hello, how are you", but not "worldhello".
By default, QueryParser does not allow queries that begin with a wildcard (e.g. *ook). Since Lucene 2.1 this can be enabled by calling QueryParser.setAllowLeadingWildcard(true). Note that this is an expensive operation: Lucene has to scan the full list of terms in the index to find the ones matching the pattern. (Translator's note: the efficient way to support such suffix matches is to index the terms in reversed form as well, which Lucene does not do for you.) http://www.codechina.org/faq/show/42/
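For reference, a minimal sketch of that call with the 2.4 jars used in this demo, reusing the name field and the indexSearcher from the Test class (and assuming the surrounding method declares the checked exceptions):

// Sketch: leading-wildcard queries are disabled by default; enabling them makes
// Lucene enumerate every term of the field to find the ones matching the pattern.
QueryParser parser = new QueryParser("name", new StandardAnalyzer());
parser.setAllowLeadingWildcard(true);
Query query = parser.parse("*ook");             // e.g. would match "book", "cook", ...
Hits hits = indexSearcher.search(query);
System.out.println(hits.length() + " hits");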
Searching with space-separated terms is supported: "厕所 26 沈阳" is treated as three terms.
Not supported: "厕所沈阳" as one single term.
http://www.codechina.org/faq/show/63/
From the FAQ above: yes, it can be done. There are two main approaches; the recommended one uses BooleanQuery (a minimal sketch follows).
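The queryByMultiKeys method in IndexUtils is exactly that approach via QueryParser; stripped down to raw term queries it looks roughly like this (field and term values are placeholders):

// Sketch: ANDing two conditions with BooleanQuery; MUST = AND, SHOULD = OR.
BooleanQuery combined = new BooleanQuery();
combined.add(new TermQuery(new Term("citycode", "0000")), BooleanClause.Occur.MUST);
combined.add(new TermQuery(new Term("name", "中心")), BooleanClause.Occur.MUST);
Hits hits = indexSearcher.search(combined);     // indexSearcher as in the Test class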
============
// Create a standard analyzer; StandardAnalyzer does handle Chinese (one character per token)
Analyzer luceneAnalyzer = new StandardAnalyzer();
indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
// This creates a new index writer:
// - the first argument is the directory the index is built in;
// - the second is the analyzer (the standard one here, but you can plug in your own);
// - if the third argument is true, the index directory (c:\index in the original example) is emptied before indexing.
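Related to that third argument: instead of deciding between "create" and "append" by checking whether storeId.txt is empty (as IncrementIndex does), one can, I believe, ask Lucene itself whether an index already exists in the directory. A sketch against the 2.4 API:

// Sketch: pick create-vs-append by inspecting the index directory itself.
boolean createNew = !IndexReader.indexExists(path);          // path = index directory
IndexWriter writer = new IndexWriter(path, new StandardAnalyzer(), createNew);
// createNew == true  -> a fresh index is created (existing files are cleared)
// createNew == false -> new documents are appended to the existing index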
For the poi_data_ugc search, should the index be kept in memory or on disk? (Open question; a sketch of both options follows.)
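Both options are easy to wire up with the 2.4 API, so the honest answer is to measure. A hedged sketch (the Directory classes are from org.apache.lucene.store; the RAM copy only pays off if the poi index comfortably fits in the heap):

// Sketch: the same index opened from disk and, alternatively, copied into memory.
Directory diskDir = FSDirectory.getDirectory("E:\\workspace2\\Test\\lucene_test\\poiIdext");
IndexSearcher diskSearcher = new IndexSearcher(diskDir);     // searches straight off disk

Directory ramDir = new RAMDirectory(diskDir);                // copies the on-disk index into RAM
IndexSearcher ramSearcher = new IndexSearcher(ramDir);       // faster, but heap-bound and
                                                             // rebuilt from disk on every start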
On Lucene usage and optimization:
http://hi.baidu.com/lewutian/blog/item/48a86d03de58b984d43f7c1b.html
Lucene getting-started example (1): indexing text files
http://www.java3z.com/cwbwebhome/article/article5/51021.html