在Lucene4.6中,想要实现搜索结果按照时间倒序的效果:如果两个文档得分相同,那么就按照发布时间倒序排列;否则就按照分数排列。这种效果在Lucene4.6中实现起来极其简单,直接利用search接口的Sort参数即可达成,完全不需要像某些人说的重写Similarity那么麻烦。三两行代码的事情,体现了Keep it simple, stupid(KISS原则)的精髓。
首先来看看测试例子,这个例子中我建立了四个文档,按照内容-发布日期来表示分别是:
2004年光棍节攻略 , 20041111
2005年光棍节攻略 , 20051111
2006年光棍节攻略 , 20061111
游戏攻略 , 20141111
统一使用“光棍节攻略”来搜索它们,用户希望最新的光棍节攻略排在第一。
如果不做排序处理的话,用户体验非常糟糕:
package com.hankcs.test;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queries.CustomScoreQuery;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.Closeable;
import java.io.IOException;

/**
 * Baseline demo: indexes four small articles in memory and runs a plain
 * relevance search, printing each hit as "text , date , score".
 *
 * @author hankcs
 */
public class TestSortByTime {

    /**
     * Builds the in-memory index, searches it for the keyword and prints the hits.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Name of the main (analyzed) text field of every document.
        String fieldName = "text";
        // IK Chinese word segmenter, used for both indexing and query parsing.
        Analyzer analyzer = new IKAnalyzer();
        Directory directory = null;
        IndexWriter iwriter = null;
        IndexReader ireader = null;
        try {
            // ---------------- indexing ----------------
            // The index lives purely in memory.
            directory = new RAMDirectory();
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory, iwConfig);

            // Three articles for 2004..2006 whose text differs only by the year.
            for (int i = 0; i < 3; ++i) {
                int year = 2004 + i;
                addArticle(iwriter, fieldName, year + "年光棍节攻略", year * 10000 + 1111);
            }
            // A distractor document: newest date, but a different topic.
            addArticle(iwriter, fieldName, "游戏攻略", 20141111);

            // Close the writer before opening a reader on the directory.
            iwriter.close();
            iwriter = null; // already closed; nothing left for the finally block to do

            // ---------------- searching ----------------
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);

            String keyword = "光棍节攻略";
            // Parse the keyword into a Query with the same analyzer used for indexing.
            QueryParser qp = new QueryParser(Version.LUCENE_46, fieldName, analyzer);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            // Fetch at most 5 hits using the default relevance ranking.
            TopDocs topDocs = isearcher.search(query, 5);
            System.out.println("命中:" + topDocs.totalHits);

            // Print every returned hit; the search above already caps the count at 5.
            for (ScoreDoc hit : topDocs.scoreDocs) {
                Document targetDoc = isearcher.doc(hit.doc);
                System.out.print(targetDoc.getField(fieldName).stringValue());
                System.out.print(" , " + targetDoc.getField("date").numericValue());
                System.out.println(" , " + hit.score);
            }
        } catch (IOException e) {
            // The original separate CorruptIndexException / LockObtainFailedException
            // handlers were identical to this one, and both are IOException subclasses.
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            // The writer is non-null here only if indexing failed part-way.
            closeAndReport(iwriter);
            closeAndReport(ireader);
            closeAndReport(directory);
        }
    }

    /**
     * Adds one article with a stored, analyzed text field and a stored int "date" field.
     *
     * @param writer    open index writer to add the document to
     * @param fieldName name of the text field
     * @param text      article text
     * @param date      publication date encoded as a yyyyMMdd integer
     * @throws IOException if the writer fails to add the document
     */
    private static void addArticle(IndexWriter writer, String fieldName, String text, int date)
            throws IOException {
        Document doc = new Document();
        doc.add(new TextField(fieldName, text, Field.Store.YES));
        doc.add(new IntField("date", date, Field.Store.YES));
        writer.addDocument(doc);
    }

    /**
     * Closes the resource if it is non-null, printing (not propagating) any failure.
     *
     * @param resource resource to close; may be {@code null}
     */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
输出:
2004年光棍节攻略 , 20041111 , 0.71185887 2005年光棍节攻略 , 20051111 , 0.71185887 2006年光棍节攻略 , 20061111 , 0.71185887 游戏攻略 , 20141111 , 0.049675122
可以看到文档是严格按照分数排序的,如果分数相同,则按照索引顺序(即文档写入的先后顺序)排序,导致三篇同分的光棍节攻略中,最新的2006年那篇反而排在最后。
使用search接口的Sort参数优化搜索结果:
package com.hankcs.test;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queries.CustomScoreQuery;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.Closeable;
import java.io.IOException;

/**
 * Demo of sorting search results by relevance first and, for hits with equal
 * relevance, by the "date" field in descending order (newest first).
 * Each hit is printed as "text , date , score".
 *
 * @author hankcs
 */
public class TestSortByTime {

    /**
     * Builds the in-memory index, runs the sorted search and prints the hits.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Name of the main (analyzed) text field of every document.
        String fieldName = "text";
        // IK Chinese word segmenter, used for both indexing and query parsing.
        Analyzer analyzer = new IKAnalyzer();
        Directory directory = null;
        IndexWriter iwriter = null;
        IndexReader ireader = null;
        try {
            // ---------------- indexing ----------------
            // The index lives purely in memory.
            directory = new RAMDirectory();
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory, iwConfig);

            // Three articles for 2004..2006 whose text differs only by the year.
            for (int i = 0; i < 3; ++i) {
                int year = 2004 + i;
                addArticle(iwriter, fieldName, year + "年光棍节攻略", year * 10000 + 1111);
            }
            // A distractor document: newest date, but a different topic.
            addArticle(iwriter, fieldName, "游戏攻略", 20141111);

            // Close the writer before opening a reader on the directory.
            iwriter.close();
            iwriter = null; // already closed; nothing left for the finally block to do

            // ---------------- searching ----------------
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);

            String keyword = "光棍节攻略";
            // Parse the keyword into a Query with the same analyzer used for indexing.
            QueryParser qp = new QueryParser(Version.LUCENE_46, fieldName, analyzer);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            // Primary key: relevance score. Tie-breaker: "date" as an int, reversed
            // (third argument true) so that newer documents come first.
            // SortField.FIELD_SCORE is the predefined score sort; it replaces the
            // original SortField built with Type.SCORE and the "text" field name.
            Sort sort = new Sort(
                    SortField.FIELD_SCORE,
                    new SortField("date", SortField.Type.INT, true));

            // Fetch at most 5 hits in the order defined by the sort above.
            // NOTE: this overload does not fill in per-hit scores, so the score
            // column below prints as NaN. If the scores are needed, use
            // search(query, null, 5, sort, true, false) (doDocScores = true).
            TopDocs topDocs = isearcher.search(query, 5, sort);
            System.out.println("命中:" + topDocs.totalHits);

            // Print every returned hit; the search above already caps the count at 5.
            for (ScoreDoc hit : topDocs.scoreDocs) {
                Document targetDoc = isearcher.doc(hit.doc);
                System.out.print(targetDoc.getField(fieldName).stringValue());
                System.out.print(" , " + targetDoc.getField("date").numericValue());
                System.out.println(" , " + hit.score);
            }
        } catch (IOException e) {
            // The original separate CorruptIndexException / LockObtainFailedException
            // handlers were identical to this one, and both are IOException subclasses.
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            // The writer is non-null here only if indexing failed part-way.
            closeAndReport(iwriter);
            closeAndReport(ireader);
            closeAndReport(directory);
        }
    }

    /**
     * Adds one article with a stored, analyzed text field and a stored int "date" field.
     *
     * @param writer    open index writer to add the document to
     * @param fieldName name of the text field
     * @param text      article text
     * @param date      publication date encoded as a yyyyMMdd integer
     * @throws IOException if the writer fails to add the document
     */
    private static void addArticle(IndexWriter writer, String fieldName, String text, int date)
            throws IOException {
        Document doc = new Document();
        doc.add(new TextField(fieldName, text, Field.Store.YES));
        doc.add(new IntField("date", date, Field.Store.YES));
        writer.addDocument(doc);
    }

    /**
     * Closes the resource if it is non-null, printing (not propagating) any failure.
     *
     * @param resource resource to close; may be {@code null}
     */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
输出结果:
命中:4 2006年光棍节攻略 , 20061111 , NaN 2005年光棍节攻略 , 20051111 , NaN 2004年光棍节攻略 , 20041111 , NaN 游戏攻略 , 20141111 , NaN
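我们看到“2006年光棍节攻略”因为时间比较新,并且相关性高,就排在了第一。“2005年光棍节攻略”相关度相同,因为时间旧就排在后面一点,而干扰文档“游戏攻略”即使时间最新,因为不相关的原因排在最后面。这种效果正好是我想要的,极大提升了用户体验。另外,输出中得分一列显示为NaN,是因为带Sort参数的search(query, n, sort)默认不会为每条命中填充分数(排序本身仍然是按分数进行的);如果需要同时输出分数,可以改用search(query, null, n, sort, true, false)这个重载,即把doDocScores参数设为true。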