lucene FieldCache 实现分组统计

 转自: http://www.czh123.com/blogitem440.html

 

所谓分组统计,就是类似sql里group by的功能。在solr里,这个功能称为faceting。lucene本身不支持分组统计,不过可以使用fieldCache来实现分组统计功能,而且也有很好的性能。solr根据不同的情况,还提供了其他方法(filterCache和UnInvertedField)来实现,这个以后再说。 
fieldCache是lucene用来排序的缓存。对要用来排序的字段,lucene会从索引中将每篇文档该字段的值都读出来,放到一个大小为maxDoc的数组中。maxDoc是lucene内部文档编号的最大值。有两点需要注意一下:

  • fieldCache中的字段值是从倒排表中读出来的,而不是从索引文件中存储的字段值,所以用来排序的字段必须设为索引字段
  • 用来排序的字段在索引的时候不能拆分(tokenized),因为fieldCache数组中,每个文档只对应一个字段值,拆分的话,cache中只会保存在词典中靠后的值。
fieldCache是lucene中最占用内存的部分,大部分内存溢出的错误都是由它引起的,需要特别注意。

分组统计可以借用fieldCache来高效率地实现。调用lucene进行查询,通过读取倒排表并进行boolean运算,得到一个满足条件的文档的集合。通过每个结果文档号读取fieldCache数组中的值,并按不同的值分别累加数目,即可实现分组统计的功能。其中,如果某个字段对应多值,则在索引的时候不拆分,从fieldCache数组读出后,再进行拆分统计。
好了,说了半天,现在来看看实现代码:Test.java
import java.io.IOException;
import java.util.List;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.HBxx2Similarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class Test {

 String path = "D:\\index";
 Version version = Version.LUCENE_29;

 @SuppressWarnings("deprecation")
 public void search(int pageNO, int pageSize) throws ParseException {
  try {

   Long start1 = System.currentTimeMillis();

   int start = (pageNO - 1) * pageSize;
   int topCount = pageSize * pageNO;
   IndexReader reader = IndexReader.open(FSDirectory
     .getDirectory(path));
   Searcher searcher = new IndexSearcher(reader);

   TopDocsCollector collector = TopScoreDocCollector.create(topCount,
     false);
   // 读取"fenlei"字段值,放到fieldCache中
   final String[] fc = FieldCache.DEFAULT.getStrings(reader, "fenlei");
   // GroupCollector是自定义文档收集器,用于实现分组统计
   GroupCollector groupCollector = new GroupCollector(collector, fc);

   searcher.search(new MatchAllDocsQuery(), groupCollector);

   // GroupField用来保存分组统计的结果
   GroupField gf = groupCollector.getGroupField();
   System.out.println("分组信息");
   List<String> values = gf.getValues();
   for (String value : values) {
    System.out.println(value + "=" + gf.getCountMap().get(value));
   }

   // 搜索结果总数
   int totalHits = collector.getTotalHits();
   System.out.println("总数:" + totalHits);

   System.out.println("分页结果");
   // 获取分页后搜索结果
   ScoreDoc[] scoreDocs = collector.topDocs(start, pageSize).scoreDocs;
   for (int i = 0; i < scoreDocs.length; i++) {
    int docId = scoreDocs[i].doc;
    Document doc = reader.document(docId);
    System.out.println("id:" + doc.get("id") + " fenlei:"
      + doc.get("fenlei") + " title:" + doc.get("title"));
   }

   Long time = System.currentTimeMillis() - start1;
   System.out.println("搜索所用时间为:" + time + "毫秒");

  } catch (IOException e) {
   e.printStackTrace();
  }

 }

 @SuppressWarnings("deprecation")
 public void WriteIndex() throws CorruptIndexException,
   LockObtainFailedException, IOException {

  Long start = System.currentTimeMillis();

  // 分词器
  PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
    new StandardAnalyzer(version));
  analyzer.addAnalyzer("title", new MMAnalyzer());// 中文分词器

  IndexWriter writer = new IndexWriter(FSDirectory.getDirectory(path),
    analyzer, MaxFieldLength.LIMITED);
  writer.setSimilarity(new HBxx2Similarity());
  writer.setMaxBufferedDocs(2048);
  writer.setRAMBufferSizeMB(256);

  int count = 0;
  String title = "中国人民  测试数据";
  String fenlei = "分类";

  // 开始读取数据创建索引
  int max = 1000000;
  int groupMax = 75000;
  for (int i = 0; i < max; i++) {
   if (i % groupMax == 0) {
    count++;
    System.out.println(i);
   }

   Document document = new Document();

   Field idField = new Field("id", Integer.toString(i + 1), Store.YES,
     Index.NOT_ANALYZED);
   Field titleField = new Field("title", title + (i + 1), Store.YES,
     Index.ANALYZED);
   Field fenleiField = new Field("fenlei", fenlei + count, Store.YES,
     Index.NOT_ANALYZED);

   document.add(idField);
   document.add(titleField);
   document.add(fenleiField);

   writer.addDocument(document);
  }
  writer.commit();
  writer.optimize();
  writer.close();

  Long time = System.currentTimeMillis() - start;
  System.out.println("创建索引所用时间为:" + time + "毫秒");
 }

 public static void main(String[] args) throws CorruptIndexException,
   IOException, ParseException {
  Test test = new Test();
  // 建立索引
  // test.WriteIndex();
  // 搜索索引
  int pageNO = 100, pageSize = 20;
  test.search(pageNO, pageSize);
 }
}


GroupField.java


import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Holds the grouping (facet) result for one field: the distinct values that
 * were seen and the number of times each value was added.
 */
public class GroupField {
    /**
     * Field name.
     */
    private String name;

    /**
     * Distinct values in first-seen order; {@link #getValues()} re-sorts this
     * list by descending count.
     */
    private List<String> values = new ArrayList<String>();

    /**
     * Maps each value to the number of times it was added.
     */
    private Map<String, Integer> countMap = new HashMap<String, Integer>();

    public Map<String, Integer> getCountMap() {
        return countMap;
    }

    public void setCountMap(Map<String, Integer> countMap) {
        this.countMap = countMap;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Returns the distinct values ordered by descending count. The backing
     * list is sorted in place and returned directly (not a copy).
     *
     * @return the internal value list, sorted by descending count
     */
    public List<String> getValues() {
        Collections.sort(values, new ValueComparator());
        return values;
    }

    public void setValues(List<String> values) {
        this.values = values;
    }

    /**
     * Records one occurrence of a field value. A multi-valued field is
     * expected as a single space-separated string; each token is counted
     * separately.
     *
     * @param value raw field value; {@code null} and empty strings are ignored
     */
    public void addValue(String value) {
        if (value == null || "".equals(value)) {
            return;
        }
        // Multi-valued fields are split on spaces.
        String[] tokens = value.split(" ");
        for (String token : tokens) {
            // Consecutive or leading spaces yield empty tokens; those are not
            // real values and must not show up as a "" group.
            if (token.length() == 0) {
                continue;
            }
            Integer seen = countMap.get(token);
            if (seen == null) {
                countMap.put(token, 1);
                values.add(token);
            } else {
                countMap.put(token, seen + 1);
            }
        }
    }

    /**
     * Orders values by descending count. A value that has no entry in the
     * count map is treated as having a count of zero.
     */
    class ValueComparator implements Comparator<String> {

        public int compare(String value0, String value1) {
            int count0 = countOf(value0);
            int count1 = countOf(value1);
            if (count0 > count1) {
                return -1;
            } else if (count0 < count1) {
                return 1;
            }
            return 0;
        }

        /** Looks up the count, mapping a missing entry to zero instead of failing on unboxing. */
        private int countOf(String value) {
            Integer count = countMap.get(value);
            return count == null ? 0 : count.intValue();
        }
    }
}

 GroupCollector.java


import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocsCollector;

public  class GroupCollector extends TopDocsCollector {

 Collector collector;
 int docBase;

 private String[] fc; // fieldCache
 private GroupField gf = new GroupField();// 保存分组统计结果

 GroupCollector(Collector topDocsCollector, String[] fieldCache)
   throws IOException {
  super(null);
  collector = topDocsCollector;
  this.fc = fieldCache;
 }

 @Override
 public void collect(int doc) throws IOException {
  collector.collect(doc);
  // 因为doc是每个segment的文档编号,需要加上docBase才是总的文档编号
  int docId = doc + docBase;
  // 添加的GroupField中,由GroupField负责统计每个不同值的数目
  gf.addValue(fc[docId]);
 }

 @Override
 public void setNextReader(IndexReader reader, int docBase)
   throws IOException {
  collector.setNextReader(reader, docBase);
  this.docBase = docBase;
 }

 @Override
 public void setScorer(Scorer scorer) throws IOException {
  collector.setScorer(scorer);
 }

 @Override
 public boolean acceptsDocsOutOfOrder() {
  return collector.acceptsDocsOutOfOrder();
 }

 public void setFc(String[] fc) {
  this.fc = fc;
 }

 public GroupField getGroupField() {
  return gf;
 }
}

你可能感兴趣的:(Lucene,fieldcache)