Lucene Source Code Analysis - 6

Lucene Source Code Analysis: Creating an IndexReader

This chapter begins the analysis of Lucene's query process. First, let's look at a typical piece of query code under Lucene 6:

        String indexPath = "/path/to/index"; // path to the index directory
        String words = "hello world";        // the raw query string
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new SimpleAnalyzer();
        Query query = null;
        try {
            QueryParser qp = new QueryParser("body", analyzer);
            query = qp.parse(words);
        } catch (ParseException e) {
            return null;
        }
        TopDocs results = searcher.search(query, 20);
        ScoreDoc[] hits = results.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document document = searcher.doc(hits[i].doc);
            // use the retrieved document here
        }
        reader.close();

indexPath is the path of the index directory. FSDirectory's open function was analyzed in earlier chapters; it ultimately returns one of MMapDirectory, SimpleFSDirectory and NIOFSDirectory. For the rest of this chapter we assume it is NIOFSDirectory.
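If you want to pin the Directory implementation rather than let FSDirectory.open choose one for the platform, you can construct it directly. A minimal sketch (the path is a placeholder):

    // Force NIOFSDirectory rather than letting FSDirectory.open decide:
    Directory dir = new NIOFSDirectory(Paths.get("/path/to/index"));
    IndexReader reader = DirectoryReader.open(dir);

Back to the example: DirectoryReader's open function is then called to create an IndexReader, as shown below.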
DirectoryReader::open

  public static DirectoryReader open(final Directory directory) throws IOException {
    return StandardDirectoryReader.open(directory, null);
  }
  static DirectoryReader open(final Directory directory, final IndexCommit commit) throws IOException {
    return new SegmentInfos.FindSegmentsFile(directory) {
        ...
    }.run(commit);
  }

DirectoryReader's open function calls StandardDirectoryReader's open function, which in turn calls FindSegmentsFile's run function; the result is in fact a StandardDirectoryReader.
DirectoryReader::open->FindSegmentsFile::run

    public T run() throws IOException {
      return run(null);
    }

    public T run(IndexCommit commit) throws IOException {
      if (commit != null) {
        ...
      }

      long lastGen = -1;
      long gen = -1;
      IOException exc = null;

      for (;;) {
        lastGen = gen;
        // list the directory twice and retry until both listings agree,
        // so a concurrent commit is not observed half-way
        String files[] = directory.listAll();
        String files2[] = directory.listAll();
        Arrays.sort(files);
        Arrays.sort(files2);
        if (!Arrays.equals(files, files2)) {
          continue;
        }
        gen = getLastCommitGeneration(files);

        if (gen == -1) {
          throw new IndexNotFoundException();
        } else if (gen > lastGen) {
          String segmentFileName = IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", gen);
          try {
            T t = doBody(segmentFileName);
            return t;
          } catch (IOException err) {
            exc = err; // remember the failure; rethrown below if gen stops advancing
          }
        } else {
          throw exc;
        }
      }
    }

Suppose the index directory contains the files segments_0, segments_1 and segments.gen. getLastCommitGeneration in the code above then returns 1, i.e. the largest generation number at the end of the file names beginning with "segments", and fileNameFromGeneration returns segments_1. The most important part is the doBody function, which reads the segment and field information from the files into in-memory data structures.
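As an aside, the generation suffix is encoded in radix 36 (Character.MAX_RADIX), which is why later code converts gen with Long.toString(gen, Character.MAX_RADIX). A minimal sketch of recovering the generation from a file name; the helper name parseGeneration is ours, Lucene does this in SegmentInfos.generationFromSegmentsFileName:

    // Hypothetical helper mirroring SegmentInfos.generationFromSegmentsFileName.
    static long parseGeneration(String fileName) {
        if (fileName.equals("segments")) {
            return 0; // the very first commit has no suffix
        }
        // the part after "segments_" is a radix-36 number
        return Long.parseLong(fileName.substring("segments_".length()),
                              Character.MAX_RADIX);
    }

doBody is overridden inside DirectoryReader's open function and is defined as follows.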
DirectoryReader::open->FindSegmentsFile::run->doBody

      protected DirectoryReader doBody(String segmentFileName) throws IOException {
        SegmentInfos sis = SegmentInfos.readCommit(directory, segmentFileName);
        final SegmentReader[] readers = new SegmentReader[sis.size()];
        boolean success = false;
        try {
          for (int i = sis.size()-1; i >= 0; i--) {
            readers[i] = new SegmentReader(sis.info(i), IOContext.READ);
          }
          DirectoryReader reader = new StandardDirectoryReader(directory, readers, null, sis, false, false);
          success = true;

          return reader;
        } finally {
          if (!success) {
            IOUtils.closeWhileHandlingException(readers);
          }
        }
      }

doBody first reads the segment information into a SegmentInfos via SegmentInfos' readCommit function, then creates a SegmentReader for each segment; the SegmentReader constructor reads each segment's field information and stores it in the SegmentReader's member variables. First, SegmentInfos' readCommit function:
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentInfos::readCommit

  public static final SegmentInfos readCommit(Directory directory, String segmentFileName) throws IOException {

    long generation = generationFromSegmentsFileName(segmentFileName);
    try (ChecksumIndexInput input = directory.openChecksumInput(segmentFileName, IOContext.READ)) {
      return readCommit(directory, input, generation);
    }
  }

  public ChecksumIndexInput openChecksumInput(String name, IOContext context) throws IOException {
    return new BufferedChecksumIndexInput(openInput(name, context));
  }

  public IndexInput openInput(String name, IOContext context) throws IOException {
    ensureOpen();
    ensureCanRead(name);
    Path path = getDirectory().resolve(name);
    FileChannel fc = FileChannel.open(path, StandardOpenOption.READ);
    return new NIOFSIndexInput("NIOFSIndexInput(path=\"" + path + "\")", fc, context);
  }

Assuming the segment file name passed in is segments_1, the generationFromSegmentsFileName function above returns 1. readCommit first creates a BufferedChecksumIndexInput via openChecksumInput; this is the file's input stream, and the openInput call inside it creates the underlying NIOFSIndexInput.
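The checksum wrapper is what later lets CodecUtil.checkFooter verify the file's integrity. To make the header/footer plumbing concrete, here is a toy sketch under our own file name and codec name ("demo.dat", "demo"); this is not a real Lucene format:

    // Write and read back a tiny file with the same header/checksum plumbing.
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/demo-index"))) {
        try (IndexOutput out = dir.createOutput("demo.dat", IOContext.DEFAULT)) {
            CodecUtil.writeHeader(out, "demo", 0); // magic + codec name + version
            out.writeVInt(42);                     // payload
            CodecUtil.writeFooter(out);            // checksum footer
        }
        try (ChecksumIndexInput in = dir.openChecksumInput("demo.dat", IOContext.READ)) {
            CodecUtil.checkHeader(in, "demo", 0, 0);
            int value = in.readVInt();             // reads back 42
            CodecUtil.checkFooter(in);             // verifies the checksum
        }
    }

The file contents are then read through this input stream by the second readCommit overload: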
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentInfos::readCommit

  public static final SegmentInfos readCommit(Directory directory, ChecksumIndexInput input, long generation) throws IOException {

    int magic = input.readInt();
    if (magic != CodecUtil.CODEC_MAGIC) {
      throw new IndexFormatTooOldException();
    }
    int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_50, VERSION_CURRENT);
    byte id[] = new byte[StringHelper.ID_LENGTH];
    input.readBytes(id, 0, id.length);
    CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));

    SegmentInfos infos = new SegmentInfos();
    infos.id = id;
    infos.generation = generation;
    infos.lastGeneration = generation;
    if (format >= VERSION_53) {
      infos.luceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
    } else {

    }

    infos.version = input.readLong();
    infos.counter = input.readInt();
    int numSegments = input.readInt();

    if (format >= VERSION_53) {
      if (numSegments > 0) {
        infos.minSegmentLuceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
      } else {

      }
    } else {

    }

    long totalDocs = 0;
    for (int seg = 0; seg < numSegments; seg++) {
      String segName = input.readString();
      final byte segmentID[];
      byte hasID = input.readByte();
      if (hasID == 1) {
        segmentID = new byte[StringHelper.ID_LENGTH];
        input.readBytes(segmentID, 0, segmentID.length);
      } else if (hasID == 0) {

      } else {

      }
      Codec codec = readCodec(input, format < VERSION_53);
      SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, segmentID, IOContext.READ);
      info.setCodec(codec);
      totalDocs += info.maxDoc();
      long delGen = input.readLong();
      int delCount = input.readInt();

      long fieldInfosGen = input.readLong();
      long dvGen = input.readLong();
      SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, delGen, fieldInfosGen, dvGen);
      if (format >= VERSION_51) {
        siPerCommit.setFieldInfosFiles(input.readSetOfStrings());
      } else {
        siPerCommit.setFieldInfosFiles(Collections.unmodifiableSet(input.readStringSet()));
      }
      final Map<Integer,Set<String>> dvUpdateFiles;
      final int numDVFields = input.readInt();
      if (numDVFields == 0) {
        dvUpdateFiles = Collections.emptyMap();
      } else {
        Map<Integer,Set<String>> map = new HashMap<>(numDVFields);
        for (int i = 0; i < numDVFields; i++) {
          if (format >= VERSION_51) {
            map.put(input.readInt(), input.readSetOfStrings());
          } else {
            map.put(input.readInt(), Collections.unmodifiableSet(input.readStringSet()));
          }
        }
        dvUpdateFiles = Collections.unmodifiableMap(map);
      }
      siPerCommit.setDocValuesUpdatesFiles(dvUpdateFiles);
      infos.add(siPerCommit);

      Version segmentVersion = info.getVersion();
      if (format < VERSION_53) {
        if (infos.minSegmentLuceneVersion == null || segmentVersion.onOrAfter(infos.minSegmentLuceneVersion) == false) {
          infos.minSegmentLuceneVersion = segmentVersion;
        }
      }
    }

    if (format >= VERSION_51) {
      infos.userData = input.readMapOfStrings();
    } else {
      infos.userData = Collections.unmodifiableMap(input.readStringStringMap());
    }
    CodecUtil.checkFooter(input);
    return infos;
  }

readCommit is rather long. In summary, it reads and sets the commit-wide information: id, generation, lastGeneration, luceneVersion, version, counter, minSegmentLuceneVersion, userData and so on.
For each segment it also reads or sets the segment name, segment ID, number of deleted documents, deletions generation, field-infos generation, doc-values generation, the set of field-info file names and the set of doc-values update file names, finally wrapping everything into a SegmentInfos and returning it.
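To inspect the result of this parsing without building a full IndexReader, SegmentInfos.readLatestCommit can be used; it runs exactly the FindSegmentsFile/readCommit machinery described above. A small sketch (the index path is a placeholder):

    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"))) {
        SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
        System.out.println("generation: " + infos.getGeneration());
        for (SegmentCommitInfo sci : infos) {
            System.out.println(sci.info.name + " maxDoc=" + sci.info.maxDoc()
                    + " delCount=" + sci.getDelCount());
        }
    }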
For each segment, the segmentInfoFormat function returns a Lucene50SegmentInfoFormat, whose read function reads the per-segment information into a SegmentInfo, as follows:
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentInfos::readCommit->Lucene50SegmentInfoFormat::read

  public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException {
    final String fileName = IndexFileNames.segmentFileName(segment, "", Lucene50SegmentInfoFormat.SI_EXTENSION);
    try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
      Throwable priorE = null;
      SegmentInfo si = null;
      try {
        int format = CodecUtil.checkIndexHeader(input, Lucene50SegmentInfoFormat.CODEC_NAME,
                                          Lucene50SegmentInfoFormat.VERSION_START,
                                          Lucene50SegmentInfoFormat.VERSION_CURRENT,
                                          segmentID, "");
        final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());

        final int docCount = input.readInt();
        final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;

        final Map<String,String> diagnostics;
        final Set<String> files;
        final Map<String,String> attributes;

        if (format >= VERSION_SAFE_MAPS) {
          diagnostics = input.readMapOfStrings();
          files = input.readSetOfStrings();
          attributes = input.readMapOfStrings();
        } else {
          diagnostics = Collections.unmodifiableMap(input.readStringStringMap());
          files = Collections.unmodifiableSet(input.readStringSet());
          attributes = Collections.unmodifiableMap(input.readStringStringMap());
        }

        si = new SegmentInfo(dir, version, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes);
        si.setFiles(files);
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
      return si;
    }
  }

This read function opens the .si file, reads the version, docCount, isCompoundFile, diagnostics, attributes and files information, then creates a SegmentInfo wrapping these values and returns it.

Back in FindSegmentsFile's doBody: all segment information from the file has been wrapped into a SegmentInfos by readCommit, and a SegmentReader is now created for each segment; its constructor reads the field information.
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentReader::SegmentReader

  public SegmentReader(SegmentCommitInfo si, IOContext context) throws IOException {
    this.si = si;
    core = new SegmentCoreReaders(si.info.dir, si, context);
    segDocValues = new SegmentDocValues();

    boolean success = false;
    final Codec codec = si.info.getCodec();
    try {
      if (si.hasDeletions()) {
        liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE);
      } else {
        liveDocs = null;
      }
      numDocs = si.info.maxDoc() - si.getDelCount();
      fieldInfos = initFieldInfos();
      docValuesProducer = initDocValuesProducer();

      success = true;
    } finally {

    }
  }

si.info.dir is the directory containing the index files. First, the SegmentCoreReaders constructor, which reads the field information:
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentReader::SegmentReader->SegmentCoreReaders::SegmentCoreReaders

  SegmentCoreReaders(Directory dir, SegmentCommitInfo si, IOContext context) throws IOException {

    final Codec codec = si.info.getCodec();
    final Directory cfsDir;
    boolean success = false;

    try {
      if (si.info.getUseCompoundFile()) {
        cfsDir = cfsReader = codec.compoundFormat().getCompoundReader(dir, si.info, context);
      } else {
        cfsReader = null;
        cfsDir = dir;
      }

      coreFieldInfos = codec.fieldInfosFormat().read(cfsDir, si.info, "", context);

      final SegmentReadState segmentReadState = new SegmentReadState(cfsDir, si.info, coreFieldInfos, context);
      final PostingsFormat format = codec.postingsFormat();
      fields = format.fieldsProducer(segmentReadState);

      if (coreFieldInfos.hasNorms()) {
        normsProducer = codec.normsFormat().normsProducer(segmentReadState);
        assert normsProducer != null;
      } else {
        normsProducer = null;
      }

      fieldsReaderOrig = si.info.getCodec().storedFieldsFormat().fieldsReader(cfsDir, si.info, coreFieldInfos, context);

      if (coreFieldInfos.hasVectors()) {
        termVectorsReaderOrig = si.info.getCodec().termVectorsFormat().vectorsReader(cfsDir, si.info, coreFieldInfos, context);
      } else {
        termVectorsReaderOrig = null;
      }

      if (coreFieldInfos.hasPointValues()) {
        pointsReader = codec.pointsFormat().fieldsReader(segmentReadState);
      } else {
        pointsReader = null;
      }
      success = true;
    } finally {

    }
  }

getUseCompoundFile indicates whether the segment's files are packed into compound .cfs and .cfe files. If so, the compoundFormat function returns a Lucene50CompoundFormat and its getCompoundReader function is called:
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentReader::SegmentReader->SegmentCoreReaders::SegmentCoreReaders->Lucene50CompoundFormat::getCompoundReader

  public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
    return new Lucene50CompoundReader(dir, si, context);
  }

  public Lucene50CompoundReader(Directory directory, SegmentInfo si, IOContext context) throws IOException {
    this.directory = directory;
    this.segmentName = si.name;
    String dataFileName = IndexFileNames.segmentFileName(segmentName, "", Lucene50CompoundFormat.DATA_EXTENSION);
    String entriesFileName = IndexFileNames.segmentFileName(segmentName, "", Lucene50CompoundFormat.ENTRIES_EXTENSION);
    this.entries = readEntries(si.getId(), directory, entriesFileName);
    boolean success = false;

    long expectedLength = CodecUtil.indexHeaderLength(Lucene50CompoundFormat.DATA_CODEC, "");
    for(Map.Entry<String,FileEntry> ent : entries.entrySet()) {
      expectedLength += ent.getValue().length;
    }
    expectedLength += CodecUtil.footerLength(); 

    handle = directory.openInput(dataFileName, context);
    try {
      CodecUtil.checkIndexHeader(handle, Lucene50CompoundFormat.DATA_CODEC, version, version, si.getId(), "");
      CodecUtil.retrieveChecksum(handle);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(handle);
      }
    }
  }

getCompoundReader creates a Lucene50CompoundReader. Its constructor opens the .cfs and .cfe files and reads the list of files contained inside via readEntries, storing them in entries.
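Compound files exist to keep the number of open file handles small: all of a segment's per-extension files are packed into a single .cfs data file, with the .cfe entry table recording each inner file's offset and length. Whether a segment is packed this way is decided on the write side; a sketch using the indexing API:

    // Compound files are controlled at index time:
    IndexWriterConfig iwc = new IndexWriterConfig(new SimpleAnalyzer());
    iwc.setUseCompoundFile(false); // write plain per-extension files instead of .cfs/.cfe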

Back to the SegmentCoreReaders constructor. fieldInfosFormat returns a Lucene60FieldInfosFormat, whose read function reads the field information:
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentReader::SegmentReader->SegmentCoreReaders::SegmentCoreReaders->Lucene60FieldInfosFormat::read

  public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException {
    final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION);
    try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) {
      Throwable priorE = null;
      FieldInfo infos[] = null;
      try {
        CodecUtil.checkIndexHeader(input,
                                   Lucene60FieldInfosFormat.CODEC_NAME, 
                                   Lucene60FieldInfosFormat.FORMAT_START, 
                                   Lucene60FieldInfosFormat.FORMAT_CURRENT,
                                   segmentInfo.getId(), segmentSuffix);

        final int size = input.readVInt();
        infos = new FieldInfo[size];

        Map<String,String> lastAttributes = Collections.emptyMap();

        for (int i = 0; i < size; i++) {
          String name = input.readString();
          final int fieldNumber = input.readVInt();
          byte bits = input.readByte();
          boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
          boolean omitNorms = (bits & OMIT_NORMS) != 0;
          boolean storePayloads = (bits & STORE_PAYLOADS) != 0;

          final IndexOptions indexOptions = getIndexOptions(input, input.readByte());
          final DocValuesType docValuesType = getDocValuesType(input, input.readByte());
          final long dvGen = input.readLong();
          Map<String,String> attributes = input.readMapOfStrings();

          if (attributes.equals(lastAttributes)) {
            attributes = lastAttributes;
          }
          lastAttributes = attributes;
          int pointDimensionCount = input.readVInt();
          int pointNumBytes;
          if (pointDimensionCount != 0) {
            pointNumBytes = input.readVInt();
          } else {
            pointNumBytes = 0;
          }

          try {
            infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, 
                                     indexOptions, docValuesType, dvGen, attributes,
                                     pointDimensionCount, pointNumBytes);
            infos[i].checkConsistency();
          } catch (IllegalStateException e) {

          }
        }
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
      return new FieldInfos(infos);
    }
  }

This read function opens the .fnm file and reads the per-field metadata. For each field it reads: name (the field name), fieldNumber (the field's internal number), storeTermVector (whether term vectors are stored), omitNorms (whether norms are omitted), storePayloads (whether payloads are stored), indexOptions (how the field is indexed), docValuesType (the doc-values type), the doc-values generation dvGen, attributes, pointDimensionCount and pointNumBytes; each set is wrapped into a FieldInfo, and all FieldInfo instances are finally wrapped into a FieldInfos.
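At search time these FieldInfos are exposed per segment through LeafReader.getFieldInfos. A quick sketch of dumping them, using the reader from the opening example:

    for (LeafReaderContext ctx : reader.leaves()) {
        for (FieldInfo fi : ctx.reader().getFieldInfos()) {
            System.out.println(fi.name
                    + " indexOptions=" + fi.getIndexOptions()
                    + " docValues=" + fi.getDocValuesType());
        }
    }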

Back to the SegmentCoreReaders constructor. The following postingsFormat function returns a PerFieldPostingsFormat, whose fieldsProducer function ultimately sets fields to a FieldsReader.
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentReader::SegmentReader->SegmentCoreReaders::SegmentCoreReaders->PerFieldPostingsFormat::fieldsProducer

  public final FieldsProducer fieldsProducer(SegmentReadState state)
      throws IOException {
    return new FieldsReader(state);
  }

The normsFormat function returns a Lucene53NormsFormat, whose normsProducer function returns a Lucene53NormsProducer, assigned to normsProducer.

  public NormsProducer normsProducer(SegmentReadState state) throws IOException {
    return new Lucene53NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

Continuing in the same way: fieldsReaderOrig is finally assigned a CompressingStoredFieldsReader, termVectorsReaderOrig a CompressingTermVectorsReader, and pointsReader a Lucene60PointsReader.

Back to the SegmentReader constructor. All segment and field information has now been read. Next, if the segment carries deletions, the liveDocsFormat function returns a Lucene50LiveDocsFormat and its readLiveDocs function is called:
DirectoryReader::open->FindSegmentsFile::run->doBody->SegmentReader::SegmentReader->Lucene50LiveDocsFormat::readLiveDocs

  public Bits readLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) throws IOException {
    long gen = info.getDelGen();
    String name = IndexFileNames.fileNameFromGeneration(info.info.name, EXTENSION, gen);
    final int length = info.info.maxDoc();
    try (ChecksumIndexInput input = dir.openChecksumInput(name, context)) {
      Throwable priorE = null;
      try {
        CodecUtil.checkIndexHeader(input, CODEC_NAME, VERSION_START, VERSION_CURRENT, 
                                     info.info.getId(), Long.toString(gen, Character.MAX_RADIX));
        long data[] = new long[FixedBitSet.bits2words(length)];
        for (int i = 0; i < data.length; i++) {
          data[i] = input.readLong();
        }
        FixedBitSet fbs = new FixedBitSet(data, length);
        return fbs;
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
    }
  }

readLiveDocs opens the .liv file, creates an input stream over it, reads the bit data and builds a FixedBitSet marking which documents are live and which have been deleted.
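At search time this bitset is exposed through LeafReader.getLiveDocs; a null Bits means the segment has no deletions. A quick sketch, using the reader from the opening example:

    for (LeafReaderContext ctx : reader.leaves()) {
        Bits liveDocs = ctx.reader().getLiveDocs();
        for (int docId = 0; docId < ctx.reader().maxDoc(); docId++) {
            if (liveDocs == null || liveDocs.get(docId)) {
                // docId is live (not deleted) in this segment
            }
        }
    }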

Back to the SegmentReader constructor. The following initFieldInfos function assigns SegmentCoreReaders' coreFieldInfos to fieldInfos; if the segment has field-info updates, the field infos are re-read. The docValuesProducer function ultimately returns a FieldsReader.

Returning once more to FindSegmentsFile's doBody: a StandardDirectoryReader is finally created and returned. StandardDirectoryReader's own constructor is fairly simple; what deserves attention is its parent class CompositeReader, whose getContext function comes up next.

Back to the example at the top: an IndexSearcher and a QueryParser are created next. Neither constructor contains anything essential, so we will not walk through them here.
What is worth noting is that IndexSearcher's constructor calls StandardDirectoryReader's getContext function, which in turn calls the leaves function. First the getContext function, defined in StandardDirectoryReader's parent class CompositeReader:
StandardDirectoryReader::getContext

  public final CompositeReaderContext getContext() {
    ensureOpen();
    if (readerContext == null) {
      readerContext = CompositeReaderContext.create(this);
    }
    return readerContext;
  }

ensureOpen makes sure the IndexReader has not already been closed. The create function then builds the CompositeReaderContext:
CompositeReaderContext::create

    static CompositeReaderContext create(CompositeReader reader) {
      return new Builder(reader).build();
    }

    public CompositeReaderContext build() {
      return (CompositeReaderContext) build(null, reader, 0, 0);
    }

    private IndexReaderContext build(CompositeReaderContext parent, IndexReader reader, int ord, int docBase) {
      if (reader instanceof LeafReader) {
        final LeafReader ar = (LeafReader) reader;
        final LeafReaderContext atomic = new LeafReaderContext(parent, ar, ord, docBase, leaves.size(), leafDocBase);
        leaves.add(atomic);
        leafDocBase += reader.maxDoc();
        return atomic;
      } else {
        final CompositeReader cr = (CompositeReader) reader;
        final List<? extends IndexReader> sequentialSubReaders = cr.getSequentialSubReaders();
        final List<IndexReaderContext> children = Arrays.asList(new IndexReaderContext[sequentialSubReaders.size()]);
        final CompositeReaderContext newParent;
        if (parent == null) {
          newParent = new CompositeReaderContext(cr, children, leaves);
        } else {
          newParent = new CompositeReaderContext(parent, cr, ord, docBase, children);
        }
        int newDocBase = 0;
        for (int i = 0, c = sequentialSubReaders.size(); i < c; i++) {
          final IndexReader r = sequentialSubReaders.get(i);
          children.set(i, build(newParent, r, i, newDocBase));
          newDocBase += r.maxDoc();
        }
        assert newDocBase == cr.maxDoc();
        return newParent;
      }
    }

First, getSequentialSubReaders returns exactly the list of SegmentReaders created for each segment in FindSegmentsFile's doBody. A CompositeReaderContext is then created, and build is invoked recursively for each SegmentReader, with the results set into children. Since SegmentReader extends LeafReader, each recursive build call wraps the SegmentReader in a LeafReaderContext and adds it to the leaves list.

The leaves function therefore finally returns the list of LeafReaderContexts wrapping the SegmentReaders.
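A quick sketch of walking these leaves with the reader from the opening example; ord is the leaf's position in the list and docBase converts leaf-local doc ids back to global ones:

    for (LeafReaderContext leaf : reader.leaves()) {
        System.out.println("ord=" + leaf.ord
                + " docBase=" + leaf.docBase
                + " reader=" + leaf.reader());
    }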

The next chapter begins the analysis of QueryParser's parse function.
