lucene.net索引文件存储简析

在lucene.net中,典型的索引文件操作代码如下:
// Typical indexing sequence: open a writer, add one document, optimize, close.
// The verbatim string (@"...") keeps the backslash literal; "c:\index" would be
// rejected because \i is not a valid escape sequence.
// The third argument (create = true) asks for the index to be written from scratch.
IndexWriter writer = new IndexWriter(@"c:\index", new StandardAnalyzer(), true);
try
{
   // A Document is one indexed record made up of named fields.
   Document doc = new Document();
   doc.Add(Field.Keyword("name", "name name"));
   doc.Add(Field.Text("title", "title title"));
   doc.Add(Field.Text("content", "content content"));
   writer.AddDocument(doc);

   // Optimize only after the document was added successfully; keeping it out of
   // the finally block guarantees that a failing Optimize cannot skip Close.
   writer.Optimize();
}
finally
{
   // Always close the writer, even when indexing fails.
   writer.Close();
}

在上面的代码中:
IndexWriter专门用于索引文件的写入操作;
StandardAnalyzer是一个分析器,用于对要索引的内容进行切分处理;
Document表示一条被索引的记录;

下面简单分析一下索引文件的创建过程

01. 创建IndexWriter

IndexWriter有几个重载的构造函数,它们都调用私有构造函数
// Private constructor that the overloaded public IndexWriter constructors call.
//   d        - the Directory that stores the index
//   a        - the Analyzer used to tokenize indexed content
//   create   - passed on to AnonymousClassWith, which writes a new segments file
//              when true and reads the existing one when false
//   closeDir - not used in the part of the body shown here
// NOTE(review): the code that assigns the fields (e.g. `directory`) is elided
// in this excerpt — presumably `directory` is set from `d`; confirm in the source.
private IndexWriter(Directory d, Analyzer a, bool create, bool closeDir) {
// Initialization and lock handling omitted...
   lock (directory) {
   // in- & inter-process sync
      // Runs the segments read/write while holding the commit lock obtained from the directory.
      new AnonymousClassWith(create, this, directory.MakeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
   }
}
参数Directory是Lucene.net内实现的一个存储结构,它有基于文件系统的FSDirectory和基于内存的RAMDirectory两个版本;
Analyzer为分析器,用于索引内容的切分,StandardAnalyzer是Lucene.net内的一个标准的分析器实现,支持中英文,不过中文是按单字切分的;
create指定是否创建索引目录,如果要在同一文件夹内进行增量索引,create应设置为false;

AnonymousClassWith是一个辅助类,用于完成索引目录的初始化操作,它继承自Lock.With,其Run方法如下:
// class Lock.With (lock.cs)
// Obtains the lock (waiting up to lockWaitTimeout), runs DoBody() and returns
// its result. The lock is released afterwards, but only if Obtain reported success.
public virtual Object Run()
{
    bool acquired = false;
    try
    {
        acquired = lock_Renamed.Obtain(lockWaitTimeout);
        return DoBody();
    }
    finally
    {
        // Release even when DoBody throws, so the lock is never left behind.
        if (acquired)
        {
            lock_Renamed.Release();
        }
    }
}
先取得一个操作锁对象,然后进行具体的操作,避免多个线程或进程同时进行同一项操作。
关于lucene的锁操作在后续文章中介绍。

// class IndexWriter.AnonymousClassWith
// Work performed while the commit lock is held: either writes a fresh segments
// file (create == true) or loads the existing one (create == false).
// Always returns null.
public override Object DoBody()
{
    IndexWriter writer = Enclosing_Instance;
    if (!create)
    {
        writer.segmentInfos.Read(writer.directory);
    }
    else
    {
        writer.segmentInfos.Write(writer.directory);
    }
    return null;
}
根据create的值决定是新建索引目录(Write)还是使用现有索引目录(Read);
segmentInfos用于记录索引目录下的索引文件信息.

// class SegmentInfos
// Serializes this segment list into the "segments" file of the given directory.
// Layout: FORMAT (int), version (long), counter (int), entry count (int),
// then one (name, docCount) pair per segment.
// The data goes to "segments.new" first and is renamed to "segments" only after
// the stream has been written and closed without an exception.
public void Write(Directory directory)
{
    OutputStream segmentsOut = directory.CreateFile("segments.new");
    try
    {
        segmentsOut.WriteInt(FORMAT);

        // Every write changes the index, so the version is bumped before it is stored.
        version++;
        segmentsOut.WriteLong(version);

        segmentsOut.WriteInt(counter);
        segmentsOut.WriteInt(Count);

        for (int index = 0; index < Count; index++)
        {
            SegmentInfo segment = Info(index);
            segmentsOut.WriteString(segment.name);
            segmentsOut.WriteInt(segment.docCount);
        }
    }
    finally
    {
        segmentsOut.Close();
    }

    directory.RenameFile("segments.new", "segments");
}
创建一个索引信息文件,lucene.net实现了一套与平台无关的存储机制,采用按字节进行读取和写入,并定义了以下几种类型:
Int:    整型        四字节
Long:   长整型      八字节
VInt:   变长整型    不定
VLong:  变长长整型  不定
String: 字符串      不定
OutputStream为lucene.net内实现的输出流类
从上面的代码得到segments文件的格式如下:
Format + Version + Counter + segment总数 + (segment名称1 + doc总数1) + (segment名称2 + doc总数2) + (...) + (segment名称n + doc总数n)

// class SegmentInfos
// Loads the segment list from the "segments" file of the given directory and
// adds one SegmentInfo per stored entry.
//
// The reads mirror what Write emits: FORMAT (int), version (long), counter (int),
// entry count (int), then (name, docCount) pairs. Without consuming version and
// counter first, the entry count would be taken from the middle of the version field.
//
// NOTE(review): the format handling below assumes FORMAT is a negative constant
// (as in Lucene.Net 1.4), so that a non-negative first int identifies an old-style
// file whose first value is the counter — confirm against SegmentInfos.FORMAT.
//
// Throws System.IO.IOException when the file declares a format this code does not know.
public void Read(Directory directory) {
    InputStream input = directory.OpenFile("segments");
    try {
        int format = input.ReadInt();
        if (format < 0) {
            // File carries explicit format information: make sure we understand it.
            if (format < FORMAT)
                throw new System.IO.IOException("Unknown format version: " + format);
            version = input.ReadLong(); // read version
            counter = input.ReadInt();  // read counter
        }
        else {
            // Old format without explicit format information: the first int is the counter.
            counter = format;
        }

        for (int i = input.ReadInt(); i > 0; i--) {
            // read segmentInfos
            SegmentInfo si = new SegmentInfo(input.ReadString(), input.ReadInt(), directory);
            Add(si);
        }

        if (format >= 0) {
            // In the old format the version number, if present, trails the entries.
            if (input.GetFilePointer() >= input.Length())
                version = 0; // old file format without version number
            else
                version = input.ReadLong(); // read version
        }
    }
    finally {
        input.Close();
    }
}
读取现有索引文件信息。InputStream为lucene.net内实现的输入流类.

02. 添加文档(AddDocument)

// class IndexWriter
// Indexes a single document with the given analyzer.
// The document is written as its own one-document segment into ramDirectory;
// the segment is then registered in segmentInfos and MaybeMergeSegments() is
// invoked, both under the writer's monitor.
public virtual void AddDocument(Document doc, Analyzer analyzer)
{
    DocumentWriter documentWriter = new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
    string newSegment = NewSegmentName();
    documentWriter.AddDocument(newSegment, doc);

    lock (this)
    {
        SegmentInfo singleDocSegment = new SegmentInfo(newSegment, 1, ramDirectory);
        segmentInfos.Add(singleDocSegment);
        MaybeMergeSegments();
    }
}
先构造一个DocumentWriter, 通过NewSegmentName取得一个新的(不重复的)segment名称,
然后调用DocumentWriter.AddDocument对文档进行处理,并把内容写入到segment文件中,
最后调用MaybeMergeSegments检查是否合并segment。

// class DocumentWriter
// Writes one document as the segment named `segment`.
// Steps, in order:
//   1. collect the document's field names and write them to "<segment>.fnm";
//   2. write the field values through a FieldsWriter;
//   3. reset the per-document state (postingTable, fieldLengths, fieldPositions,
//      fieldBoosts) and invert the document into postingTable;
//   4. sort the postings and write them out;
//   5. write the norms of the indexed fields.
// The order matters: later steps read the state built by the earlier ones.
public void AddDocument(System.String segment, Document doc) {
// write Field names
fieldInfos = new FieldInfos();
fieldInfos.Add(doc);
fieldInfos.Write(directory, segment + ".fnm");
   
// write Field values
FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
try {
   fieldsWriter.AddDocument(doc);
}
finally {
   // close the writer even when storing the field values fails
   fieldsWriter.Close();
}
   
// invert doc into postingTable
postingTable.Clear(); // clear postingTable
// the three arrays below are indexed by field number (one slot per entry in fieldInfos)
fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
   
fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
    // every field starts from the document boost; InvertDocument multiplies in the field boosts
    float boost = doc.GetBoost();
    for (int i = 0; i < fieldBoosts.Length; i++) {
        fieldBoosts[i] = boost;
    }
   
InvertDocument(doc);
   
// sort postingTable into an array
Posting[] postings = SortPostingTable();
   
// write postings
WritePostings(postings, segment);
   
// write norms of indexed fields
WriteNorms(doc, segment);
}

1. 将字段信息写入.fnm文件中;
// class FieldInfos
// Writes the field table to the given stream: the number of fields (VInt),
// then for each field its name (String) followed by one flag byte.
// Flag bits: 0x1 = field is indexed, 0x2 = field stores a term vector.
public void Write(OutputStream output)
{
    output.WriteVInt(Size());

    for (int number = 0; number < Size(); number++)
    {
        FieldInfo info = FieldInfo(number);

        byte flags = 0;
        if (info.isIndexed)
        {
            flags |= 0x1;
        }
        if (info.storeTermVector)
        {
            flags |= 0x2;
        }

        output.WriteString(info.name);
        output.WriteByte(flags);
    }
}
由上面的代码得到.fnm文件的格式如下:
字段总数 + (字段名1 + 字段标志1) + (字段名2 + 字段标志2) + (...) + (字段名n + 字段标志n)

2. 存储字段内容
// class FieldsWriter
// Appends the stored fields of one document.
// indexStream receives the current offset of fieldsStream, i.e. where this
// document's data begins. fieldsStream then receives the number of stored
// fields (VInt) and, per stored field: its field number (VInt), a flag byte
// (bit 0 set when the field is tokenized) and its string value.
internal void AddDocument(Document doc)
{
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    // First pass: count the stored fields, because the count precedes the entries.
    int numStored = 0;
    foreach (Field candidate in doc.Fields())
    {
        if (candidate.IsStored())
        {
            numStored++;
        }
    }
    fieldsStream.WriteVInt(numStored);

    // Second pass: emit one entry per stored field.
    foreach (Field current in doc.Fields())
    {
        if (!current.IsStored())
        {
            continue;
        }

        fieldsStream.WriteVInt(fieldInfos.FieldNumber(current.Name()));

        byte flags = current.IsTokenized() ? (byte) 1 : (byte) 0;
        fieldsStream.WriteByte(flags);

        fieldsStream.WriteString(current.StringValue());
    }
}
根据字段的IsStored()决定字段内容是否要存储,从上面的代码得到.fdt的格式如下:
要存储的字段总数 + (字段在.fnm中的序号1 + 字段标识1 + 字段内容1) + (...) + (字段在.fnm中的序号n + 字段标识n + 字段内容n)

3. 倒排文档InvertDocument
// class DocumentWriter
// Inverts the document: every indexed field contributes (term, position)
// entries to postingTable through AddPosition.
//  - An un-tokenized field is added as one single term holding its whole string value.
//  - A tokenized field is run through the analyzer and every token becomes a term.
// Afterwards the per-field length, position and boost (indexed by field number)
// are updated. Fields that are not indexed are skipped entirely.
// Throws System.ArgumentException when a tokenized field has neither a Reader
// nor a String value.
private void InvertDocument(Document doc) {
    foreach(Field field in doc.Fields()) {
   System.String fieldName = field.Name();
   int fieldNumber = fieldInfos.FieldNumber(fieldName);
    
   // continue from the values saved for this field number, so several fields
   // with the same name keep accumulating
   int length = fieldLengths[fieldNumber]; // length of Field
   int position = fieldPositions[fieldNumber]; // position in Field
    
   if (field.IsIndexed()) {
    if (!field.IsTokenized()) {
     // un-tokenized Field
     AddPosition(fieldName, field.StringValue(), position++);
     length++;
    }
    else {
     System.IO.TextReader reader; // find or make Reader
     if (field.ReaderValue() != null)
      reader = field.ReaderValue();
     else if (field.StringValue() != null)
      reader = new System.IO.StringReader(field.StringValue());
     else
      throw new System.ArgumentException("Field must have either String or Reader value");
      
     // Tokenize Field and add to postingTable
     TokenStream stream = analyzer.TokenStream(fieldName, reader);
     try {
      for (Token t = stream.Next(); t != null; t = stream.Next()) {
       // an increment of 1 adds nothing here; the post-increment below advances by one
       position += (t.GetPositionIncrement() - 1);
       AddPosition(fieldName, t.TermText(), position++);
       // stop tokenizing once the count exceeds maxFieldLength
       if (++length > maxFieldLength)
        break;
      }
     }
     finally {
      stream.Close();
     }
    }
     
    fieldLengths[fieldNumber] = length; // save Field length
    fieldPositions[fieldNumber] = position; // save Field position
    // combine this field's boost with the value already stored for the field number
    fieldBoosts[fieldNumber] *= field.GetBoost();
   }
}
}
通过analyzer(分析器)对字段内容进行切分,并保存Term信息,关于analyzer在后续文章中进行介绍。
fieldLengths 存储各字段已处理的term总数;
fieldPositions 保存各字段下一个term将使用的位置(即最后一个被处理term的位置加1);
fieldBoosts 保存各字段的boost(权重)值:初始为文档的boost,再依次乘以该字段每个Field的boost。

Posting类(包含term, 位置和出现频率).
// class Posting (DocumentWriter.cs)
// Information about one term within a single document: the term itself, how
// often it has been seen, and the positions at which it occurs.
sealed class Posting
{
    internal Term term;       // the term
    internal int freq;        // its frequency in the document
    internal int[] positions; // positions it occurs at; only the first `freq` slots are in use

    // Creates the posting for the first occurrence of `t`, found at `position`.
    internal Posting(Term t, int position)
    {
        term = t;
        freq = 1;
        positions = new int[] { position };
    }
}

保存Posting信息
// class DocumentWriter
// Records one occurrence of the term (field, text) at `position`.
// termBuffer is pointed at (field, text) and used only as the lookup key; when
// the term has not been seen yet, a separate Term instance is created and
// stored together with a new Posting.
private void AddPosition(System.String field, System.String text, int position)
{
    termBuffer.Set(field, text);
    Posting existing = (Posting) postingTable[termBuffer];

    if (existing == null)
    {
        // First occurrence of this term in the document.
        Term newTerm = new Term(field, text, false);
        postingTable[newTerm] = new Posting(newTerm, position);
        return;
    }

    // Term seen before: append the position, doubling the array when it is full.
    int count = existing.freq;
    if (existing.positions.Length == count)
    {
        int[] grown = new int[count * 2];
        System.Array.Copy(existing.positions, grown, count);
        existing.positions = grown;
    }

    existing.positions[count] = position;
    existing.freq = count + 1;
}
先查找term是否存在,如存在则增加词频freq, 不存在则创建一个Posting。
postingTable是一个Hashtable,用于存储Posting信息,

4. 对Posting进行排序
// class DocumentWriter
// Copies every Posting held in postingTable into an array, sorts the array
// with QuickSort and returns it.
private Posting[] SortPostingTable()
{
    Posting[] sorted = new Posting[postingTable.Count];

    int next = 0;
    foreach (Posting posting in postingTable.Values)
    {
        sorted[next] = posting;
        next++;
    }

    QuickSort(sorted, 0, sorted.Length - 1);
    return sorted;
}
用QuickSort(快速排序)对Posting进行排序,通过比较Term进行排序, QuickSort这里就不列出了,请参考相关源码或算法结构。

// class Term
// Orders terms by field name first and by text second, both compared ordinally
// (by character value). The result is negative, zero or positive as defined by
// String.CompareOrdinal.
public int CompareTo(Term other)
{
    // Different fields: the field names alone decide the order.
    if (field != other.field)
    {
        return String.CompareOrdinal(field, other.field);
    }

    // Same field: fall back to the term text.
    return String.CompareOrdinal(text, other.text);
}
从上面可得知,Term先按字段名、再按文本内容,以字符的数值(序数)进行排序。

5. 将Posting写入文件
03. 合并Segments

来源:http://blog.esoutong.com/user1/Lucene/archives/2006/796.html

tags:lucene lucene.net dotlucene索引文件 存储 操作 增量索引 IndexWriter Segments

你可能感兴趣的:(Lucene)