What is the pattern behind the file names in an index directory?
For example, an index directory might contain:
_9.cfs
_9.cfx
segments_k
segments.gen
The _9 prefix is produced by newSegmentName():

private final synchronized String newSegmentName() {
  return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
}
The current value of segmentInfos.counter is converted to base 36 (Character.MAX_RADIX is 36) and an underscore is prepended; the post-increment then advances the counter for the next segment. Because a new name is handed out each time a new segment is created, the counter records how many segments have been named so far, not the number of documents in a segment. The suffix of segments_k is likewise a base-36 generation number, and segments.gen simply records the current generation.
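A quick standalone check of the same conversion (the class name SegmentNameDemo is made up for this demo; only Integer.toString and Character.MAX_RADIX come from the snippet above) shows how counter values map onto segment names:

public class SegmentNameDemo {
  public static void main(String[] args) {
    // Character.MAX_RADIX is 36: digits 0-9 followed by letters a-z
    int[] counters = {9, 10, 35, 36, 46};
    for (int counter : counters) {
      System.out.println(counter + " -> _" + Integer.toString(counter, Character.MAX_RADIX));
    }
    // prints: 9 -> _9, 10 -> _a, 35 -> _z, 36 -> _10, 46 -> _1a
  }
}

So a file such as _9.cfs belongs to the segment that was named when the counter stood at 9.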
Document inversion. This is the most memory-hungry phase of building the index: besides the term text itself, term positions, frequencies and other per-term information also have to be buffered.
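As a rough mental model only (Lucene itself packs postings into reused byte and int block pools rather than plain Java collections; the class InvertSketch and everything in it is invented for illustration), the per-field state accumulated during inversion can be pictured like this:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical sketch: term -> list of (docId, freq, positions), which is the
// information the inverted index ultimately has to hold for each term.
public class InvertSketch {

  static class Posting {
    int docId;
    int freq;                                           // occurrences of the term in this doc
    List<Integer> positions = new ArrayList<Integer>(); // token position of each occurrence
  }

  static Map<String, List<Posting>> postings = new HashMap<String, List<Posting>>();

  // record one occurrence of term at the given token position in docId
  static void add(String term, int docId, int position) {
    List<Posting> list = postings.get(term);
    if (list == null) {
      list = new ArrayList<Posting>();
      postings.put(term, list);
    }
    Posting last = list.isEmpty() ? null : list.get(list.size() - 1);
    if (last == null || last.docId != docId) {
      last = new Posting();
      last.docId = docId;
      list.add(last);
    }
    last.freq++;
    last.positions.add(position);
  }

  public static void main(String[] args) {
    // doc 0: "lucene in action", doc 1: "lucene index"
    add("lucene", 0, 0); add("in", 0, 1); add("action", 0, 2);
    add("lucene", 1, 0); add("index", 1, 1);
    System.out.println("'lucene' appears in " + postings.get("lucene").size() + " documents");
  }
}

The class that actually drives inversion for a single field is DocInverterPerField; its (Lucene 2.9-era) source follows.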
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Holds state for inverting all occurrences of a single
* field in the document. This class doesn't do anything
* itself; instead, it forwards the tokens produced by
* analysis to its own consumer
* (InvertedDocConsumerPerField). It also interacts with an
* endConsumer (InvertedDocEndConsumerPerField).
*/
final class DocInverterPerField extends DocFieldConsumerPerField {

  final private DocInverterPerThread perThread;
  final private FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;

  public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
    this.perThread = perThread;
    this.fieldInfo = fieldInfo;
    docState = perThread.docState;
    fieldState = perThread.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    this.endConsumer = perThread.endConsumer.addField(this, fieldInfo);
  }

  void abort() {
    consumer.abort();
    endConsumer.abort();
  }

  public void processFields(final Fieldable[] fields,
                            final int count) throws IOException {

    fieldState.reset(docState.doc.getBoost());

    final int maxFieldLength = docState.maxFieldLength;
    final boolean doInvert = consumer.start(fields, count);

    for(int i=0;i<count;i++) {
      final Fieldable field = fields[i];

      // TODO FI: this should be "genericized" to querying
      // consumer if it wants to see this particular field
      // tokenized.

      if (field.isIndexed() && doInvert) {

        if (fieldState.length > 0)
          fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);

        if (!field.isTokenized()) { // un-tokenized field
          String stringValue = field.stringValue();
          final int valueLength = stringValue.length();
          perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
          fieldState.attributeSource = perThread.singleTokenTokenStream;
          perThread.localTokenStream.reset();
          consumer.start(field);

          boolean success = false;
          try {
            consumer.add();
            success = true;
          } finally {
            if (!success)
              docState.docWriter.setAborting();
          }

          fieldState.offset += valueLength;
          fieldState.length++;
          fieldState.position++;
        } else { // tokenized field
          final TokenStream stream;
          final TokenStream streamValue = field.tokenStreamValue();

          if (streamValue != null)
            stream = streamValue;
          else {
            // the field does not have a TokenStream,
            // so we have to obtain one from the analyzer
            final Reader reader; // find or make Reader
            final Reader readerValue = field.readerValue();

            if (readerValue != null)
              reader = readerValue;
            else {
              String stringValue = field.stringValue();
              if (stringValue == null)
                throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
              perThread.stringReader.init(stringValue);
              reader = perThread.stringReader;
            }

            // Tokenize field and add to postingTable
            stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
          }

          // reset the TokenStream to the first token
          stream.reset();

          try {
            int offsetEnd = fieldState.offset-1;
            boolean useNewTokenStreamAPI = stream.useNewAPI();
            Token localToken = null;

            if (useNewTokenStreamAPI) {
              fieldState.attributeSource = stream;
            } else {
              fieldState.attributeSource = perThread.localTokenStream;
              localToken = perThread.localToken;
            }

            consumer.start(field);

            OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

            for(;;) {

              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID

              Token token = null;
              /**
               * token.termText    -- the term text produced by the analyzer
               * token.startOffset -- start offset of the term in the original text
               * token.endOffset   -- end offset of the term in the original text
               */
              if (useNewTokenStreamAPI) {
                if (!stream.incrementToken()) break;
              } else {
                token = stream.next(localToken);
                if (token == null) break;
                perThread.localTokenStream.set(token);
              }

              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr - 1;
              if (posIncr == 0)
                fieldState.numOverlap++;

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success)
                  docState.docWriter.setAborting();
              }

              fieldState.position++;
              offsetEnd = fieldState.offset + offsetAttribute.endOffset();
              if (++fieldState.length >= maxFieldLength) {
                if (docState.infoStream != null)
                  docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
                break;
              }
            }

            fieldState.offset = offsetEnd+1;
          } finally {
            stream.close();
          }
        }

        fieldState.boost *= field.getBoost();
      }
    }

    consumer.finish();
    endConsumer.finish();
  }
}
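For reference, the "new TokenStream API" branch above (incrementToken() plus attributes) is the same consumption pattern an application sees from the outside. A minimal sketch, assuming the Lucene 2.9-era StandardAnalyzer, TermAttribute and Version.LUCENE_29 (the class name TokenStreamDemo and the field name "body" are made up for this demo):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamDemo {
  public static void main(String[] args) throws IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
    TokenStream stream = analyzer.tokenStream("body", new StringReader("Lucene in Action"));

    // the same attributes DocInverterPerField pulls from its attribute source
    TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
    OffsetAttribute offset = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncr =
        (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    while (stream.incrementToken()) { // same loop shape as the useNewTokenStreamAPI branch
      System.out.println(term.term()
          + " [" + offset.startOffset() + "," + offset.endOffset() + "]"
          + " +" + posIncr.getPositionIncrement());
    }
    stream.close();
  }
}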