Lucene supports deletion both by Term and by Query. The two mechanisms are essentially the same, so we will use delete-by-Term as the example.
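Before diving into the internals, here is what a delete-by-Term looks like from the caller's side. A minimal usage sketch (the index path, analyzer, and field name are assumptions for illustration):
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DeleteByTermExample {
    public static void main(String[] args) throws Exception {
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/lucene-index"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            // The delete is only buffered here; it takes effect once the buffered
            // updates are applied during flush, commit or merge.
            long seqNo = writer.deleteDocuments(new Term("id", "42"));
            System.out.println("sequence number = " + seqNo);
            writer.commit();
        }
    }
}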
Let's start from the first step: a delete in Lucene is initiated through IndexWriter:
public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
/**
* Event queue holding events such as
* {@link DeleteNewFilesEvent}
* {@link ApplyDeletesEvent}
* {@link FlushFailedEvent}
* which are later processed according to their type, e.g. applying deletes.
* @see #processEvents(boolean, boolean)
*/
private final Queue<Event> eventQueue;
/**
* Global delete: puts the data to be deleted into {@link DocumentsWriter#deleteQueue}.
*/
public long deleteDocuments(Term... terms) throws IOException {
ensureOpen();
try {
// If, while this delete is being added, the new in-memory doc data exceeds 16MB, a segment flush is triggered.
// That flush handles the delete-related data first, because an ApplyDeletesEvent has been queued.
// seqNo < 0 is used to signal that a flush was triggered.
long seqNo = docWriter.deleteTerms(terms);
if (seqNo < 0) {
seqNo = -seqNo;
// seqNo < 0 means a segment flush was triggered and an ApplyDeletesEvent instance should exist (see below); process the delete event here
processEvents(true, false);
}
return seqNo;
} catch (VirtualMachineError tragedy) {
tragicEvent(tragedy, "deleteDocuments(Term..)");
// dead code but javac disagrees:
return -1;
}
}
}
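Delete-by-Query enters through the same class; a minimal sketch of the analogous call (the field, value, and helper name are illustrative placeholders):
import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

final class DeleteByQuerySketch {
    // Buffers a delete for every document matching the query; internally it is recorded
    // as a query node in the same DocumentsWriterDeleteQueue walked through below.
    static long deleteObsolete(IndexWriter writer) throws IOException {
        Query query = new TermQuery(new Term("category", "obsolete"));
        return writer.deleteDocuments(query);
    }
}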
The most important line in deleteDocuments above is
long seqNo = docWriter.deleteTerms(terms);
so let's drill into it:
final class DocumentsWriter implements Closeable, Accountable {
/**
* Holds the buffered Term-delete and Query-delete data
*/
volatile DocumentsWriterDeleteQueue deleteQueue;
synchronized long deleteTerms(final Term... terms) throws IOException {
// TODO why is this synchronized?
final DocumentsWriterDeleteQueue deleteQueue = this.deleteQueue;
// Buffer the Term deletes; the return value is the current sequence number, which covers both Term and Query deletes
long seqNo = deleteQueue.addDelete(terms);
// Decide, based on current conditions, whether a flush should be triggered, mainly whether the new in-memory data has reached 16MB.
// If it is triggered, DocumentsWriterFlushControl#fullFlush is set to true, and the flush later checks this flag.
flushControl.doOnDelete();
lastSeqNo = Math.max(lastSeqNo, seqNo);
// If a flush was triggered, i.e. the new in-memory data exceeded 16MB, try to resolve all pending deletes
if (applyAllDeletes(deleteQueue)) {
seqNo = -seqNo;
}
return seqNo;
}
/**
* If buffered deletes are using too much heap, resolve them and write disk and return true.
*/
private boolean applyAllDeletes(DocumentsWriterDeleteQueue deleteQueue) throws IOException {
// If a flush was triggered, try to resolve all deletes; this checks DocumentsWriterFlushControl#fullFlush
if (flushControl.getAndResetApplyAllDeletes()) {
if (deleteQueue != null) {
// Wrap all currently buffered delete data and put it into DocumentsWriterFlushQueue#queue
ticketQueue.addDeletes(deleteQueue);
}
// Queue an event for applying deletes; this is the trigger point: all events are processed at the end of the delete call
putEvent(ApplyDeletesEvent.INSTANCE); // apply deletes event forces a purge
return true;
}
return false;
}
}
class DocumentsWriterFlushQueue {
/**
* Pending updates and deletes waiting to be flushed
*/
private final Queue<FlushTicket> queue = new LinkedList<>();
/**
* Adds all buffered delete and update data to the queue.
*
* @param deleteQueue
* @throws IOException
*/
void addDeletes(DocumentsWriterDeleteQueue deleteQueue) throws IOException {
synchronized (this) {
incTickets();// first inc the ticket count - freeze opens
// a window for #anyChanges to fail
boolean success = false;
try {
queue.add(new GlobalDeletesTicket(deleteQueue.freezeGlobalBuffer(null)));
success = true;
} finally {
if (!success) {
decTickets();
}
}
}
}
}
From the code and comments above we can already see the picture: every delete-by-Term may trigger the flush and merge process. If it does, an ApplyDeletesEvent is queued and, at the same time, all pending deletes and updates are placed into DocumentsWriterFlushQueue#queue; the subsequent merge and flush then process everything in that queue.
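The 16MB threshold mentioned above is simply IndexWriter's default RAM buffer size (IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB); it can be tuned when the writer is created. A minimal configuration sketch (the 64MB value is an arbitrary example):
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;

final class RamBufferConfigSketch {
    static IndexWriterConfig newConfig() {
        // The default is IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB (16.0); raising it
        // makes the flush (and therefore the apply-deletes path above) trigger less often.
        return new IndexWriterConfig(new StandardAnalyzer()).setRAMBufferSizeMB(64.0);
    }
}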
The next line in DocumentsWriter.deleteTerms that we need to dig into is
long seqNo = deleteQueue.addDelete(terms);
so let's step into it:
final class DocumentsWriterDeleteQueue implements Accountable {
/**
* Every DWPT, when it adds an update, points tail at the newest delete node.
* {@link #globalSlice} then checks whether its own sliceTail still matches this tail;
* if not, sliceTail is advanced to the new tail and the node is added to {@link #globalBufferedUpdates}.
* the current end (latest delete operation) in the delete queue:
*/
private volatile Node<?> tail;
/**
* Linked list that temporarily holds the delete nodes
*/
private final DeleteSlice globalSlice;
/**
* The buffer where deletes and updates finally end up
*/
private final BufferedUpdates globalBufferedUpdates;
/**
* Deletes all docs containing the given terms; this merely appends the delete node.
*
* @param terms
* @return
*/
long addDelete(Term... terms) {
long seqNo = add(new TermArrayNode(terms));
tryApplyGlobalSlice();
return seqNo;
}
/**
* Points tail at the newest node.
*
* @param newNode
* @return
*/
synchronized long add(Node<?> newNode) {
tail.next = newNode;
this.tail = newNode;
return getNextSequenceNumber();
}
/**
* Tries to apply the global slice.
* Locking is required while {@link #tail} is being updated.
*/
void tryApplyGlobalSlice() {
if (globalBufferLock.tryLock()) {
/*
* The global buffer must be locked but we don't need to update them if
* there is an update going on right now. It is sufficient to apply the
* deletes that have been added after the current in-flight global slices
* tail the next time we can get the lock!
*/
try {
// Check whether tail has advanced; if so, update globalSlice's sliceTail to the current tail and add the node(s) to globalBufferedUpdates
if (updateSliceNoSeqNo(globalSlice)) {
globalSlice.apply(globalBufferedUpdates, BufferedUpdates.MAX_INT);
}
} finally {
globalBufferLock.unlock();
}
}
}
/**
* Just like updateSlice, but does not assign a sequence number
*/
boolean updateSliceNoSeqNo(DeleteSlice slice) {
if (slice.sliceTail != tail) {
// new deletes arrived since we last checked
// Assign the current tail to globalSlice's sliceTail
slice.sliceTail = tail;
return true;
}
return false;
}
}
The two key lines in the code above are:
if (updateSliceNoSeqNo(globalSlice)) {
globalSlice.apply(globalBufferedUpdates, BufferedUpdates.MAX_INT);
}
updateSliceNoSeqNo(globalSlice): this step assigns the newly appended delete node to globalSlice's sliceTail.
globalSlice.apply(globalBufferedUpdates, BufferedUpdates.MAX_INT): this step transfers the newly added delete nodes from globalSlice into globalBufferedUpdates:
static class DeleteSlice {
Node<?> sliceHead;
Node<?> sliceTail;
void apply(BufferedUpdates del, int docIDUpto) {
if (sliceHead == sliceTail) {
// 0 length slice
return;
}
Node<?> current = sliceHead;
do {
current = current.next;
assert current != null : "slice property violated between the head on the tail must not be a null node";
// Append the nodes between sliceHead and sliceTail to the buffered updates (globalBufferedUpdates here)
current.apply(del, docIDUpto);
} while (current != sliceTail);
reset();
}
void reset() {
// Reset to a 0 length slice
sliceHead = sliceTail;
}
}
private static final class TermArrayNode extends Node<Term[]> {
@Override
void apply(BufferedUpdates bufferedUpdates, int docIDUpto) {
for (Term term : item) {
// Add the Term delete to the buffered updates (globalBufferedUpdates in this path)
bufferedUpdates.addTerm(term, docIDUpto);
}
}
}
To sum up the code so far: when deleting by Term, if the doc-related data currently held in memory exceeds 16MB, a flush and merge are triggered. All pending delete operations are placed into DocumentsWriterFlushQueue#queue, and the subsequent flush and merge apply those deletes to the segments.
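To make the slice mechanism easier to picture, here is a heavily simplified, self-contained model of the same idea (a sketch, not Lucene code; all names are invented for illustration):
import java.util.ArrayList;
import java.util.List;

// Toy model of DocumentsWriterDeleteQueue's linked list and its global slice: new delete
// "nodes" are appended at the tail; the slice remembers how far it has already consumed
// and, on apply, moves everything between its old position and the current tail into a buffer.
class ToyDeleteQueue {
    static final class Node {
        final String term;
        Node next;
        Node(String term) { this.term = term; }
    }

    private Node tail = new Node(null);               // sentinel node, like the queue's initial tail
    private Node sliceHead = tail;                     // the "global slice" bounds
    private Node sliceTail = tail;
    final List<String> buffered = new ArrayList<>();   // stands in for globalBufferedUpdates

    synchronized void addDelete(String term) {
        Node node = new Node(term);
        tail.next = node;                              // append at the end of the list
        tail = node;
        tryApplyGlobalSlice();
    }

    private void tryApplyGlobalSlice() {
        if (sliceTail != tail) {                       // new nodes arrived since the last check
            sliceTail = tail;
            Node current = sliceHead;
            do {                                        // move each new node into the buffer
                current = current.next;
                buffered.add(current.term);
            } while (current != sliceTail);
            sliceHead = sliceTail;                     // reset to a zero-length slice
        }
    }
}
The real code differs in that the global slice is updated under a tryLock and each DWPT additionally keeps its own slice, but the head/tail bookkeeping follows the same idea.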
That covers the entry point of a delete, i.e. how docWriter.deleteTerms(terms) appends the delete node. Next up is what happens when a flush and merge were triggered: processEvents(true, false), the handling of the ApplyDeletesEvent, and the flush and merge themselves.
public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
public long deleteDocuments(Term... terms) throws IOException {
ensureOpen();
try {
// If, while this delete is being added, the new in-memory doc data exceeds 16MB, a segment flush is triggered.
// That flush handles the delete-related data first, because an ApplyDeletesEvent has been queued.
// seqNo < 0 is used to signal that a flush was triggered.
long seqNo = docWriter.deleteTerms(terms);
if (seqNo < 0) {
seqNo = -seqNo;
// If a flush and merge were triggered, first publish the delete data, then flush and merge
processEvents(true, false);
}
return seqNo;
} catch (VirtualMachineError tragedy) {
tragicEvent(tragedy, "deleteDocuments(Term..)");
// dead code but javac disagrees:
return -1;
}
}
/**
* Processes events, dispatching on their type; this is where the delete event ApplyDeletesEvent is handled.
*
* @param triggerMerge
* @param forcePurge
* @throws IOException
*/
private void processEvents(boolean triggerMerge, boolean forcePurge) throws IOException {
// Process all queued events, which may include an ApplyDeletesEvent
processEvents(eventQueue, triggerMerge, forcePurge);
if (triggerMerge) {
// Let the current MergePolicy decide whether a segment merge should run
maybeMerge(getConfig().getMergePolicy(), MergeTrigger.SEGMENT_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
}
}
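The MergePolicy consulted by maybeMerge above is whatever was configured on the IndexWriterConfig. A minimal tuning sketch (the concrete numbers are arbitrary examples):
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;

final class MergePolicyConfigSketch {
    static IndexWriterConfig newConfig() {
        // TieredMergePolicy is the default policy; these setters only illustrate that
        // maybeMerge(getConfig().getMergePolicy(), ...) picks up whatever is configured here.
        TieredMergePolicy mergePolicy = new TieredMergePolicy();
        mergePolicy.setSegmentsPerTier(10.0);
        mergePolicy.setMaxMergedSegmentMB(5 * 1024.0);
        return new IndexWriterConfig(new StandardAnalyzer()).setMergePolicy(mergePolicy);
    }
}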
While the ApplyDeletesEvent is being handled, the following method is eventually invoked:
public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
/**
* Publishes the frozen updates and deletes by putting them into {@link BufferedUpdatesStream#updates},
* so that flush and merge can use them later.
*
* @param packet
* @throws IOException
*/
synchronized void publishFrozenUpdates(FrozenBufferedUpdates packet) throws IOException {
assert packet != null && packet.any();
bufferedUpdatesStream.push(packet);
docWriter.putEvent(new DocumentsWriter.ResolveUpdatesEvent(packet));
}
}
class BufferedUpdatesStream implements Accountable {
/**
* All frozen delete and update operations
*/
private final Set<FrozenBufferedUpdates> updates = new HashSet<>();
public synchronized long push(FrozenBufferedUpdates packet) {
......
// Add the frozen deletes and updates to updates
updates.add(packet);
numTerms.addAndGet(packet.numTermDeletes);
......
return packet.delGen();
}
}
When Lucene performs a merge, it reaches IndexWriter's mergeInit method:
public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
/**
* Handles the stream of all updates and tracks the {@link FrozenBufferedUpdates} packets
* (see the class javadoc).
*/
final BufferedUpdatesStream bufferedUpdatesStream;
/**
* Does initial setup for a merge, which is fast but holds
* the synchronized lock on IndexWriter instance.
*/
final void mergeInit(MergePolicy.OneMerge merge) throws IOException {
// Make sure any deletes that must be resolved before we commit the merge are complete:
// All deletes and updates must be resolved before the merge; this step applies them, Term, Query and DocValues alike, to every segment
bufferedUpdatesStream.waitApplyForMerge(merge.segments);
boolean success = false;
try {
_mergeInit(merge);
success = true;
} finally {
if (!success) {
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "hit exception in mergeInit");
}
mergeFinish(merge);
}
}
}
}
This bufferedUpdatesStream is exactly the one used above when handling the ApplyDeletesEvent: that step added all pending deletes and updates to BufferedUpdatesStream's updates set, and here is where that data gets consumed.
class BufferedUpdatesStream implements Accountable {
/**
* All frozen delete and update operations
*/
private final Set<FrozenBufferedUpdates> updates = new HashSet<>();
private final IndexWriter writer;
public void waitApplyForMerge(List<SegmentCommitInfo> mergeInfos) throws IOException {
......
Set<FrozenBufferedUpdates> waitFor = new HashSet<>();
synchronized (this) {
// Collect the relevant packets from updates into waitFor
for (FrozenBufferedUpdates packet : updates) {
if (packet.delGen() <= maxDelGen) {
// We must wait for this packet before finishing the merge because its
// deletes apply to a subset of the segments being merged:
waitFor.add(packet);
}
}
}
// Resolve these deletes and updates
waitApply(waitFor);
}
/**
* Waits until the updates of every DWPT have been resolved.
*
* @param waitFor
* @throws IOException
*/
private void waitApply(Set<FrozenBufferedUpdates> waitFor) throws IOException {
......
for (FrozenBufferedUpdates packet : waitFor) {
// Each buffered-updates packet is applied by the IndexWriter
packet.apply(writer);
}
}
}
In this way, the delete and update data held in each FrozenBufferedUpdates takes effect on the IndexWriter, i.e. on all of its segments.
class FrozenBufferedUpdates {
public synchronized void apply(IndexWriter writer) throws IOException {
......
// Open the segment states of all segments involved in this merge
BufferedUpdatesStream.SegmentState[] segStates = writer.bufferedUpdatesStream.openSegmentStates(writer.readerPool, infos, seenSegments,delGen());
......
// don't hold IW monitor lock here so threads are free concurrently resolve deletes/updates:
delCount = apply(segStates);
}
/**
* Applies pending delete-by-term, delete-by-query and doc values updates to all segments in the index, returning
* the number of new deleted or updated documents.
*/
private synchronized long apply(BufferedUpdatesStream.SegmentState[] segStates) throws IOException {
if (delGen == -1) {
// we were not yet pushed
throw new IllegalArgumentException("gen is not yet set; call BufferedUpdatesStream.push first");
}
assert applied.getCount() != 0;
if (privateSegment != null) {
assert segStates.length == 1;
assert privateSegment == segStates[0].reader.getSegmentInfo();
}
// Apply the various kinds of updates
totalDelCount += applyTermDeletes(segStates);
totalDelCount += applyQueryDeletes(segStates);
totalDelCount += applyDocValuesUpdates(segStates);
return totalDelCount;
}
}
Thus, before the merge, each segment's data has passed through the delete and update filter: only the data that should remain is left, and what gets flushed to disk is live data.
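Conceptually, resolving one delete-by-Term against one segment boils down to looking up the term's postings and recording the matching docIDs as deleted. The sketch below illustrates that idea against a LeafReader using public APIs; the real applyTermDeletes works on internal SegmentState objects and live-docs bookkeeping instead (field and term values are placeholders):
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

final class TermDeleteResolutionSketch {
    // Toy version of "apply one delete-by-term to one segment": look up the term's postings
    // and mark every matching docID in a bitset standing in for the segment's deleted docs.
    static FixedBitSet resolveTermDelete(LeafReader segmentReader, String field, String text) throws IOException {
        FixedBitSet deleted = new FixedBitSet(segmentReader.maxDoc());
        Terms terms = segmentReader.terms(field);
        if (terms == null) {
            return deleted;                  // this segment has no postings for the field
        }
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(new BytesRef(text))) {
            PostingsEnum postings = termsEnum.postings(null, PostingsEnum.NONE);
            for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                deleted.set(doc);            // in Lucene this ends up as a cleared bit in the segment's liveDocs
            }
        }
        return deleted;
    }
}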
To sum up the flow: an ApplyDeletesEvent is queued; when it is processed, the data in the global delete list is frozen (no further additions are allowed) and appended to the set of frozen packets held by BufferedUpdatesStream, from which flush and merge later resolve it against the segments.
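Finally, the effect is observable from the reader side: once the buffered deletes have been applied, newly opened readers no longer see the deleted documents. A small verification sketch (the field and value are assumptions; numDeletedDocs counts documents that are marked deleted but not yet merged away):
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

final class DeleteEffectSketch {
    // After the delete is applied and committed, a freshly opened reader reflects it:
    // numDocs() drops, while numDeletedDocs() reports docs that are marked deleted but
    // still physically present until a merge rewrites the segments without them.
    static void showDeleteEffect(IndexWriter writer) throws Exception {
        writer.deleteDocuments(new Term("id", "42"));
        writer.commit();
        try (DirectoryReader reader = DirectoryReader.open(writer.getDirectory())) {
            System.out.println("live docs    = " + reader.numDocs());
            System.out.println("deleted docs = " + reader.numDeletedDocs());
        }
    }
}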