Lucene IndexWriter updateDocument(): a detailed source-code walkthrough
Continuing from the previous post on how a Solr update request (SolrUpdateRequest) updates documents.
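Before diving into DocumentsWriter, here is a minimal sketch of the entry point this post traces. It assumes a Lucene 3.0-era API consistent with the classes discussed below; the directory, field names and values are only illustrative.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class UpdateDocumentDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new StandardAnalyzer(Version.LUCENE_30),
                IndexWriter.MaxFieldLength.UNLIMITED);

        Document doc = new Document();
        doc.add(new Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("content", "i live in guangzhou", Field.Store.YES, Field.Index.ANALYZED));

        // updateDocument = delete-by-term + add: the Term is recorded as a pending
        // delete and the new document goes through DocumentsWriter.updateDocument().
        writer.updateDocument(new Term("id", "42"), doc);

        writer.close();
        dir.close();
    }
}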
1. DocumentsWriter.updateDocument()
|--> DocumentsWriterThreadState state = getThreadState(doc, delTerm);   | obtain the DocumentsWriterThreadState that will handle this document (summarized in the sketch after this call tree)
  |--> DocumentsWriterThreadState[] threadStates;   | the array starts out empty (length 0)
  |--> state = (DocumentsWriterThreadState) threadBindings.get(Thread.currentThread());   | look up the state already bound to the current thread in threadBindings
  |--> if (state == null)
    |--> minThreadState   | scan the threadStates and pick the one with the smallest numThreads
      |--> DocumentsWriterThreadState ts = threadStates[i];
           if (minThreadState == null || ts.numThreads < minThreadState.numThreads) {
             minThreadState = ts;
           }
    |--> if (minThreadState != null && (minThreadState.numThreads == 0 || threadStates.length >= MAX_THREAD_STATE)) {
           state = minThreadState;   | reuse minThreadState when it is idle (numThreads == 0), or when the number of states has already reached the MAX_THREAD_STATE limit of 5
         }
    |--> else   | otherwise create a new DocumentsWriterThreadState
      |--> DocumentsWriterThreadState[] newArray = new DocumentsWriterThreadState[1 + threadStates.length];   | allocate a new array one slot larger
      |--> System.arraycopy(threadStates, 0, newArray, 0, threadStates.length);
      |--> state = newArray[threadStates.length] = new DocumentsWriterThreadState(this);
      |--> threadStates = newArray;
    |--> threadBindings.put(Thread.currentThread(), state);   | bind the state to the current thread in the HashMap
  |--> waitReady(state);
  |--> initSegmentName(false);
|--> if (delTerm != null) {   | if a delete Term was passed in, record it as a pending delete
       addDeleteTerm(delTerm, state.docState.docID);
       state.doFlushAfter = timeToFlushDeletes();
     }
|--> docState.doc = doc;   | set up the document state
     docState.analyzer = analyzer;
|--> final DocWriter perDoc = state.consumer.processDocument();   | kick off the consumer chain; the processing is detailed below
|--> finishDocument(state, perDoc);   | finish handling the document
  |--> waitQueue.add(skipDocWriter);
  |--> addDeleteDocID(state.docState.docID);   | mark the document ID as deleted
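The thread-state selection above boils down to the following simplified sketch. The class and field names are abbreviated stand-ins, not the actual Lucene source; it only illustrates the reuse-or-grow logic under the MAX_THREAD_STATE limit of 5.

import java.util.HashMap;
import java.util.Map;

// Simplified sketch of getThreadState(): reuse an idle state, otherwise grow
// the array until MAX_THREAD_STATE states exist, then share the least-loaded one.
class ThreadStatePool {
    static final int MAX_THREAD_STATE = 5;

    static class ThreadState {
        int numThreads;   // how many threads are currently bound to this state
    }

    private ThreadState[] threadStates = new ThreadState[0];
    private final Map<Thread, ThreadState> threadBindings = new HashMap<Thread, ThreadState>();

    synchronized ThreadState getThreadState() {
        ThreadState state = threadBindings.get(Thread.currentThread());
        if (state == null) {
            // Pick the state with the fewest bound threads.
            ThreadState minThreadState = null;
            for (ThreadState ts : threadStates) {
                if (minThreadState == null || ts.numThreads < minThreadState.numThreads) {
                    minThreadState = ts;
                }
            }
            if (minThreadState != null
                    && (minThreadState.numThreads == 0 || threadStates.length >= MAX_THREAD_STATE)) {
                state = minThreadState;   // reuse an idle state, or share once the limit is hit
            } else {
                // Grow the array by one slot and create a fresh state.
                ThreadState[] newArray = new ThreadState[1 + threadStates.length];
                System.arraycopy(threadStates, 0, newArray, 0, threadStates.length);
                state = newArray[threadStates.length] = new ThreadState();
                threadStates = newArray;
            }
            threadBindings.put(Thread.currentThread(), state);
        }
        state.numThreads++;
        return state;
    }
}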
2. DocFieldProcessorPerThread.processDocument()
|--> consumer.startDocument();
  |--> DocInverterPerThread.startDocument();
    |--> consumer.startDocument();   | TermsHashPerThread
      |--> consumer.startDocument();
        |--> FreqProxTermsWriterPerThread.startDocument()   | does nothing
      |--> if (nextPerThread != null)
        |--> nextPerThread.consumer.startDocument();   | TermVectorsTermsWriterPerThread, also does nothing here
    |--> endConsumer.startDocument();   | NormsWriterPerThread
|--> fieldsWriter.startDocument();
  |--> if (doc != null) {   | clear out the previous record
         doc.reset();
         doc.docID = docState.docID;
       }
|--> Fieldable field = (Fieldable) docFields.get(i);   | process each field of the document in turn
|--> DocFieldProcessorPerField fp = fieldHash[hashPos];   | hash the field name and look its DocFieldProcessorPerField up in the fieldHash array (see the sketch after this call tree)
|--> while (fp != null && !fp.fieldInfo.name.equals(fieldName))   | walk the collision chain until the end, or until a matching fieldName is found
       fp = fp.next;
|--> if (fp == null)   | reached the end of the chain, so a new DocFieldProcessorPerField is needed
  |--> fp = new DocFieldProcessorPerField(this, fi);   | create the new DocFieldProcessorPerField
  |--> fieldHash[hashPos] = fp;   | insert it at the head of the fieldHash chain
  |--> if (totalFieldCount >= fieldHash.length/2)   | once the number of fields reaches half the table size, rehash() doubles the table
         rehash();
|--> else   | a field with this name already exists, so only its FieldInfo needs to be updated
  |--> fp.fieldInfo.update(field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTf());
|--> if (fieldCount == fields.length)   | grow fields: when fieldCount reaches fields.length, System.arraycopy into a new array of twice the size
|--> if (fp.fieldCount == fp.fields.length)   | likewise grow fp.fields to twice its size when it is full
|--> if (field.isStored()) {   | stored fields are handed to fieldsWriter and written out later
       fieldsWriter.addField(field, fp.fieldInfo);
     }
|--> quickSort(fields, 0, fieldCount-1);   | quick-sort the fields array alphabetically by fieldInfo.name
  |--> array[lo].fieldInfo.name.compareTo(array[hi].fieldInfo.name)
|--> for (int i = 0; i < fieldCount; i++)   | pass each field to the next link in the chain, DocInverterPerField
       fields[i].consumer.processFields(fields[i].fields, fields[i].fieldCount);
|--> final DocumentsWriter.DocWriter one = fieldsWriter.finishDocument();   | finish the document
     final DocumentsWriter.DocWriter two = consumer.finishDocument();
|--> finally return the PerDoc for this document
  |--> PerDoc both = getPerDoc();
  |--> both.one = one;
  |--> both.two = two;
  |--> return both;
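The fieldName lookup above is a chained hash table that is rehashed once it is half full. Below is a minimal sketch of the same lookup-or-insert pattern, with simplified stand-in names rather than the real DocFieldProcessorPerField.

// Sketch of the fieldHash lookup used in processDocument(): hash the field name,
// walk the bucket's chain, create a new entry on a miss, and rehash when the
// total entry count reaches half the table size.
class FieldHashSketch {
    static class PerField {
        final String name;
        PerField next;   // collision chain
        PerField(String name) { this.name = name; }
    }

    private PerField[] fieldHash = new PerField[2];
    private int totalFieldCount;

    PerField getOrAdd(String fieldName) {
        int hashPos = fieldName.hashCode() & (fieldHash.length - 1);
        PerField fp = fieldHash[hashPos];
        while (fp != null && !fp.name.equals(fieldName)) {
            fp = fp.next;                    // walk the chain until the end or a name match
        }
        if (fp == null) {
            fp = new PerField(fieldName);
            fp.next = fieldHash[hashPos];    // insert at the head of the chain
            fieldHash[hashPos] = fp;
            totalFieldCount++;
            if (totalFieldCount >= fieldHash.length / 2) {
                rehash();                    // keep the load factor below 0.5
            }
        }
        return fp;
    }

    private void rehash() {
        PerField[] newHash = new PerField[fieldHash.length * 2];
        for (PerField fp : fieldHash) {
            while (fp != null) {
                PerField next = fp.next;
                int pos = fp.name.hashCode() & (newHash.length - 1);
                fp.next = newHash[pos];
                newHash[pos] = fp;
                fp = next;
            }
        }
        fieldHash = newHash;
    }
}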
3. DocInverterPerField.processFields()
|--> fieldState.reset(docState.doc.getBoost());   | reset fieldState to its initial values
|--> final int maxFieldLength = docState.maxFieldLength;   | the maximum number of tokens to index per field
|--> doInvert = consumer.start(fields, count);   | the consumer, TermsHashPerField, decides whether this field needs to be inverted
|--> for (int i = 0; i < count; i++) {   | handle each instance of the field in turn
  |--> final Fieldable field = fields[i];
  |--> if (field.isIndexed() && doInvert)   | only fields that are indexed and need inverting
    |--> if (fieldState.length > 0)   | apply the position increment gap between field instances
      |--> fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
    |--> if (!field.isTokenized())   | the field is not tokenized
      |--> String stringValue = field.stringValue();   | take the field's string value
      |--> perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);   | feed the whole value as a single token through singleTokenTokenStream
      |--> consumer.start(field);   | TermsHashPerField starts on the field
      |--> consumer.add();   | TermsHashPerField adds the token
        |--> final char[] tokenText = termAtt.termBuffer();   | read the term text and its length from termAtt
             final int tokenTextLen = termAtt.termLength();
        |--> code = ((code*31) + ch)*31 + ch2;   | compute the term's hash code for the postings hash, much like String.hashCode()
        |--> p = postingsHash[hashPos];   | look up the RawPostingList in postingsHash
        |--> while (p != null && !postingEquals(tokenText, tokenTextLen))   | walk the chain until the end, or until the same term text is found
        |--> if (p == null)   | a new RawPostingList is needed
          |--> p = perThread.freePostings[--perThread.freePostingsCount];   | take a RawPostingList from the free list
          |--> a RawPostingList writes into three pools: charPool, bytePool and intPool
            |--> charPool: stores the token text as char[] data
              |--> final char[] text = charPool.buffer;
                   final int textUpto = charPool.charUpto;
                   p.textStart = textUpto + charPool.charOffset;
                   charPool.charUpto += textLen1;
            |--> bytePool: stores each token's freq and position (prox) information
              |--> p.byteStart = intUptos[intUptoStart];
            |--> intPool: holds the offsets pointing at the token's freq and prox data inside bytePool
              |--> intUptos = intPool.buffer;
                   intUptoStart = intPool.intUpto;
                   intPool.intUpto += streamCount;
                   p.intStart = intUptoStart + intPool.intOffset;
          |--> postingsHash[hashPos] = p;   | store it in the postings hash
          |--> consumer.newTerm(p);   | a brand-new term: FreqProxTermsWriterPerField records it
            |--> FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0;
                 p.lastDocID = docState.docID;
                 writeProx(p, fieldState.position);
              |--> if (payload != null && payload.length > 0)   | termsHashPerField writes the payload
              |--> else
                |--> termsHashPerField.writeVInt(1, proxCode << 1);
        |--> else
          |--> consumer.addTerm(p);   | the term already exists: append this occurrence (see the encoding sketch after this call tree)
            |--> if (omitTermFreqAndPositions)   | are term frequencies and positions omitted?
              |--> termsHashPerField.writeVInt(0, p.lastDocCode);
            |--> else
              |--> if (docState.docID != p.lastDocID)   | the term is now appearing in a new document
                |--> if (1 == p.docFreq)
                       termsHashPerField.writeVInt(0, p.lastDocCode | 1);
                |--> else
                  |--> termsHashPerField.writeVInt(0, p.lastDocCode);
                  |--> termsHashPerField.writeVInt(0, p.docFreq);
              |--> else
                |--> p.docFreq++;
                |--> writeProx(p, fieldState.position - p.lastPosition);
    |--> else   | the field is tokenized, so analyze it
      |--> final TokenStream streamValue = field.tokenStreamValue();
      |--> if (streamValue != null)
        |--> stream = streamValue;
      |--> else
        |--> final Reader readerValue = field.readerValue();
        |--> perThread.stringReader.init(stringValue);   | when readerValue is null, wrap the string value in a reader and initialize it
      |--> boolean hasMoreTokens = stream.incrementToken();   | pull the next term from the TokenStream and track its offset and position attributes
        |--> OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
             PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
      |--> consumer.add();   | TermsHashPerField adds the token, exactly as above
|--> consumer.finish();   | TermsHashPerField finishes the field
|--> endConsumer.finish();   | NormsWriterPerField finishes the field
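The lastDocCode|1 trick in addTerm() packs the "frequency is exactly 1" case into the document delta so that the freq value can be skipped. The following is a rough, simplified model of that freq-stream encoding; it is an illustrative stand-in that collects ints in a list instead of writing VInts into bytePool.

import java.util.ArrayList;
import java.util.List;

// Sketch of how the freq stream above encodes a term's postings:
// the doc delta is left-shifted by one bit; if the term occurred exactly once
// in the previous document, the low bit is set and the freq value is omitted,
// otherwise the delta is followed by an explicit docFreq.
class FreqStreamSketch {
    int lastDocID = 0;
    int lastDocCode = 0;   // (docID - previous docID) << 1, kept until the next doc arrives
    int docFreq = 0;
    final List<Integer> freqStream = new ArrayList<Integer>();   // stand-in for the VInt byte slice

    // Called once per occurrence of the term in document docID.
    void addOccurrence(int docID) {
        if (docID != lastDocID) {
            // The previous document is complete: flush its delta (and freq if > 1).
            if (docFreq == 1) {
                freqStream.add(lastDocCode | 1);
            } else if (docFreq > 1) {
                freqStream.add(lastDocCode);
                freqStream.add(docFreq);
            }
            lastDocCode = (docID - lastDocID) << 1;
            lastDocID = docID;
            docFreq = 1;
        } else {
            docFreq++;
        }
    }

    // Called when the term is flushed: emit the pending document.
    void finish() {
        if (docFreq == 1) {
            freqStream.add(lastDocCode | 1);
        } else if (docFreq > 1) {
            freqStream.add(lastDocCode);
            freqStream.add(docFreq);
        }
        docFreq = 0;
    }
}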
Appendix: how tokens are stored
Keyword      DocID[frequency]      Positions
guangzhou 1[2] 3,6
he 2[1] 1
i 1[1] 4
live 1[2],2[1] 2,5,2
shanghai 2[1] 3
tom 1[1] 1
E.g. the row "live 1[2],2[1] 2,5,2" means the term appears twice in doc 1 and once in doc 2, and its positions are stored in sequence as 2, 5, 2.
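To reproduce this table with real code, one can index the two example documents and read the postings back, for instance with the Lucene 3.x TermPositions API that matches the source version traced above. The field name and analyzer are illustrative, and note that Lucene's internal docIDs and positions are 0-based, so the printed numbers are one less than the 1-based numbers in the table.

import java.io.IOException;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TokenTableDemo {
    public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(),
                IndexWriter.MaxFieldLength.UNLIMITED);

        Document d1 = new Document();
        d1.add(new Field("content", "tom live guangzhou i live guangzhou",
                Field.Store.NO, Field.Index.ANALYZED));
        Document d2 = new Document();
        d2.add(new Field("content", "he live shanghai",
                Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(d1);   // doc 1 of the table (internal docID 0)
        writer.addDocument(d2);   // doc 2 of the table (internal docID 1)
        writer.close();

        IndexReader reader = IndexReader.open(dir);
        TermPositions tp = reader.termPositions(new Term("content", "live"));
        while (tp.next()) {
            // Positions print as 1,4 and 1 here, i.e. the table's 2,5 and 2 shifted to 0-based.
            System.out.print("doc=" + tp.doc() + " freq=" + tp.freq() + " positions=");
            for (int i = 0; i < tp.freq(); i++) {
                System.out.print(tp.nextPosition() + " ");
            }
            System.out.println();
        }
        tp.close();
        reader.close();
        dir.close();
    }
}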