1. 生产者
void addDocuments(Document[] docs) throws IOException { final IndexWriter writer = getIndexWriter(); DynamicPooledExecutor.Command commands[] = new DynamicPooledExecutor.Command[docs.length]; for (int i = 0; i < docs.length; i++) { // check if text extractor completed its work /*尤其是要注意这个方法,这个方法预示着什么,到底是什么呢?这个方法预示着一个document在进入这个方法之前已经触发了文本提取的操作,奇怪哦,其实不奇怪,需要文本提取的document是会二进宫的,这个由消费者逻辑来控制的,不过还是让我们先来看看生产者的逻辑吧。*/ final Document doc = getFinishedDocument(docs[i]); // create a command for inverting the document commands[i] = new DynamicPooledExecutor.Command() { public Object call() throws Exception { long time = System.currentTimeMillis(); writer.addDocument(doc); return new Long(System.currentTimeMillis() - time); } }; } }
Returns a document that is finished with text extraction and is ready to be added to the index
private Document getFinishedDocument(Document doc) throws IOException { /* Util.isDocumentReady(doc)方法非常之十分重要,如果一眼带过(新成语)我们就会错过精彩的细节,正是在这个方法中,我们的提取工作开始了,还记得上一篇文章中的TextExtractorReader#isExtractorFinished方法吗,这个方法会判断,如果开始就等100毫秒,等待返回,否则就返回false,那么返回的flase就是用在了下面的if方法中。代表还没有提取完成。如果没有提取完成,就进入了if 的代码块*/ if (!Util.isDocumentReady(doc)) { /*从这里可以看出,超过100毫秒,那么就创建另外一个document对象,然后把这个原始的document的值拷贝给这个新对象,需要注意的是如果field是LazyTextExtractorField 的话,那么就先把这个field置空*/ Document copy = new Document(); for (Iterator fields = doc.getFields().iterator(); fields.hasNext(); ) { Fieldable f = (Fieldable) fields.next(); Fieldable field = null; Field.TermVector tv = getTermVectorParameter(f); Field.Store stored = getStoreParameter(f); Field.Index indexed = getIndexParameter(f); if (f instanceof LazyTextExtractorField || f.readerValue() != null) { // replace all readers with empty string reader field = new Field(f.name(), new StringReader(""), tv); } else if (f.stringValue() != null) { field = new Field(f.name(), f.stringValue(), stored, indexed, tv); } else if (f.isBinary()) { field = new Field(f.name(), f.binaryValue(), stored); } if (field != null) { field.setOmitNorms(f.getOmitNorms()); copy.add(field); } } // schedule the original document for later indexing /*在这里,生产者终于把原始的document对象加入了indexingQueue队列。*/ Document existing = indexingQueue.addDocument(doc); if (existing != null) { /*如果之前这个nodeId在做索引的时候由于异常原因,jvm退出,那么在redolog和indexingqueuelog中都存在这个nodeid,那么在这个地方,可能就返回一个indexingqueue中已经存在的document了 */ // the queue already contained a pending document for this // node. -> dispose the document Util.disposeDocument(existing); } // use the stripped down copy for now doc = copy; } return doc; }
从上面的逻辑,我们可以看出,一旦一个二进制文本的提取超过100毫秒(默认值,可以通过<param name="extractorTimeout" value="100" />修改),生产者就先把去掉文本内容的document副本加入索引,同时把原始的document放入indexingQueue,等待消费者稍后重新索引。
2. 消费者
Public MultiIndex() { flushTask = new Timer(); flushTask.schedule(new TimerTask() { public void run() { // check if there are any indexing jobs finished /*英语注释写得还是比较清楚的,就是用来检查是否有提取的任务完成了,很显然这个timer背后的线程就是一个消费者,专门用来处理indexingQueue中的数据。接着,让我们到checkIndexingQueue的方法中走走*/ checkIndexingQueue(); // check if volatile index should be flushed checkFlush(); } }, 0, 1000); }
private synchronized void checkIndexingQueue() { /*找到所有提取完成的document的列表,那么如果提出还没有完成,咋办呢,不等待,直接返回new StringReader(""),这个逻辑在TextExtractorReader#isExtractorFinished*/ Document[] docs = indexingQueue.getFinishedDocuments(); Map finished = new HashMap(); for (int i = 0; i < docs.length; i++) { String uuid = docs[i].get(FieldNames.UUID); finished.put(UUID.fromString(uuid), docs[i]); } // now update index with the remaining ones if there are any if (!finished.isEmpty()) { log.debug("updating index with {} nodes from indexing queue.", new Long(finished.size())); // remove documents from the queue for (Iterator it = finished.keySet().iterator(); it.hasNext(); ) { try { indexingQueue.removeDocument(it.next().toString()); } catch (IOException e) { log.error("Failed to remove node from indexing queue", e); } } /*这里又是调用update方法,在前面的文章中,我们已经详细的分析过了update方法会执行哪些重要的操作,他们分别是deleteNode,addNode,flush*/ try { update(finished.keySet().iterator(), finished.values().iterator()); } catch (IOException e) { // update failed log.warn("Failed to update index with deferred text extraction", e); } } }