首先,为了更好地理解下面的代码,先看看相关的成员变量:
// Upper bound on how many leading bytes of a document sniffCharacterEncoding
// inspects when looking for a charset declaration.
private static final int CHUNK_SIZE = 2000;

// Matches a <meta ...> tag whose attribute text contains
// http-equiv=content-type (quotes optional, case-insensitive).
// Group 1 captures the attribute text of the tag.
// NOTE(review): this and charsetPattern look like constants; consider making
// them final if nothing else in the class reassigns them.
private static Pattern metaPattern = Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE);

// Matches "charset=" followed by optional whitespace and an encoding name that
// starts with a letter and continues with letters, digits, '_' or '-'
// (case-insensitive). Group 1 captures the encoding name.
private static Pattern charsetPattern = Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);

// Name of the HTML parser implementation; parse() compares it with "tagsoup".
private String parserImpl;
length 将扫描范围限定在内容的前 CHUNK_SIZE(2000)个字节内;先用 metaPattern 找到 http-equiv 为 content-type 的 meta 标签,再用 charsetPattern 从该标签中提取 charset,就能得到编码:
private static String sniffCharacterEncoding(byte[] content) { int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE; // We don't care about non-ASCII parts so that it's sufficient // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. String str = new String(content, 0, 0, length); Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; if (metaMatcher.find()) { Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } return encoding; }
提取base url
// Build the base URL from the content's base URL string. If that string is
// not a valid URL, stop here and return the empty parse obtained from a
// ParseStatus constructed with the exception.
URL base;
try {
  base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
  return new ParseStatus(e).getEmptyParse(getConf());
}
提取encoding:
//直接从content中的metadata中提取 byte[] contentInOctets = content.getContent(); InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets)); String contentType = content.getMetadata().get(Response.CONTENT_TYPE); String encoding = StringUtil.parseCharacterEncoding(contentType); if ((encoding != null) && !("".equals(encoding))) { metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding); if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) { metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding); if (LOG.isTraceEnabled()) { LOG.trace(base + ": setting encoding to " + encoding); } } } //如果从metadata中没有提取到,使用前面sniffCharacterEncoding从meta tag提取 // sniff out 'charset' value from the beginning of a document if ((encoding == null) || ("".equals(encoding))) { encoding = sniffCharacterEncoding(contentInOctets); if (encoding!=null) { metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding); if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) { metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding); if (LOG.isTraceEnabled()) { LOG.trace(base + ": setting encoding to " + encoding); } } } } //如果还没有提取到,使用默认的编码 if (encoding == null) { // fallback encoding. // FIXME : In addition to the global fallback value, // we should make it possible to specify fallback encodings for each ccTLD. // (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5 // doesn't work for jp because euc-jp and shift_jis have about the // same share) encoding = defaultCharEncoding; metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, defaultCharEncoding); if (LOG.isTraceEnabled()) { LOG.trace(base + ": falling back to " + defaultCharEncoding); } }
设置好编码方式,从content中提取DocumentFragment
// NOTE(review): the "try {" closed by the catch clauses below is not part of
// this excerpt; it opens somewhere before this point in the enclosing method.
  // Tell the input source which encoding to use, then parse it into root.
  input.setEncoding(encoding);
  if (LOG.isTraceEnabled()) {
    LOG.trace("Parsing...");
  }
  root = parse(input);
} catch (IOException e) {
  // Every failure path returns the empty parse obtained from a ParseStatus
  // constructed with the exception.
  return new ParseStatus(e).getEmptyParse(getConf());
} catch (DOMException e) {
  return new ParseStatus(e).getEmptyParse(getConf());
} catch (SAXException e) {
  return new ParseStatus(e).getEmptyParse(getConf());
} catch (Exception e) {
  // Catch-all: additionally print the stack trace to the stream returned by
  // LogUtil.getWarnStream(LOG) before returning.
  e.printStackTrace(LogUtil.getWarnStream(LOG));
  return new ParseStatus(e).getEmptyParse(getConf());
}
提取meta tag,并检查meta指令
// Process the meta tags of the parsed tree.
// NOTE(review): presumably getMetaTags fills metaTags from root — confirm
// against HTMLMetaProcessor.
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
if (LOG.isTraceEnabled()) {
  LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}
// check meta directives
if (!metaTags.getNoIndex()) {               // okay to index
  StringBuffer sb = new StringBuffer();
  if (LOG.isTraceEnabled()) {
    LOG.trace("Getting text...");
  }
  utils.getText(sb, root);                  // extract text
  text = sb.toString();
  // Empty the buffer so the same instance can be reused for the title.
  sb.setLength(0);
  if (LOG.isTraceEnabled()) {
    LOG.trace("Getting title...");
  }
  utils.getTitle(sb, root);                 // extract title
  title = sb.toString().trim();
}
提取出outlinks:
// Links are collected only when the meta tags do not set "no follow".
if (!metaTags.getNoFollow()) {              // okay to follow links
  ArrayList l = new ArrayList();            // extract outlinks
  URL baseTag = utils.getBase(root);
  if (LOG.isTraceEnabled()) {
    LOG.trace("Getting links...");
  }
  // Use the URL returned by utils.getBase(root) when there is one; otherwise
  // fall back to the content's base URL.
  utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
  outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
  if (LOG.isTraceEnabled()) {
    LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl());
  }
}
构建parse对象:
// Start from a SUCCESS status. When the meta tags request a refresh, mark it
// with the SUCCESS_REDIRECT minor code and use the refresh target as the
// status message.
ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
if (metaTags.getRefresh()) {
  status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
  status.setMessage(metaTags.getRefreshHref().toString());
}

// Bundle status, title, outlinks, the content's own metadata and the parse
// metadata gathered so far, then pair that with the extracted text.
ParseData parseData = new ParseData(status, title, outlinks,
                                    content.getMetadata(), metadata);
parseData.setConf(this.conf);
Parse parse = new ParseImpl(text, parseData);

// run filters on parse
parse = this.htmlParseFilters.filter(content, parse, metaTags, root);
if (metaTags.getNoCache()) {             // not okay to cache
  // Record the caching policy in the parse metadata of the filtered parse.
  parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
}
下面这个方法根据 parserImpl 字段选择解析器:值为 "tagsoup"(不区分大小写)时使用 TagSoup,否则使用 NekoHTML,把 content 解析成 DocumentFragment 对象:
/**
 * Parses the given input into a DOM fragment using the parser implementation
 * selected by {@code parserImpl}.
 *
 * <p>{@code parseTagSoup} is used when {@code parserImpl} equals
 * {@code "tagsoup"} ignoring case; every other value, including {@code null},
 * selects {@code parseNeko}.
 *
 * @param input the source to parse
 * @return the fragment returned by the selected parser
 * @throws Exception whatever the selected parser throws
 */
private DocumentFragment parse(InputSource input) throws Exception {
  // Constant-first comparison: a null parserImpl no longer raises a
  // NullPointerException but falls through to Neko like any other
  // non-"tagsoup" value.
  if ("tagsoup".equalsIgnoreCase(parserImpl)) {
    return parseTagSoup(input);
  }
  return parseNeko(input);
}