Nutch学习笔记13---以某网站为例写解析插件

编写自己的HtmlParseFilter---sohu

进入到文件夹 $NUTCH_HOME/src/plugin

mkdir    htmlparsefilter-sohu

按照下面的结构建立目录和文件

htmlparsefilter-sohu/

  plugin.xml

  build.xml

  ivy.xml

  src/

    java/

      org/

        apache/

          nutch/

            parse/

              HtmlParseFilterSohu.java

修改plugin.xml

<?xml version="1.0" encoding="UTF-8"?>

<plugin id="htmlparsefilter-sohu" name="Add Sohu Field to Doc"

    version="1.0.0"   provider-name="nutch.org">

 

   <runtime>

     <library name="htmlparsefilter-sohu.jar">

       <export name="*"/>

     </library>

   </runtime>

   <requires>

   </requires>

   <extension id="org.apache.nutch.parse.parse_sohu"

       name="Add sohu Field to doc"

       point="org.apache.nutch.parse.HtmlParseFilter">

     <implementation id="HtmlParseFilterSohu"

         class="org.apache.nutch.parse.HtmlParseFilterSohu"/>

   </extension>

</plugin>

关于ivy.xml

从 plugin/index-basic 下面复制对应的 ivy.xml 到 htmlparsefilter-sohu 目录，不需要做任何改动即可。

关于build.xml

修改成以下内容

<?xml version="1.0" encoding="UTF-8"?>

<project name="htmlparsefilter-sohu" default="jar">

  <import file="../build-plugin.xml"/>

</project>

编写 HtmlParseFilterSohu.java（具体解析逻辑以实际需求为准，完整代码见文末）

修改src/plugin/build.xml

找到

<!-- ====================================================== -->

  <!-- Build & deploy all the plugin jars.                    -->

  <!-- ====================================================== -->

在下面添加一行

<ant dir="htmlparsefilter-sohu" target="deploy"/>

修改nutch-site.xml

我的运行在local模式下,则修改配置文件local/conf/nutch-site.xml如下

从 nutch-default.xml 中复制 plugin.includes 的配置块

<property>

  <name>plugin.includes</name>

  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>

  <description> </description>

</property>

粘贴到 nutch-site.xml 中

然后修改复制后的内容，在 value 的末尾追加 |htmlparsefilter-sohu

<property>

  <name>plugin.includes</name>

<value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)|htmlparsefilter-sohu</value>

 <description> </description>

</property>

就可以了。

最后一步-修改$NUTCH_HOME/conf/schema.xml 

<fields>...</fields>段内添加

<field name="pageLength" type="long" stored="true" indexed="true"/>

最后重新执行 ant 进行编译，大功告成。

 

java文件如下:

package org.apache.nutch.parse;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;

/**
 * Nutch {@link HtmlParseFilter} that pulls video metadata out of a parsed HTML
 * page and appends it to the page's parse metadata under the
 * {@code NUTCH_VIDEO_*} keys.
 *
 * <p>
 * Only pages whose URL looks like
 * {@code ....sohu.com/<8 digits>/<letters or digits>.shtml} are processed
 * (see {@link #isValidUrl(String)}); every other page is returned untouched.
 *
 * <p>
 * The filter keeps no per-page state in instance fields: the metadata object
 * being filled is passed down the call chain, so overlapping calls to
 * {@link #filter} on one instance cannot overwrite each other's target.
 *
 * <p>
 * NOTE: while extracting, the filter modifies the DOM it is given (it removes
 * processed actor links, the {@code class} attribute of the area item and the
 * {@code <em>} child of the intro paragraph). Filters that run afterwards on
 * the same fragment see those changes.
 */
public class HtmlParseFilterSohu implements HtmlParseFilter {

	private static final Log LOG = LogFactory.getLog(HtmlParseFilterSohu.class);
	private Configuration conf;

	private static final String A = "a";
	private static final String ACTOR = "actor";
	private static final String ALBUM = "album";
	private static final String AREABOX = "areabox";
	private static final String CATEGORY = "category";
	private static final String CLASS = "class";
	private static final String CONTENT = "content";
	private static final String CONTENT_LOCATION = "contentLocation";
	private static final String CRUMBS = "crumbs";
	private static final String CRUMBSBAR = "crumbsBar";
	private static final String DATA_SUBSCRIBE_CATEGORYNAME = "data-subscribe-categoryname";
	private static final String DATE_PUBLISHED = "datePublished";
	private static final String DESCRIPTION = "description";
	private static final String DIRECTOR = "director";
	private static final String DIV = "div";
	private static final String DURATION = "duration";
	private static final String EM = "em";
	private static final String FULLDESCRIPTION = "full_desc";
	private static final String GENRE = "genre";
	private static final String H = "h";
	private static final String H2 = "h2";
	private static final String ID = "id";
	private static final String INTRO = "intro";
	private static final String INFO_INFO_CON = "info info-con";
	private static final String IRALBUMNAME = "irAlbumName";
	private static final String IRCATEGORY = "irCategory";
	private static final String IRTITLE = "irTitle";
	private static final String ITEM = "item";
	private static final String ITEMPROP = "itemprop";
	private static final String KEYWORDS = "keywords";
	private static final String LABEL = "label";
	private static final String LI = "li";
	private static final String LINK = "link";
	private static final String MAINACTOR = "mainactor";
	private static final String NAME = "name";
	private static final String OG_IMAGE = "og:image";
	private static final String P = "p";
	private static final String PEOS_INFO = "peos-info";
	private static final String PROPERTY = "property";
	private static final String PUB = "pub";
	private static final String S_H = "s h";
	private static final String SCRIPT = "script";
	private static final String SERIES = "series";
	private static final String SPAN = "span";
	private static final String STYLE = "style";
	private static final String THUMB_NAIL_URL = "thumbnailUrl";
	private static final String TITLE = "title";
	private static final String TYPE = "type";
	private static final String MVINFO = "mvInfo"; // just a mv
	private static final String META = "meta";
	private static final String UL = "ul";

	// ---- URL recognition
	private static final String URL_DOMAIN = ".sohu.com/";
	private static final String URL_SUFFIX = ".shtml";
	private static final String URL_SLASH = "/";

	/** Length of the numeric path segment that follows the domain. */
	private static final int URL_DATE_SEGMENT_LENGTH = 8;

	/**
	 * Runs of whitespace, slashes and parentheses in the title; compiled once
	 * instead of on every page.
	 */
	private static final Pattern TITLE_SEPARATORS = Pattern
			.compile("[\\s\\t/()]+");

	// ---- keys written into the parse metadata
	public static final String NUTCH_VIDEO_FULL_DESCRIPTION = "desc";
	public static final String NUTCH_VIDEO_SERIES = "series";
	public static final String NUTCH_VIDEO_YEAR = "year";
	public static final String NUTCH_VIDEO_AREA = "area";
	public static final String NUTCH_VIDEO_TYPE = "type";
	public static final String NUTCH_VIDEO_DIRECTOR = "director";
	public static final String NUTCH_VIDEO_ACTOR = "actor";
	public static final String NUTCH_VIDEO_TITLE = "title";
	public static final String NUTCH_VIDEO_CHANNEL = "channel";
	public static final String NUTCH_VIDEO_KEYWORD = "keyword";
	public static final String NUTCH_VIDEO_URL = "url";
	public static final String NUTCH_VIDEO_PICTURE = "picture";
	public static final String NUTCH_VIDEO_WEBSITE = "website";
	public static final String NUTCH_VIDEO_TIMESPAN = "timespan";

	// ------------------------------------------------------------------
	// URL validation
	// ------------------------------------------------------------------

	/**
	 * Tells whether every character of {@code segment} is an ASCII digit.
	 *
	 * @param segment
	 *            text to check, must not be null
	 * @return true when all characters are in '0'..'9'; an empty string also
	 *         yields true
	 */
	private static boolean isNumber(String segment) {
		for (int i = 0; i < segment.length(); i++) {
			char c = segment.charAt(i);
			if (c < '0' || c > '9') {
				return false;
			}
		}
		return true;
	}

	/**
	 * Tells whether every character of {@code segment} is an ASCII letter or
	 * digit.
	 *
	 * @param segment
	 *            text to check, must not be null
	 * @return true when all characters are in a-z, A-Z or 0-9; an empty string
	 *         also yields true
	 */
	private static boolean isValidStr(String segment) {
		for (int i = 0; i < segment.length(); i++) {
			char c = segment.charAt(i);
			boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
			boolean digit = c >= '0' && c <= '9';
			if (!letter && !digit) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Checks that {@code url} contains
	 * {@code .sohu.com/<8 digits>/<letters or digits>.shtml}.
	 *
	 * @param url
	 *            URL to check, may be null
	 * @return true when the URL has the expected shape, false otherwise
	 *         (including for null)
	 */
	private boolean isValidUrl(String url) {
		if (null == url) {
			return false;
		}

		int domainIndex = url.indexOf(URL_DOMAIN);
		if (-1 == domainIndex) {
			return false;
		}
		int dateStart = domainIndex + URL_DOMAIN.length();

		// first path segment: exactly eight digits
		int dateEnd = url.indexOf(URL_SLASH, dateStart);
		if (-1 == dateEnd) {
			return false;
		}
		if (URL_DATE_SEGMENT_LENGTH != dateEnd - dateStart) {
			return false;
		}
		if (!isNumber(url.substring(dateStart, dateEnd))) {
			return false;
		}

		// second path segment: letters/digits up to the ".shtml" suffix
		int nameStart = dateEnd + URL_SLASH.length();
		int nameEnd = url.indexOf(URL_SUFFIX, nameStart);
		if (-1 == nameEnd) {
			return false;
		}
		return isValidStr(url.substring(nameStart, nameEnd));
	}

	// ------------------------------------------------------------------
	// Parse metadata access
	// ------------------------------------------------------------------

	/**
	 * Looks up the parse metadata stored in {@code parseResult} for the URL of
	 * {@code content}.
	 *
	 * @param content
	 *            fetched content, must not be null
	 * @param parseResult
	 *            parse result to search, must not be null
	 * @return the metadata object, or null when the URL, its parse entry, the
	 *         parse data or the metadata is missing
	 */
	private static Metadata resolveParseMeta(Content content,
			ParseResult parseResult) {
		String url = content.getUrl();
		if (null == url) {
			return null;
		}
		// the result may hold no entry for this URL, so check before use
		Parse parse = parseResult.get(url);
		if (null == parse) {
			return null;
		}
		ParseData parseData = parse.getData();
		if (null == parseData) {
			return null;
		}
		return parseData.getParseMeta();
	}

	/**
	 * Appends {@code value} to the entry stored under {@code key}. An existing
	 * value is kept and the new one is added after a single space, which is
	 * how repeated blocks (for example several actors) end up in one field.
	 *
	 * @param metadata
	 *            target metadata; nothing happens when null
	 * @param key
	 *            metadata key
	 * @param value
	 *            value to append; nothing happens when null
	 */
	private void appendKeyValue(Metadata metadata, String key, String value) {
		if (null == metadata || null == value) {
			return;
		}
		String oldValue = metadata.get(key);
		metadata.set(key, null == oldValue ? value : oldValue + " " + value);
		LOG.info("[" + key + "]   [" + metadata.get(key) + "]");
	}

	/**
	 * Same as {@link #appendKeyValue} but ignores null and empty values.
	 */
	private void appendNonEmpty(Metadata metadata, String key, String value) {
		if (null != value && value.length() > 0) {
			appendKeyValue(metadata, key, value);
		}
	}

	// ------------------------------------------------------------------
	// Element predicates
	// ------------------------------------------------------------------

	/**
	 * Compares the element's tag name with {@code tagName}, ignoring case.
	 * {@code equalsIgnoreCase} is used instead of {@code toLowerCase()} so the
	 * comparison does not depend on the default locale.
	 */
	private static boolean hasTag(Element element, String tagName) {
		String tag = element.getTagName();
		return null != tag && tag.trim().equalsIgnoreCase(tagName);
	}

	/** Tells whether the named attribute has exactly the expected value. */
	private static boolean hasAttributeValue(Element element,
			String attributeName, String attributeValue) {
		String value = element.getAttribute(attributeName);
		return null != value && value.equals(attributeValue);
	}

	/** Tells whether the element carries exactly {@code n} attributes. */
	private static boolean hasAttributeCount(Element element, int n) {
		NamedNodeMap attributes = element.getAttributes();
		return null != attributes && attributes.getLength() == n;
	}

	// ------------------------------------------------------------------
	// Direct-child lookups (the parent itself is never a candidate)
	// ------------------------------------------------------------------

	/**
	 * Returns the first direct child element of {@code parent} with the given
	 * tag name (case-insensitive), or null when there is none or
	 * {@code parent} is null.
	 */
	private Element getFirstChildNodeWithTagName(Element parent, String tagName) {
		if (null == parent) {
			return null;
		}
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node child = children.item(i);
			if (child instanceof Element && hasTag((Element) child, tagName)) {
				return (Element) child;
			}
		}
		return null;
	}

	/**
	 * Returns the first direct child element of {@code parent} that has the
	 * given tag name (case-insensitive) and exactly {@code n} attributes, or
	 * null when there is none or {@code parent} is null.
	 */
	private Element getFirstChildNodeWithTagNameFixedAttributeLength(
			Element parent, String tagName, int n) {
		if (null == parent) {
			return null;
		}
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node child = children.item(i);
			if (!(child instanceof Element)) {
				continue;
			}
			Element candidate = (Element) child;
			if (hasTag(candidate, tagName) && hasAttributeCount(candidate, n)) {
				return candidate;
			}
		}
		return null;
	}

	/**
	 * Returns the first direct child element of {@code parent} whose attribute
	 * {@code attributeName}, after trimming, equals {@code attributeValue}, or
	 * null when there is none or {@code parent} is null.
	 */
	private Element getFirstChildNodeWithAttribute(Element parent,
			String attributeName, String attributeValue) {
		if (null == parent) {
			return null;
		}
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node child = children.item(i);
			if (!(child instanceof Element)) {
				continue;
			}
			Element candidate = (Element) child;
			String value = candidate.getAttribute(attributeName);
			if (null != value && value.trim().equals(attributeValue)) {
				return candidate;
			}
		}
		return null;
	}

	// ------------------------------------------------------------------
	// Depth-first lookups (the start node itself is a candidate)
	// ------------------------------------------------------------------

	/**
	 * Depth-first search, starting at {@code parent} itself, for the first
	 * element whose attribute {@code attributeName} equals
	 * {@code attributeValue}.
	 *
	 * @return the matching element, or null when nothing matches or
	 *         {@code parent} is null
	 */
	private Element getFirstDescendantWithAttribute(Node parent,
			String attributeName, String attributeValue) {
		if (null == parent) {
			return null;
		}
		if (parent instanceof Element
				&& hasAttributeValue((Element) parent, attributeName,
						attributeValue)) {
			return (Element) parent;
		}
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Element found = getFirstDescendantWithAttribute(children.item(i),
					attributeName, attributeValue);
			if (null != found) {
				return found;
			}
		}
		return null;
	}

	/**
	 * Depth-first search, starting at {@code parent} itself, for the first
	 * element with the given tag name (case-insensitive).
	 *
	 * @return the matching element, or null when nothing matches or
	 *         {@code parent} is null
	 */
	private Element getFirstDescendantWithTag(Node parent, String tagName) {
		if (null == parent) {
			return null;
		}
		if (parent instanceof Element && hasTag((Element) parent, tagName)) {
			return (Element) parent;
		}
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Element found = getFirstDescendantWithTag(children.item(i), tagName);
			if (null != found) {
				return found;
			}
		}
		return null;
	}

	/**
	 * Depth-first search, starting at {@code parent} itself, for the first
	 * element that has the given tag name (case-insensitive) and whose
	 * attribute {@code attributeName} equals {@code attributeValue}.
	 *
	 * @return the matching element, or null when nothing matches or
	 *         {@code parent} is null
	 */
	private Element getFirstDescendantWithTagPlusAttribute(Node parent,
			String tagName, String attributeName, String attributeValue) {
		if (null == parent) {
			return null;
		}
		if (parent instanceof Element) {
			Element element = (Element) parent;
			if (hasTag(element, tagName)
					&& hasAttributeValue(element, attributeName,
							attributeValue)) {
				return element;
			}
		}
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Element found = getFirstDescendantWithTagPlusAttribute(
					children.item(i), tagName, attributeName, attributeValue);
			if (null != found) {
				return found;
			}
		}
		return null;
	}

	/**
	 * Like {@link #getFirstDescendantWithTagPlusAttribute} but the element
	 * must additionally carry exactly {@code n} attributes. The attribute
	 * count is enforced at every depth of the search.
	 *
	 * @return the matching element, or null when nothing matches or
	 *         {@code parent} is null
	 */
	private Element getFirstDescendantWithTagPlusAttributeFixedAttribute(
			Node parent, String tagName, String attributeName,
			String attributeValue, int n) {
		if (null == parent) {
			return null;
		}
		if (parent instanceof Element) {
			Element element = (Element) parent;
			if (hasTag(element, tagName)
					&& hasAttributeValue(element, attributeName,
							attributeValue)
					&& hasAttributeCount(element, n)) {
				return element;
			}
		}
		NodeList children = parent.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			// recurse into this same method so that n keeps being checked
			Element found = getFirstDescendantWithTagPlusAttributeFixedAttribute(
					children.item(i), tagName, attributeName, attributeValue,
					n);
			if (null != found) {
				return found;
			}
		}
		return null;
	}

	/**
	 * Returns the text content of the first direct {@code <a>} child of
	 * {@code item}, or null when {@code item} is null or has no such child.
	 */
	private String firstLinkText(Element item) {
		Element link = getFirstChildNodeWithTagName(item, A);
		return null == link ? null : link.getTextContent();
	}

	// ------------------------------------------------------------------
	// DOM traversal
	// ------------------------------------------------------------------

	/**
	 * Visits {@code node} and, where applicable, its subtree, appending every
	 * recognised value to {@code metadata}. Text and comment nodes are
	 * ignored; node types other than fragment, element, text and comment are
	 * only logged.
	 */
	private void walk(Node node, Metadata metadata) {
		if (null == node) {
			return;
		}
		switch (node.getNodeType()) {
		case Node.DOCUMENT_FRAGMENT_NODE:
			walkChildren(node, metadata);
			return;
		case Node.ELEMENT_NODE:
			walkElement((Element) node, metadata);
			return;
		case Node.TEXT_NODE:
		case Node.COMMENT_NODE:
			return;
		default:
			LOG.info("xxx-type-not-parsed------" + node.getNodeName());
			return;
		}
	}

	/** Calls {@link #walk} on every child of {@code node}, in order. */
	private void walkChildren(Node node, Metadata metadata) {
		NodeList children = node.getChildNodes();
		for (int i = 0; children != null && i < children.getLength(); i++) {
			walk(children.item(i), metadata);
		}
	}

	/**
	 * Handles one element: skips script/style/link subtrees, extracts values
	 * from meta tags and from the two recognised div blocks, and otherwise
	 * descends into the children. The two recognised div blocks are not
	 * descended into after extraction.
	 */
	private void walkElement(Element element, Metadata metadata) {
		String tag = element.getTagName();
		if (null == tag) {
			return;
		}
		if (SCRIPT.equalsIgnoreCase(tag) || STYLE.equalsIgnoreCase(tag)
				|| LINK.equalsIgnoreCase(tag)) {
			return;
		}

		if (META.equalsIgnoreCase(tag)) {
			extractMeta(element, metadata);
		} else if (DIV.equalsIgnoreCase(tag)) {
			if (CRUMBSBAR.equals(element.getAttribute(ID))) {
				extractTitle(element, metadata);
				return;
			}
			if (INFO_INFO_CON.equals(element.getAttribute(CLASS))) {
				extractVideoInfo(element, metadata);
				return;
			}
		}

		walkChildren(element, metadata);
	}

	/**
	 * Reads series, channel and picture from a {@code <meta>} element:
	 * {@code name="album"} and {@code name="category"} feed the series and
	 * channel keys, {@code property="og:image"} feeds the picture key. The
	 * value is always taken from the {@code content} attribute.
	 */
	private void extractMeta(Element meta, Metadata metadata) {
		String content = meta.getAttribute(CONTENT);

		String name = meta.getAttribute(NAME);
		if (ALBUM.equals(name)) {
			appendNonEmpty(metadata, NUTCH_VIDEO_SERIES, content);
		} else if (CATEGORY.equals(name)) {
			appendNonEmpty(metadata, NUTCH_VIDEO_CHANNEL, content);
		}

		if (OG_IMAGE.equals(meta.getAttribute(PROPERTY))) {
			appendNonEmpty(metadata, NUTCH_VIDEO_PICTURE, content);
		}
	}

	/**
	 * Reads the title from the first {@code <h2>} inside the
	 * {@code id="crumbsBar"} div. Runs of whitespace, slashes and parentheses
	 * are collapsed to one space and the result is trimmed.
	 */
	private void extractTitle(Element crumbsBar, Metadata metadata) {
		Element heading = getFirstDescendantWithTag(crumbsBar, H2);
		if (null == heading) {
			return;
		}
		String text = heading.getTextContent();
		if (null == text || text.length() == 0) {
			return;
		}
		text = TITLE_SEPARATORS.matcher(text).replaceAll(" ").trim();
		appendNonEmpty(metadata, NUTCH_VIDEO_TITLE, text);
	}

	/**
	 * Reads actors, year, area, type and description from the
	 * {@code class="info info-con"} div. The lookups run in a fixed order
	 * because some of them modify the DOM in a way later lookups rely on.
	 */
	private void extractVideoInfo(Element info, Metadata metadata) {
		// actors: every direct <a> of <li id="mainactor">
		Element mainactor = getFirstDescendantWithTagPlusAttribute(info, LI,
				ID, MAINACTOR);
		if (null != mainactor) {
			Element actor = getFirstChildNodeWithTagName(mainactor, A);
			while (null != actor) {
				appendNonEmpty(metadata, NUTCH_VIDEO_ACTOR,
						actor.getTextContent());
				// removing the processed link is what makes the next lookup
				// return the following actor
				mainactor.removeChild(actor);
				actor = getFirstChildNodeWithTagName(mainactor, A);
			}
		}

		// year: first link of <li class="h">, accepted only as four digits
		String year = firstLinkText(getFirstDescendantWithTagPlusAttribute(
				info, LI, CLASS, H));
		if (null != year) {
			year = year.trim();
			if (year.length() == 4 && isNumber(year)) {
				appendKeyValue(metadata, NUTCH_VIDEO_YEAR, year);
			}
		}

		// area: first link of <li id="areabox">
		Element areaItem = getFirstDescendantWithTagPlusAttribute(info, LI,
				ID, AREABOX);
		if (null != areaItem) {
			appendNonEmpty(metadata, NUTCH_VIDEO_AREA, firstLinkText(areaItem));
			// NOTE(review): presumably the class is dropped so that the
			// class-based lookup for the type item below cannot match this
			// item - confirm against the page markup
			if (null != areaItem.getAttribute(CLASS)) {
				areaItem.removeAttribute(CLASS);
			}
		}

		// type: first link of <li class="s h">
		appendNonEmpty(metadata, NUTCH_VIDEO_TYPE,
				firstLinkText(getFirstDescendantWithTagPlusAttribute(info, LI,
						CLASS, S_H)));

		// description: text of <p class="intro"> without its <em> child
		Element intro = getFirstDescendantWithTagPlusAttribute(info, P, CLASS,
				INTRO);
		if (null != intro) {
			Element em = getFirstChildNodeWithTagName(intro, EM);
			if (null != em) {
				intro.removeChild(em);
			}
			appendNonEmpty(metadata, NUTCH_VIDEO_FULL_DESCRIPTION,
					intro.getTextContent());
		}
	}

	// ------------------------------------------------------------------
	// HtmlParseFilter
	// ------------------------------------------------------------------

	/**
	 * Extracts video metadata from {@code doc} into the parse metadata of
	 * {@code parseResult} when the page URL passes
	 * {@link #isValidUrl(String)}.
	 *
	 * @param content
	 *            fetched content; supplies the page URL
	 * @param parseResult
	 *            parse result whose metadata for that URL is extended
	 * @param metaTags
	 *            HTML meta tags; only checked for null
	 * @param doc
	 *            parsed DOM of the page; modified during extraction
	 * @return the {@code parseResult} instance that was passed in, also when
	 *         any argument is null, the URL is not recognised or no parse
	 *         metadata exists for the URL
	 */
	public ParseResult filter(Content content, ParseResult parseResult,
			HTMLMetaTags metaTags, DocumentFragment doc) {
		LOG.info("begin**********HtmlParseFilterSohu************by Sohu");
		if (null == content || null == parseResult || null == metaTags
				|| null == doc) {
			LOG.info("content|parseResult|metaTags|doc is null,so just return parseResult...");
			return parseResult;
		}
		LOG.info("four params checked ok,handle next......");

		String url = content.getUrl();
		LOG.info("currenturl is ------" + url);
		if (!isValidUrl(url)) {
			LOG.info("invalid url,just return raw parseResult...");
			return parseResult;
		}
		LOG.info("video[source]---valid url,sohu...");

		// resolve the target once; without it nothing could be stored anyway
		Metadata metadata = resolveParseMeta(content, parseResult);
		if (null == metadata) {
			LOG.info("no parse metadata for url,just return raw parseResult...");
			return parseResult;
		}

		appendKeyValue(metadata, NUTCH_VIDEO_URL, url);
		walk(doc, metadata);

		LOG.info("end**************************************end");
		return parseResult;
	}

	/** Returns the configuration last passed to {@link #setConf}. */
	public Configuration getConf() {
		return conf;
	}

	/** Stores the configuration handed over by the plugin framework. */
	public void setConf(Configuration conf) {
		this.conf = conf;
	}

}

你可能感兴趣的:(Nutch,sohu)