1 进入到文件夹 $NUTCH_HOME/src/plugin
mkdir htmlparsefilter-sohu
2 按照下面的结构建立目录和文件
htmlparsefilter-sohu/
    plugin.xml
    build.xml
    ivy.xml
    src/
        java/
            org/
                apache/
                    nutch/
                        parse/
                            HtmlParseFilterSohu.java
3 修改plugin.xml
<?xml version="1.0" encoding="UTF-8"?>
<plugin id="htmlparsefilter-sohu" name="Add Sohu Field to Doc"
version="1.0.0" provider-name="nutch.org">
<runtime>
<library name="htmlparsefilter-sohu.jar">
<export name="*"/>
</library>
</runtime>
<requires>
</requires>
<extension id="org.apache.nutch.parse.parse_sohu"
name="Add sohu Field to doc"
point="org.apache.nutch.parse.HtmlParseFilter">
<implementation id="HtmlParseFilterSohu"
class="org.apache.nutch.parse.HtmlParseFilterSohu"/>
</extension>
</plugin>
4 关于ivy.xml
从plugin/index-basic下面复制对应的ivy.xml,不需要任何改变即可。
5 关于build.xml
修改成以下内容
<?xml version="1.0" encoding="UTF-8"?>
<project name="htmlparsefilter-sohu" default="jar">
<import file="../build-plugin.xml"/>
</project>
6 修改 HtmlParseFilterSohu.java(以实际需求为准,完整代码见文末)
7 修改src/plugin/build.xml
找到
<!-- ====================================================== -->
<!-- Build & deploy all the plugin jars. -->
<!-- ====================================================== -->
在下面添加一行
<ant dir="htmlparsefilter-sohu" target="deploy"/>
8 修改nutch-site.xml
我的Nutch运行在local模式下,因此需要修改配置文件local/conf/nutch-site.xml,步骤如下:
从nutch-default.xml中复制plugin.includes的配置块
<property>
<name>plugin.includes</name>
<value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
<description> </description>
</property>
到nutch-site.xml中
然后修改复制后的内容
<property>
<name>plugin.includes</name>
<value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)|htmlparsefilter-sohu</value>
<description> </description>
</property>
就可以了。
9 最后一步-修改$NUTCH_HOME/conf/schema.xml
在<fields>...</fields>段内添加
<field name="pageLength" type="long" stored="true" indexed="true"/>
10 重新执行ant编译,大功告成。
java文件如下:
package org.apache.nutch.parse; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.protocol.Content; import org.w3c.dom.Comment; import org.w3c.dom.Document; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.Text; public class HtmlParseFilterSohu implements HtmlParseFilter { private static final Log LOG = LogFactory.getLog(HtmlParseFilterSohu.class); private Configuration conf; private ParseResult _parseResult; private Content _content; private static final String A = "a"; private static final String ACTOR = "actor"; private static final String ALBUM = "album"; private static final String AREABOX = "areabox"; private static final String CATEGORY = "category"; private static final String CLASS = "class"; private static final String CONTENT = "content"; private static final String CONTENT_LOCATION = "contentLocation"; private static final String CRUMBS = "crumbs"; private static final String CRUMBSBAR = "crumbsBar"; private static final String DATA_SUBSCRIBE_CATEGORYNAME = "data-subscribe-categoryname"; private static final String DATE_PUBLISHED = "datePublished"; private static final String DESCRIPTION = "description"; private static final String DIRECTOR = "director"; private static final String DIV = "div"; private static final String DURATION = "duration"; private static final String FULLDESCRIPTION = "full_desc"; private static final String GENRE = "genre"; private static final String H = "h"; private static final String H2 = "h2"; private static final String ID = "id"; private static final String INTRO = "intro"; private static final String INFO_INFO_CON = "info info-con"; private static final String IRALBUMNAME = "irAlbumName"; private 
static final String IRCATEGORY = "irCategory"; private static final String IRTITLE = "irTitle"; private static final String ITEM = "item"; private static final String ITEMPROP = "itemprop"; private static final String KEYWORDS = "keywords"; private static final String LABEL = "label"; private static final String LI = "li"; private static final String LINK = "link"; private static final String MAINACTOR = "mainactor"; private static final String NAME = "name"; private static final String OG_IMAGE = "og:image"; private static final String P = "p"; private static final String PEOS_INFO = "peos-info"; private static final String PROPERTY = "property"; private static final String PUB = "pub"; private static final String S_H = "s h"; private static final String SCRIPT = "script"; private static final String SERIES = "series"; private static final String SPAN = "span"; private static final String STYLE = "style"; private static final String THUMB_NAIL_URL = "thumbnailUrl"; private static final String TITLE = "title"; private static final String TYPE = "type"; private static final String MVINFO = "mvInfo"; // just a mv private static final String META = "meta"; private static final String UL = "ul"; // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~url about private static final String URL_DOMAIN = ".sohu.com/"; private static final String URL_SUFFIX = ".shtml"; private static final String URL_SLASH = "/"; // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~parse_data public static final String NUTCH_VIDEO_FULL_DESCRIPTION = "desc"; public static final String NUTCH_VIDEO_SERIES = "series"; public static final String NUTCH_VIDEO_YEAR = "year"; public static final String NUTCH_VIDEO_AREA = "area"; public static final String NUTCH_VIDEO_TYPE = "type"; public static final String NUTCH_VIDEO_DIRECTOR = "director"; public static final String NUTCH_VIDEO_ACTOR = "actor"; public static final String NUTCH_VIDEO_TITLE = "title"; public static final String NUTCH_VIDEO_CHANNEL = "channel"; public static final 
String NUTCH_VIDEO_KEYWORD = "keyword"; public static final String NUTCH_VIDEO_URL = "url"; public static final String NUTCH_VIDEO_PICTURE = "picture"; public static final String NUTCH_VIDEO_WEBSITE = "website"; public static final String NUTCH_VIDEO_TIMESPAN = "timespan"; private boolean isNumber(String segment) { int length = segment.length(); int index = 0; for (index = 0; index < length; index++) { char c = segment.charAt(index); if (c >= '0' && c <= '9') { } else { break; } } if (index >= length) { return true; } else { return false; } } private boolean isValidStr(String segment) { int length = segment.length(); int index = 0; for (; index < length; index++) { char c = segment.charAt(index); if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { } else { break; } } if (index >= length) { return true; } else { return false; } } private boolean isValidUrl(String url) { if (null == url) { return false; } int firstIndex, secondIndex, thirdIndex; firstIndex = url.indexOf(URL_DOMAIN); if (-1 == firstIndex) { return false; } firstIndex += URL_DOMAIN.length(); secondIndex = url.indexOf(URL_SLASH, firstIndex); if (-1 == secondIndex) { return false; } int length = secondIndex - firstIndex; if (8 != length) { return false; } String str = url.substring(firstIndex, secondIndex); if (false == this.isNumber(str)) { return false; } secondIndex += URL_SLASH.length(); thirdIndex = url.indexOf(URL_SUFFIX, secondIndex); if (-1 == thirdIndex) { return false; } str = url.substring(secondIndex, thirdIndex); if (false == this.isValidStr(str)) { return false; } return true; } private String getKeyValue(String key) { if (null == _parseResult || null == _content || null == _content.getUrl()) { return null; } ParseData parseData = _parseResult.get(_content.getUrl()).getData(); if (null == parseData) { return null; } Metadata metadata = parseData.getParseMeta(); if (null == metadata) { return null; } return metadata.get(key); } /* * private void 
setKeyValue(String key, String value) { if (null == * _parseResult || null == _content || null == _content.getUrl()) { return; * } * * ParseData parseData = _parseResult.get(_content.getUrl()).getData(); if * (null == parseData) { return; } Metadata metadata = * parseData.getParseMeta(); if (null == metadata) { return; } * metadata.set(key, value); } */ private void appendKeyValue(String key, String value) { // used when exist more block... if (null == _parseResult || null == _content || null == _content.getUrl()) { return; } ParseData parseData = _parseResult.get(_content.getUrl()).getData(); if (null == parseData) { return; } Metadata metadata = parseData.getParseMeta(); if (null == metadata) { return; } String oldValue = metadata.get(key); if (null == oldValue) { metadata.set(key, value); } else { metadata.set(key, oldValue + " " + value); } LOG.info("[" + key + "] [" + metadata.get(key) + "]"); } private String replaceWithRegex(String str, String regEx, String replace) { Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(str); str = m.replaceAll(replace).trim(); return str; } private Element getFirstChildNodeWithTagName(Element parent, String tagName) { Element result = null; if (null == parent) { return null; } // not include itself... NodeList nodeList = parent.getChildNodes(); int index = 0; for (index = 0; index < nodeList.getLength(); index++) { Node child = nodeList.item(index); if (child instanceof Element) { Element childElement = (Element) child; String childTagName = childElement.getTagName(); if (null != childTagName) { childTagName = childTagName.toLowerCase().trim(); if (childTagName.equals(tagName)) { return childElement; } } } } return null; } private Element getFirstChildNodeWithTagNameFixedAttributeLength( Element parent, String tagName, int n) { Element result = null; if (null == parent) { return null; } // not include itself... 
NodeList nodeList = parent.getChildNodes(); int index = 0; for (index = 0; index < nodeList.getLength(); index++) { Node child = nodeList.item(index); if (child instanceof Element) { Element childElement = (Element) child; String childTagName = childElement.getTagName(); if (null != childTagName) { childTagName = childTagName.toLowerCase().trim(); if (childTagName.equals(tagName)) { if (childElement.getAttributes().getLength() == n) return childElement; } } } } return null; } private Element getFirstChildNodeWithAttribute(Element parent, String attributeName, String attributeValue) { Element result = null; if (null == parent) { return null; } // not include itself... NodeList nodeList = parent.getChildNodes(); int index = 0; for (index = 0; index < nodeList.getLength(); index++) { Node child = nodeList.item(index); if (child instanceof Element) { Element childElement = (Element) child; String value = childElement.getAttribute(attributeName); if (null != value) { value = value.trim(); if (value.equals(attributeValue)) { return childElement; } } } } return null; } private Element getFirstDescendantWithAttribute(Node parent, String attributeName, String attributeValue) { // include itself... Element result = null; if (null == parent) { return null; } if (parent instanceof Element) { Element element = (Element) parent; String value = element.getAttribute(attributeName); if (null != value && value.equals(attributeValue)) { return element; } } // find from all child... NodeList children = parent.getChildNodes(); int index = 0; for (index = 0; index < children.getLength(); index++) { Node child = children.item(index); Element found = getFirstDescendantWithAttribute(child, attributeName, attributeValue); if (null != found) { return found; } } // not find... return null; } private Element getFirstDescendantWithTag(Node parent, String tagName) { // include itself... 
Element result = null; if (null == parent) { return null; } if (parent instanceof Element) { Element element = (Element) parent; String value = element.getTagName(); if (null != value && value.toLowerCase().equals(tagName)) { return element; } } // find from all child... NodeList children = parent.getChildNodes(); int index = 0; for (index = 0; index < children.getLength(); index++) { Node child = children.item(index); Element found = getFirstDescendantWithTag(child, tagName); if (null != found) { return found; } } // not find... return null; } private Element getFirstDescendantWithTagPlusAttribute(Node parent, String tagName, String attributeName, String attributeValue) { // include itself... Element result = null; if (null == parent) { return null; } if (parent instanceof Element) { Element element = (Element) parent; String tag = element.getTagName(); if (null != tag && tag.toLowerCase().equals(tagName)) { String attrValue = element.getAttribute(attributeName); if (null != attrValue && attrValue.equals(attributeValue)) { return element; } } } // find from all child... NodeList children = parent.getChildNodes(); int index = 0; for (index = 0; index < children.getLength(); index++) { Node child = children.item(index); Element found = getFirstDescendantWithTagPlusAttribute(child, tagName, attributeName, attributeValue); if (null != found) { return found; } } // not find... return null; } private Element getFirstDescendantWithTagPlusAttributeFixedAttribute( Node parent, String tagName, String attributeName, String attributeValue, int n) { // include itself... 
Element result = null; if (null == parent) { return null; } if (parent instanceof Element) { Element element = (Element) parent; String tag = element.getTagName(); if (null != tag && tag.toLowerCase().equals(tagName)) { String attrValue = element.getAttribute(attributeName); if (null != attrValue && attrValue.equals(attributeValue)) { if (null == element.getAttributes()) { if (element.getAttributes().getLength() == n) return element; } } } } // find from all child... NodeList children = parent.getChildNodes(); int index = 0; for (index = 0; index < children.getLength(); index++) { Node child = children.item(index); Element found = getFirstDescendantWithTagPlusAttribute(child, tagName, attributeName, attributeValue); if (null != found) { return found; } } // not find... return null; } private void walk(Node node) { if (null == node) { return; } short nodeType = node.getNodeType(); if (nodeType == Node.DOCUMENT_FRAGMENT_NODE) { NodeList children = node.getChildNodes(); for (int i = 0; children != null && i < children.getLength(); i++) { walk(children.item(i)); } } else if (nodeType == Node.ELEMENT_NODE) { Element element = (Element) node; String tag = element.getTagName(); if (null == tag) { return; } tag = tag.toLowerCase(); if (SCRIPT.equals(tag) || STYLE.equals(tag) || LINK.equals(tag)) { // <script>...</script> // <style>...</style> // <link>...</link> return; } else if (META.equals(tag)) { String value = element.getAttribute(NAME); if (null != value && value.equals(ALBUM)) { // album value = element.getAttribute(CONTENT); if (null != value && value.length() > 0) { this.appendKeyValue(NUTCH_VIDEO_SERIES, value); } } else if (null != value && value.equals(CATEGORY)) { // category value = element.getAttribute(CONTENT); if (null != value && value.length() > 0) { this.appendKeyValue(NUTCH_VIDEO_CHANNEL, value); } } value = element.getAttribute(PROPERTY); // image if (null != value && value.equals(OG_IMAGE)) { value = element.getAttribute(CONTENT); if (null != value 
&& value.length() > 0) { this.appendKeyValue(NUTCH_VIDEO_PICTURE, value); } } } else if (DIV.equals(tag)) { String value = element.getAttribute(ID); if (null != value && value.equals(CRUMBSBAR)) { Element h2 = this.getFirstDescendantWithTag(element, H2); if (null != h2) { String text = h2.getTextContent(); if (null != text && text.length() > 0) { text = this.replaceWithRegex(text, "[\\s\\t/()]+", " "); text = text.trim(); if (text.length() > 0) this.appendKeyValue(NUTCH_VIDEO_TITLE, text); } } return; } // /////////////////////////////////////////////////////////////// value = element.getAttribute(CLASS); if (null != value && value.equals(INFO_INFO_CON)) { Element mainactor = this .getFirstDescendantWithTagPlusAttribute(element, LI, ID, MAINACTOR); if (null != mainactor) { Element actor = this.getFirstChildNodeWithTagName( mainactor, A); while (null != actor) { String name = actor.getTextContent(); if (null != name && name.length() > 0) { this.appendKeyValue(NUTCH_VIDEO_ACTOR, name); } mainactor.removeChild(actor); actor = this.getFirstChildNodeWithTagName( mainactor, A); } } Element li = this.getFirstDescendantWithTagPlusAttribute( element, LI, CLASS, H); if (null != li) { Element a = this.getFirstChildNodeWithTagName(li, A); if (null != a) { String year = a.getTextContent(); if (null != year && year.trim().length() == 4 && this.isNumber(year.trim())) { this.appendKeyValue(this.NUTCH_VIDEO_YEAR, year.trim()); } } } li = this.getFirstDescendantWithTagPlusAttribute(element, LI, ID, AREABOX); if (null != li) { Element a = this.getFirstChildNodeWithTagName(li, A); if (null != a) { String area = a.getTextContent(); if (null != area && area.length() > 0) { this.appendKeyValue(NUTCH_VIDEO_AREA, area); } } if (null != li.getAttribute(CLASS)) li.removeAttribute(CLASS); } li = this.getFirstDescendantWithTagPlusAttribute(element, LI, CLASS, S_H); if (null != li) { Element a = this.getFirstChildNodeWithTagName(li, A); if (null != a) { String type = a.getTextContent(); if 
(null != type && type.length() > 0) { this.appendKeyValue(NUTCH_VIDEO_TYPE, type); } } } Element p = this.getFirstDescendantWithTagPlusAttribute( element, P, CLASS, INTRO); if (null != p) { // delete a firstly!!! Element em = this.getFirstChildNodeWithTagName(p, "em"); if (null != em) { p.removeChild(em); } String text = p.getTextContent(); if (null != text && text.length() > 0) { this.appendKeyValue(NUTCH_VIDEO_FULL_DESCRIPTION, text); } } return; } } // handle children NodeList children = node.getChildNodes(); for (int i = 0; children != null && i < children.getLength(); i++) { walk(children.item(i)); } } else if (nodeType == Node.TEXT_NODE) { return; } else if (nodeType == Node.COMMENT_NODE) { return; } else { LOG.info("xxx-type-not-parsed------" + node.getNodeName()); return; } } public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { _parseResult = parseResult; _content = content; LOG.info("begin**********HtmlParseFilterIQiyi************by IQiyi"); /* * LOG.info("Content Information:"); * LOG.info("BaseUrl---"+content.getBaseUrl()); * LOG.info("ContentType---"+content.getContentType()); * LOG.info("url---"+content.getUrl()); * LOG.info("Content---"+content.getContent().toString()); * LOG.info("Metadata"+content.getMetadata().toString()); */ if (null == content || null == parseResult || null == metaTags || null == doc) { LOG.info("content|parseResult|metaTags|doc is null,so just return parseResult..."); return parseResult; } LOG.info("four params checked ok,handle next......"); String url = content.getUrl(); LOG.info("currenturl is ------" + url); if (false == this.isValidUrl(url)) { LOG.info("invalid url,just return raw parseResult..."); return parseResult; } LOG.info("video[source]---" + "valid url,iqiyistatics..."); this.appendKeyValue(NUTCH_VIDEO_URL, url); /* * Metadata metadata = metaTags.getGeneralTags(); if (null != metadata) * { this.appendKeyValue(NUTCH_VIDEO_KEYWORD, metadata.get(KEYWORDS)); 
* this.appendKeyValue(NUTCH_VIDEO_TITLE, metadata.get(TITLE)); * this.appendKeyValue(NUTCH_VIDEO_FULL_DESCRIPTION, * metadata.get(DESCRIPTION)); } */ walk((Node) doc); LOG.info("end**************************************end"); return parseResult; } public Configuration getConf() { return conf; } public void setConf(Configuration conf) { this.conf = conf; } }