Nutch 从网页中提取字段并索引(HtmlParseFilter)

package org.apache.nutch.htmlfilter.my;



import java.util.regex.*;



import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.crawl.Crawl;

import org.apache.nutch.metadata.Metadata;

import org.apache.nutch.parse.HTMLMetaTags;

import org.apache.nutch.parse.HtmlParseFilter;

import org.apache.nutch.parse.Parse;

import org.apache.nutch.parse.ParseResult;

import org.apache.nutch.protocol.Content;

import org.w3c.dom.DocumentFragment;



/**
 * HTML parse filter that pulls a handful of fields out of the raw page markup
 * with regular expressions and adds them to the parse metadata under the keys
 * {@code p_title}, {@code p_article}, {@code p_site}, {@code p_pubdate},
 * {@code p_refurl} and {@code p_cate}.
 *
 * <p>The regular expressions and the constant {@code p_site} / {@code p_cate}
 * values are hard-coded in this class.
 */
public class MyHtmlParseFilter implements HtmlParseFilter {

    public static final Log LOG = LogFactory.getLog(MyHtmlParseFilter.class);

    /**
     * Parse-metadata key consulted for the charset used to decode the raw
     * bytes.
     * NOTE(review): presumably the key under which Nutch's HTML parser records
     * the encoding it used for conversion -- confirm against the Nutch version
     * in use. When the key is absent the platform default charset is used.
     */
    private static final String ENCODING_KEY = "CharEncodingForConversion";

    /** Captures the text of a {@code <span>} whose class is {@code b14c}. */
    private static final Pattern TITLE_PATTERN = Pattern
            .compile("<span .+class=\"b14c\">(.*?)</span>");

    /** Captures the content of a {@code <td>} whose class is {@code h14}. */
    private static final Pattern ARTICLE_PATTERN = Pattern
            .compile("<td .*class=\"h14\".*>([\\s\\S]+?)</td>");

    /** Captures what follows the publish-time label inside a {@code <font class="h12">}. */
    private static final Pattern PUBDATE_PATTERN = Pattern
            .compile("<font class=\"h12\">发布时间:(.*)</font>");

    private Configuration conf;

    /**
     * Extracts the custom fields from the page content and adds them to the
     * parse metadata of the entry registered under the content's URL.
     *
     * @param content     fetched content; its raw bytes are decoded and searched
     * @param parseResult parse result whose entry for the content URL is annotated
     * @param metaTags    not used by this filter
     * @param doc         not used by this filter
     * @return the same {@code parseResult} instance that was passed in
     */
    public ParseResult filter(Content content, ParseResult parseResult,
            HTMLMetaTags metaTags, DocumentFragment doc) {
        Parse parse = parseResult.get(content.getUrl());
        if (parse == null) {
            // No parse registered under this URL: nothing to annotate.
            return parseResult;
        }
        Metadata md = parse.getData().getParseMeta();

        try {
            // Extract fields; example of body-content information.
            String html = decode(content.getContent(), md.get(ENCODING_KEY));

            String title = extract(html, TITLE_PATTERN);
            String article = extract(html, ARTICLE_PATTERN);
            String site = "中国公路信息网|行业动态|新通车信息";
            String rawPubdate = extract(html, PUBDATE_PATTERN);
            // A page without a publish date must not discard the other fields,
            // so the date normalisation is skipped when nothing matched.
            String pubdate = null;
            if (rawPubdate != null) {
                pubdate = rawPubdate.replace('年', '-').replace('月', '-')
                        .replace("日", "");
            }
            // No referring URL is derived here; the key is still added with a
            // null value so the set of metadata names stays the same.
            String refurl = null;
            String cate = "1234567";

            md.add("p_title", title);
            md.add("p_article", article);
            md.add("p_site", site);
            md.add("p_pubdate", pubdate);
            md.add("p_refurl", refurl);
            md.add("p_cate", cate);
        } catch (Exception e) {
            // Best effort: a failure here must not abort parsing of the page,
            // but keep the stack trace so the cause can be diagnosed.
            LOG.warn("Failed to extract fields from " + content.getUrl(), e);
        }

        return parseResult;
    }

    /**
     * Decodes raw page bytes with the named charset, falling back to the
     * platform default charset when no name is given or it is not supported.
     *
     * @param raw      raw page bytes
     * @param encoding charset name, or {@code null} to use the platform default
     * @return the decoded text
     */
    private static String decode(byte[] raw, String encoding) {
        if (encoding != null) {
            try {
                return new String(raw, encoding);
            } catch (java.io.UnsupportedEncodingException e) {
                LOG.warn("Unsupported encoding '" + encoding
                        + "', falling back to the platform default", e);
            }
        }
        return new String(raw);
    }

    /**
     * Returns the trimmed first capture group of the LAST match of {@code p}
     * in {@code html}.
     *
     * @param html text to search
     * @param p    pattern with at least one capture group
     * @return the trimmed capture, or {@code null} when the pattern never matches
     */
    private static String extract(String html, Pattern p) {
        Matcher matcher = p.matcher(html);
        String last = null;
        // Every match overwrites the previous one, so the final match wins.
        while (matcher.find()) {
            String group = matcher.group(1);
            last = (group == null) ? null : group.trim();
        }
        return last;
    }

    /** Returns the configuration previously supplied via {@link #setConf}. */
    public Configuration getConf() {
        return this.conf;
    }

    /** Stores the configuration for later retrieval via {@link #getConf}. */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

}

你可能感兴趣的:(filter)