使用HTMLParser获取一个网站中链接的URL

HTMLParser用于解析HTML文件,同时可以提取其中链接的URL。

代码如下:

package cn.com.vnvtrip.apache.luence.custom;
/**
 * String constants shared by the link-extraction code: the HTML attribute
 * names used to filter tags and the prefixes checked against extracted links.
 *
 * <p>Fields declared in an interface are implicitly {@code public static final},
 * so the modifiers are not repeated on each declaration.
 *
 * @author longgangbai
 */
public interface Constants {
 /** Prefix tested against the start of an extracted link. */
 String HTTP_TAG = "http";

 /** Second prefix tested against the start of an extracted link. */
 String HTTP_UBN_TAG = "bnu";

 /** Name of the {@code onclick} attribute a tag must carry to be selected. */
 String HTTP_URL_ONCLICK_TAG = "onclick";

 /** Name of the {@code target} attribute a tag must carry to be selected. */
 String HTTP_URL_TARGET_TAG = "target";
}

 

package cn.com.vnvtrip.apache.luence.custom;

import java.util.ArrayList;
import java.util.Collection;

import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * 根据一個根网站抓取相应URL
 * @author longgangbai
 *
 */
public class HTMLParser {
 /**
  * 搜索网站的检索URl
  *
  * @param url
  * @param pageEncoding
  * @return
  * @throws ParserException
  */
 public static Collection<String> getWebSiteUrls(String url,
   String pageEncoding) throws ParserException {
  Collection<String> urls = new ArrayList<String>();
  Parser parser = new Parser(url);
  parser.setEncoding(pageEncoding);
  NodeList nodeList = parser.parse(new AndFilter(new HasAttributeFilter(
    Constants.HTTP_URL_TARGET_TAG), new HasAttributeFilter(
    Constants.HTTP_URL_ONCLICK_TAG)));
  if (nodeList != null && nodeList.size() > 0) {
   for (int i = 0; i < nodeList.size(); i++) {
    String urlLink = ((LinkTag) nodeList.elementAt(i))
      .extractLink();
    String LinkName = ((LinkTag) nodeList.elementAt(i))
      .getLinkText();
    if (urlLink.indexOf(Constants.HTTP_UBN_TAG) == 0
      || urlLink.indexOf(Constants.HTTP_TAG) == 0) {
     urls.add(LinkName);
    } else {
     urls.add(urlLink);
    }
   }
  }
  return urls;
 }
}

你可能感兴趣的:(apache,html)