HtmlParser对Visitor和Filter的方法进行了封装,定义了针对一些常用HTML元素操作的bean,简化对常用元素的提取操作。包括:FilterBean、HTMLLinkBean、HTMLTextBean、LinkBean、StringBean、BeanyBaby等。这里给出LinkBean和StringBean的使用示例代码:
1、LinkBean代码
import java.net.URL; import java.util.HashSet; import java.util.Set; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.beans.LinkBean; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.OrFilter; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class UrlFilter { //LinkBean类使用 public Set<String> LinkBean(String src,String urlFilters) throws Exception { Set<String> links = new HashSet<String>(); LinkBean linkBean = new LinkBean(); linkBean.setURL(src); URL[] urls = linkBean.getLinks(); for (int i = 0; i < urls.length; i++) { String url=urls[i].toString(); String[] UrlFilters=urlFilters.trim().split(","); for (String urlFilter : UrlFilters) if(url.startsWith(urlFilter)) links.add(url); } return links; } public static void main(String[]args) throws Exception{ UrlFilter uf=new UrlFilter(); String strUrlFilters="https://www.rizhiyi.com"; Set<String> links=uf.LinkBean("https://www.rizhiyi.com/",strUrlFilters); for(String link:links) System.out.println(link); } }
2、StringBean代码
import java.net.URL; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.beans.StringBean; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.SimpleNodeIterator; import sp.SslUtils; public class UrlParser { * 根据提供的URL,获取此URL对应网页的纯文本信息 * @param url 提供的URL链接 * @return RL对应网页的纯文本信息 * @throws Exception */ public String getText(String url)throws Exception{ StringBean sb = new StringBean (); sb.setLinks (false); sb.setReplaceNonBreakingSpaces(true); sb.setCollapse(true); Parser parser = new Parser (url); parser.setEncoding("UTF-8"); parser.reset (); parser.visitAllNodesWith (sb); String text = sb.getStrings (); return text; } public static void main(String[]args) throws Exception{ UrlParser fd=new UrlParser(); try { String str=fd.getText("https://www.rizhiyi.com/"); System.out.println(str); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }