org.htmlparser 是一个用来解析 html 页面的开源工具库。
之前写了一段代码,用来统计 html 中 js 引入、css 引入、超链接(a 标签的 href)以及 img 标签的数量。
Parser 有多种创建方式:既可以用 html 内容创建(Parser.createParser(content, 编码)),也可以用 url 创建(new Parser(url))。注意要设置编码(例如 UTF-8 或 GBK),并且应与页面的实际编码一致。
filter 顾名思义,用来对 parser 解析出的 html 节点进行过滤;可以只配置一个过滤器,也可以像下面的代码那样用 OrFilter 把多个过滤器组合起来。
htmlparser 的 jar 包中包含一个 tags 包(org.htmlparser.tags),里面预定义了部分标签类(如 ScriptTag、ImageTag、LinkTag),可以配合 NodeClassFilter 使用;对于其他标签,也可以用 TagNameFilter 按标签名来过滤。
public FileDetailStandard getHtmlInfo(FileDetailStandard fileDetailStandard,String content) { if(StringUtils.isBlank(content)){ return fileDetailStandard; } Parser parser = Parser.createParser(content, ENCODEING); NodeClassFilter scriptFilter = new NodeClassFilter(ScriptTag.class); NodeFilter linkFilter = new TagNameFilter("link"); NodeClassFilter imageFilter = new NodeClassFilter(ImageTag.class); NodeClassFilter hrefFilter = new NodeClassFilter(LinkTag.class); int scriptCount = 0; int linkCount = 0; int imageCount = 0; int hrefCount = 0; NodeFilter[] predicates = new NodeFilter[] { scriptFilter, hrefFilter, imageFilter, linkFilter }; OrFilter orFilter = new OrFilter(predicates); try { NodeList TagList = parser.extractAllNodesThatMatch(orFilter); for (int i = 0; i < TagList.size(); i++) { TagNode tagNode = (TagNode) TagList.elementAt(i); if (tagNode instanceof ScriptTag) { String attributeValue = tagNode.getAttribute("src"); if (StringUtils.isBlank(attributeValue)) { continue; } scriptCount++; } else if (tagNode instanceof ImageTag) { String attributeValue = tagNode.getAttribute("src"); if (StringUtils.isBlank(attributeValue)) { continue; } imageCount++; } else if (tagNode instanceof LinkTag) { String attributeValue = tagNode.getAttribute("href"); //过滤不符合的信息 if (StringUtils.isBlank(attributeValue)|| attributeValue.startsWith("#") || attributeValue.equalsIgnoreCase("javascript:;")) { continue; } hrefCount++; } else if (tagNode.getTagName().equalsIgnoreCase("link")) { String attributeValue = tagNode.getAttribute("href"); if (StringUtils.isBlank(attributeValue) ) { continue; } linkCount++; } } fileDetailStandard.setRelCssCount(linkCount); fileDetailStandard.setRelHyperLinkCount(hrefCount); fileDetailStandard.setRelImageCount(imageCount); fileDetailStandard.setRelJsCount(scriptCount); } catch (ParserException e) { logger.info("htmlparser解析文件错误", e); } return fileDetailStandard; }
下面的例子通过 htmlparser 解析页面中的 img 标签,然后把其中的图片下载到本地:
import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStream; import java.net.URL; import java.net.URLConnection; import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.nodes.TagNode; import org.htmlparser.tags.ImageTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class HtmlImageUtil { public void parseHtml(String url) { if (!url.contains("http") && !url.contains("https")) { url = "http://" + url; } try { Parser parser = new Parser(url); parser.setEncoding("UTF-8"); NodeClassFilter imageFilter = new NodeClassFilter(ImageTag.class); NodeList TagList = parser.extractAllNodesThatMatch(imageFilter); for (int i = 0; i < TagList.size(); i++) { TagNode tagNode = (TagNode) TagList.elementAt(i); String attributeValue = tagNode.getAttribute("src"); if (attributeValue.contains("http") || attributeValue.contains("https")) { } else { attributeValue = url + attributeValue; } System.out.println(attributeValue); String fileName = getImageNameByUrl(attributeValue); try { download(attributeValue, fileName, "C:\\Users\\sks\\Desktop\\test\\"); } catch (Exception e) { e.printStackTrace(); } } } catch (ParserException e) { e.printStackTrace(); } } public static void download(String urlString, String filename, String savePath) throws Exception { // 构造URL URL url = new URL(urlString); // 打开连接 URLConnection con = url.openConnection(); // 设置请求超时为5s con.setConnectTimeout(5 * 1000); // 输入流 InputStream is = con.getInputStream(); // 1K的数据缓冲 byte[] bs = new byte[1024]; // 读取到的数据长度 int len; // 输出的文件流 File sf = new File(savePath); if (!sf.exists()) { sf.mkdirs(); } OutputStream os = new FileOutputStream(sf.getPath() + "\\" + filename); // 开始读取 while ((len = is.read(bs)) != -1) { os.write(bs, 0, len); } // 完毕,关闭所有链接 os.close(); is.close(); } public static String getImageNameByUrl(String url) { return url.substring(url.lastIndexOf("/") + 1, url.length()); } public 
static void main(String[] args) { // http://www.srkx.cn/ new HtmlImageUtil().parseHtml("http://www.srgs.net"); } }