关于我使用htmlparser,以及通过htmlparser下载网页中的图片

org.htmlparser是用来解析html页面的工具类

之前写了一段代码:统计html中的js引入、css引入、href标签,img标签

Parser有多种创建方式:既可以用html内容创建(如Parser.createParser(content, 编码)),也可以用url创建(如new Parser(url))。注意要设置编码,并且要与页面实际使用的编码一致(例如UTF-8或GBK)。

filter按字面意思理解即可:用来对parser解析出的html节点进行过滤。可以像我的代码那样用OrFilter组合多个过滤器,也可以只配置一个。

htmlparser的jar包中包含一个tags包(org.htmlparser.tags),里面预定义了部分标签类(例如ScriptTag、ImageTag、LinkTag);对于没有对应标签类的标签,也可以用TagNameFilter按标签名来过滤。

/**
 * Counts the resources referenced by an HTML document and records the totals on
 * {@code fileDetailStandard}.
 *
 * <p>Four kinds of tag are counted, each only when its reference attribute is usable:
 * <ul>
 *   <li>script tags with a non-blank {@code src};</li>
 *   <li>image tags with a non-blank {@code src};</li>
 *   <li>hyperlink tags ({@code LinkTag}) whose {@code href} is non-blank, does not start with
 *       {@code #} and is not {@code javascript:;};</li>
 *   <li>tags named {@code link} with a non-blank {@code href}.</li>
 * </ul>
 *
 * <p>When {@code content} is blank, or when parsing fails (the failure is logged), the object
 * is returned without any count being set.
 *
 * @param fileDetailStandard the object that receives the four counts
 * @param content            the HTML text to analyse
 * @return the same {@code fileDetailStandard} instance that was passed in
 */
public FileDetailStandard getHtmlInfo(FileDetailStandard fileDetailStandard,String content) {
		if (StringUtils.isBlank(content)) {
			return fileDetailStandard;
		}

		Parser htmlParser = Parser.createParser(content, ENCODEING);

		// A node is of interest as soon as it matches any one of the four filters.
		NodeFilter trackedTags = new OrFilter(new NodeFilter[] {
				new NodeClassFilter(ScriptTag.class),
				new NodeClassFilter(LinkTag.class),
				new NodeClassFilter(ImageTag.class),
				new TagNameFilter("link") });

		NodeList matched;
		try {
			matched = htmlParser.extractAllNodesThatMatch(trackedTags);
		} catch (ParserException e) {
			logger.info("htmlparser解析文件错误", e);
			return fileDetailStandard;
		}

		int jsTotal = 0;
		int cssTotal = 0;
		int imgTotal = 0;
		int anchorTotal = 0;

		for (int idx = 0; idx < matched.size(); idx++) {
			TagNode tag = (TagNode) matched.elementAt(idx);

			if (tag instanceof ScriptTag) {
				// Inline scripts carry no src and are not counted.
				if (!StringUtils.isBlank(tag.getAttribute("src"))) {
					jsTotal++;
				}
			} else if (tag instanceof ImageTag) {
				if (!StringUtils.isBlank(tag.getAttribute("src"))) {
					imgTotal++;
				}
			} else if (tag instanceof LinkTag) {
				String target = tag.getAttribute("href");
				// Skip missing targets, in-page anchors and the "javascript:;" placeholder.
				boolean countable = !StringUtils.isBlank(target)
						&& !target.startsWith("#")
						&& !target.equalsIgnoreCase("javascript:;");
				if (countable) {
					anchorTotal++;
				}
			} else if (tag.getTagName().equalsIgnoreCase("link")) {
				if (!StringUtils.isBlank(tag.getAttribute("href"))) {
					cssTotal++;
				}
			}
		}

		fileDetailStandard.setRelCssCount(cssTotal);
		fileDetailStandard.setRelHyperLinkCount(anchorTotal);
		fileDetailStandard.setRelImageCount(imgTotal);
		fileDetailStandard.setRelJsCount(jsTotal);
		return fileDetailStandard;
	}


通过htmlparser解析相应的img标签,然后下载其中的图片到本地

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Finds every {@code <img>} tag of a web page with htmlparser and saves the referenced images
 * into a local directory.
 */
public class HtmlImageUtil {
	/** Directory used by {@link #parseHtml(String)} when the caller does not supply one. */
	private static final String DEFAULT_SAVE_PATH = "C:\\Users\\sks\\Desktop\\test\\";

	/** Connect and read timeout, in milliseconds, applied to every image download. */
	private static final int TIMEOUT_MILLIS = 5 * 1000;

	/**
	 * Downloads all images of the page at {@code url} into the default directory.
	 *
	 * @param url page address; {@code http://} is prepended when it has no http(s) scheme
	 */
	public void parseHtml(String url) {
		parseHtml(url, DEFAULT_SAVE_PATH);
	}

	/**
	 * Downloads all images of the page at {@code url} into {@code savePath}.
	 *
	 * <p>A failure on one image is reported on standard error and does not stop the remaining
	 * downloads. A {@code null} or blank {@code url} is ignored.
	 *
	 * @param url      page address; {@code http://} is prepended when it has no http(s) scheme
	 * @param savePath directory the images are written to (created when missing)
	 */
	public void parseHtml(String url, String savePath) {
		if (url == null || url.trim().isEmpty()) {
			return;
		}
		String pageUrl = url.trim();
		// Test the scheme prefix: a mere substring match would also accept hosts or paths
		// that happen to contain "http".
		if (!hasHttpScheme(pageUrl)) {
			pageUrl = "http://" + pageUrl;
		}

		try {
			// Base location against which relative image paths are resolved.
			URL pageLocation = new URL(pageUrl);

			Parser parser = new Parser(pageUrl);
			parser.setEncoding("UTF-8");
			NodeList imageTags = parser.extractAllNodesThatMatch(new NodeClassFilter(ImageTag.class));

			for (int i = 0; i < imageTags.size(); i++) {
				TagNode tagNode = (TagNode) imageTags.elementAt(i);
				String src = tagNode.getAttribute("src");
				// <img> without src, or with an inline data: URI, has nothing to download.
				if (src == null || src.trim().isEmpty() || src.regionMatches(true, 0, "data:", 0, 5)) {
					continue;
				}
				try {
					// Handles absolute, root-relative, path-relative and protocol-relative src.
					URL imageLocation = new URL(pageLocation, src.trim());
					if (!isHttp(imageLocation)) {
						continue;
					}
					String imageUrl = imageLocation.toString();
					System.out.println(imageUrl);

					String fileName = getImageNameByUrl(imageUrl);
					if (fileName.isEmpty()) {
						// URL ends with '/', so there is no file name to save under.
						continue;
					}
					download(imageUrl, fileName, savePath);
				} catch (Exception e) {
					// Best effort: one broken image must not abort the others.
					e.printStackTrace();
				}
			}
		} catch (ParserException | java.net.MalformedURLException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Downloads the resource at {@code urlString} into the file {@code filename} inside
	 * {@code savePath}.
	 *
	 * @param urlString address of the resource to fetch
	 * @param filename  plain file name (no directory part) to store the data under
	 * @param savePath  target directory; created, including parents, when it does not exist
	 * @throws IllegalArgumentException when {@code filename} is missing or would be written
	 *                                  outside {@code savePath}
	 * @throws Exception                when the directory cannot be created, the URL is
	 *                                  malformed, or the transfer fails
	 */
	public static void download(String urlString, String filename, String savePath) throws Exception {
		URLConnection con = new URL(urlString).openConnection();
		con.setConnectTimeout(TIMEOUT_MILLIS);
		// Without a read timeout a stalled server would block this call forever.
		con.setReadTimeout(TIMEOUT_MILLIS);

		File directory = new File(savePath);
		if (!directory.mkdirs() && !directory.isDirectory()) {
			throw new java.io.IOException("Cannot create directory: " + directory);
		}

		if (filename == null) {
			throw new IllegalArgumentException("Invalid file name: null");
		}
		// File(parent, child) uses the platform separator instead of a hard-coded "\\".
		File target = new File(directory, filename);
		// The name is derived from remote content: refuse anything that is empty or that
		// would escape the target directory.
		if (!directory.getCanonicalFile().equals(target.getCanonicalFile().getParentFile())) {
			throw new IllegalArgumentException("Invalid file name: " + filename);
		}

		// try-with-resources closes both streams even when the transfer fails midway.
		try (InputStream is = con.getInputStream();
				OutputStream os = new FileOutputStream(target)) {
			byte[] buffer = new byte[1024];
			int len;
			while ((len = is.read(buffer)) != -1) {
				os.write(buffer, 0, len);
			}
		}
	}

	/**
	 * Extracts the file name from an image URL: the text after the last {@code /} of the
	 * path, with any {@code ?query} or {@code #fragment} removed.
	 *
	 * @param url image address
	 * @return the file name; empty when {@code url} is {@code null} or its path ends with {@code /}
	 */
	public static String getImageNameByUrl(String url) {
		if (url == null) {
			return "";
		}
		int end = url.length();
		int queryStart = url.indexOf('?');
		if (queryStart >= 0) {
			end = queryStart;
		}
		int fragmentStart = url.indexOf('#');
		if (fragmentStart >= 0 && fragmentStart < end) {
			end = fragmentStart;
		}
		String path = url.substring(0, end);
		return path.substring(path.lastIndexOf('/') + 1);
	}

	/** Tells whether {@code value} starts with {@code http://} or {@code https://}, ignoring case. */
	private static boolean hasHttpScheme(String value) {
		return value.regionMatches(true, 0, "http://", 0, 7)
				|| value.regionMatches(true, 0, "https://", 0, 8);
	}

	/** Tells whether {@code location} uses the http or https protocol. */
	private static boolean isHttp(URL location) {
		String protocol = location.getProtocol();
		return "http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol);
	}

	public static void main(String[] args) {
		// Alternative test site: http://www.srkx.cn/
		new HtmlImageUtil().parseHtml("http://www.srgs.net");

	}
}




你可能感兴趣的:(java,http)