工具包系列(2):imageSpider工具——可定制的图像抓取
这个工具是一个可定制的图像抓取工具
我希望这个小工具的功能点有以下几项:1.给定页面抓取页面的图片;2.给定页面和过滤规则,抓取页面的图片并存到本地磁盘或内存;
主要的技术点不多:1.图片链接的获取(htmlparser搞定);2.图片的读写(imageIo搞定);3.规则的制定(来源于需求)
鉴于实现方法多种多样,第一个版本的spider只实现了最简单的功能,未来希望加入的是可扩展的规则对象
少废话,上代码:
主类:
1: /**
2: *
3: */
4: package com.taobao.cd.http.image;
5:
6: import java.io.IOException;
7: import java.io.InputStream;
8: import java.net.URL;
9: import java.net.URLConnection;
10: import java.util.HashSet;
11: import java.util.Set;
12: import java.util.concurrent.ExecutorService;
13: import java.util.concurrent.Executors;
14:
15: import javax.imageio.ImageIO;
16: import javax.imageio.ImageReader;
17: import javax.imageio.stream.ImageInputStream;
18:
19: import org.htmlparser.NodeFilter;
20: import org.htmlparser.Parser;
21: import org.htmlparser.filters.TagNameFilter;
22: import org.htmlparser.tags.ImageTag;
23: import org.htmlparser.util.NodeList;
24:
25: import com.taobao.cd.http.util.HttpUtil;
26: import com.taobao.cd.http.util.ImageReaderFactory;
27: import com.taobao.cd.http.util.ParserPool;
28:
29: /**
30: * 这是一个图片抓取器,通过给定url抓取该页面的所有img 可定制,过滤 ver 1.0: 只是初级实现图片抓取
31: *
32: * @author zunyuan.jy
33: *
34: * @date 2011-11-2
35: */
36: public class ImageSpider {
37:
38: private Set<String> imgSet; // 用于记录已经下载过的图像url
39:
40: private int customedSize; // 支持定制的图像大小,单位是KB
41:
42: public ImageSpider() {
43: this(0);
44: }
45:
46: public ImageSpider(int s) {
47: this.customedSize = s;
48: imgSet = new HashSet<String>();
49: }
50:
51: /**
52: * 抓取指定url页面的所有图像数据
53: *
54: * @param url
55: * 页面url
56: * @param path
57: * 要将图片保存的路径
58: * @throws Exception
59: */
60: public void crawl(String url, final String path) throws Exception {
61: URL u = new URL(url);
62: URLConnection con = (u.openConnection());
63: con.setRequestProperty("User-Agent", HttpUtil.UA);
64: org.htmlparser.scanners.ScriptScanner.STRICT = false;
65: org.htmlparser.lexer.Lexer.STRICT_REMARKS = false;
66: Parser parser = ParserPool.getInstance().borrowOne();
67: parser.setConnection(con);
68:
69: NodeFilter filter = new TagNameFilter("img");
70: NodeList nodes = parser.extractAllNodesThatMatch(filter);
71: ImageTag node = null;
72: String imgSrc;
73: String suffix;
74: if (nodes != null) {
75: for (int i = 0; i < nodes.size(); i++) {
76: node = (ImageTag) nodes.elementAt(i);
77: imgSrc = node.getImageURL();
78:
79: if (!imgSet.contains(imgSrc)) {
80: imgSet.add(imgSrc);
81: suffix = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);
82: if (suffix.equalsIgnoreCase(ImageUtil.JPG)
83: || suffix.equalsIgnoreCase(ImageUtil.PNG)
84: || suffix.equalsIgnoreCase(ImageUtil.GIF)
85: || suffix.equalsIgnoreCase(ImageUtil.BMP)) {
86: URL uu = new URL(imgSrc);
87: if (customedSize == 0 || filterSize(uu, suffix)) {
88: ImageUtil.writeImg(uu, path, suffix);
89: }
90: } else {
91: System.err.println(suffix
92: + ":img format not supported!");
93: }
94: }
95: }
96: }
97: }
98:
99: private boolean filterSize(URL u, String suffix) throws IOException {
100: InputStream is = u.openStream();
101: ImageInputStream stream = ImageIO.createImageInputStream(is);
102: ImageReader ir = ImageReaderFactory.getInstance().createImageReader(
103: suffix);
104: if (ir != null) {
105: ir.setInput(stream, true, false);
106: int w = ir.getWidth(0);
107: int h = ir.getHeight(0);
108: if (w * h < customedSize * 1024 * 3 + 100) {
109: return true;
110: } else {
111: return false;
112: }
113: } else {
114: System.err.println(u.getFile() + ":read img header error!");
115: return false;
116: }
117: }
118:
119: /**
120: * @param args
121: */
122: public static void main(String[] args) {
123: // TODO Auto-generated method stub
124:
125: }
126: }
util方法片段:
1: /**
2: * 保存图像
3: * @param u 图像的url对象
4: * @param path 图像待保存的位置
5: * @param fileSuffix 保存的图像格式
6: * @throws IOException
7: */
8: public static void writeImg(URL u, String path, String fileSuffix)
9: throws IOException {
10:
11: BufferedImage bimg = ImageIO.read(u);
12: if (bimg != null) {
13: String cd = CommonUtils.formatCurrentDate();
14: if (!path.endsWith(FILE_SEP)) {
15: path += FILE_SEP;
16: }
17: String fileName = path + cd + "_" + System.currentTimeMillis()
18: + "." + fileSuffix;
19: ImageIO.write(bimg, fileSuffix, new File(fileName));
20: } else {
21: System.err.println("read img error!");
22: }
23: }
上个工具就提到的parserpool,这个工具也会用到,所以也放上代码:
1: /**
2: *
3: */
4: package com.taobao.cd.http.util;
5:
6: import org.apache.commons.pool.ObjectPool;
7: import org.apache.commons.pool.impl.StackObjectPool;
8: import org.htmlparser.Parser;
9:
10: /**
11: * 这是一个对象池,只负责生成空的parser,并管理这些parser
12: * 当用完一个parser后就返还给对象池,同时对象池负责清空这个parser
13: * @author zunyuan.jy
14: *
15: * @date 2011-10-28
16: */
17: public class ParserPool {
18: /*singleton*/
19: private static ParserPool parserPool = new ParserPool();
20: /**/
21:
22: private ObjectPool pool;
23:
24: private ParserPool(){
25: pool = new StackObjectPool(new ParserFactory());
26: }
27: public static synchronized ParserPool getInstance() {
28: if(parserPool==null)
29: return new ParserPool();
30: return parserPool;
31: }
32: public void returnOne(Parser parser) throws Exception {
33: pool.returnObject(parser);
34: }
35:
36: public Parser borrowOne() throws Exception {
37: return (Parser) pool.borrowObject();
38: }
39:
40: public void addOne(Parser parser) throws Exception {
41: pool.addObject();
42: }
43: }
最后附上测试代码:
1: /**
2: *
3: */
4: package com.taobao.cd.http.image;
5:
6: import org.junit.Test;
7:
8: import junit.framework.TestCase;
9:
10: /**
11: * @author zunyuan.jy
12: *
13: * @date 2011-11-2
14: */
15: public class ImageSpiderTest extends TestCase {
16:
17: @Test
18: public void testCrawl() {
19: String path = "D:\\个人工具代码\\img";
20: ImageSpider is = new ImageSpider(5);
21: try {
22: long start = System.nanoTime();
23: is.crawl("http://www.163.com", path);
24: long end = System.nanoTime();
25: System.out.println("time usage:"+(end-start));
26: } catch (Exception e) {
27: e.printStackTrace();
28: }
29: }
30: }
测试结果如下:
com/ad_cookies:img format not supported!
gif?a=&c=860010-0503010000:img format not supported!
time usage:8622310262
前两行是两个因后缀无法识别而被跳过的图片链接(输出的是"后缀:img format not supported!")
最后一行是执行时间
加入了规则后可能时间长一点,因为有过滤规则检查。另外就是过滤的规则我现在只加了一个大小的过滤,而且大小也算得不准,后续有什么好的计算图像大小的方法和工具欢迎推荐。我这里只是粗略地用size*1024*3+100来估算BYTE数~~见笑
P.S. 上面关于parser池的方法里还少一个ParserFactory的定义,这个可以看看ObjectPool的文档或者看我以前的文章就明白了,使用非常简单
P.S. 代码里的所有错误要么抛出了异常,要么用System.err输出了~有需要的可以自行改进
代码已上传到git,有兴趣的可以加入coding,
git@github.com:changedi/CDLib.git 这是git地址