工具包系列(2)：imageSpider工具—

工具包系列(2)：imageSpider工具——可定制的图像抓取

这个工具是一个可定制的图像抓取工具

我希望这个小工具的功能点有以下几项：1.给定页面抓取页面的图片；2.给定页面和过滤规则，抓取页面的图片并存到本地磁盘或内存；

主要的技术点不多：1.图片链接的获取（htmlparser搞定）；2.图片的读写（ImageIO搞定）；3.规则的制定（来源于需求）

鉴于实现方法多种多样，第一个版本的spider只实现了最简单的功能，未来希望加入可扩展的规则对象

少废话，上代码：

主类：

   1: /**

   2:  *

   3:  */

   4: package com.taobao.cd.http.image;

5:

   6: import java.io.IOException;

   7: import java.io.InputStream;

   8: import java.net.URL;

   9: import java.net.URLConnection;

  10: import java.util.HashSet;

  11: import java.util.Set;

  12: import java.util.concurrent.ExecutorService;

  13: import java.util.concurrent.Executors;

14:

  15: import javax.imageio.ImageIO;

  16: import javax.imageio.ImageReader;

  17: import javax.imageio.stream.ImageInputStream;

18:

  19: import org.htmlparser.NodeFilter;

  20: import org.htmlparser.Parser;

  21: import org.htmlparser.filters.TagNameFilter;

  22: import org.htmlparser.tags.ImageTag;

  23: import org.htmlparser.util.NodeList;

24:

  25: import com.taobao.cd.http.util.HttpUtil;

  26: import com.taobao.cd.http.util.ImageReaderFactory;

  27: import com.taobao.cd.http.util.ParserPool;

28:

  29: /**

  30:  * 这是一个图片抓取器，通过给定url抓取该页面的所有img 可定制，过滤 ver 1.0: 只是初级实现图片抓取

  31:  *

  32:  * @author zunyuan.jy

  33:  *

  34:  * @date 2011-11-2

  35:  */

  36: public class ImageSpider {

37:

  38:     private Set<String> imgSet; // 用于记录已经下载过的图像url

39:

  40:     private int customedSize; // 支持定制的图像大小,单位是KB

41:

  42:     public ImageSpider() {

  43:         this(0);

  44:     }

45:

  46:     public ImageSpider(int s) {

  47:         this.customedSize = s;

  48:         imgSet = new HashSet<String>();

  49:     }

50:

  51:     /**

  52:      * 抓取指定url页面的所有图像数据

  53:      *

  54:      * @param url

  55:      *            页面url

  56:      * @param path

  57:      *            要将图片保存的路径

  58:      * @throws Exception

  59:      */

  60:     public void crawl(String url, final String path) throws Exception {

  61:         URL u = new URL(url);

  62:         URLConnection con = (u.openConnection());

  63:         con.setRequestProperty("User-Agent", HttpUtil.UA);

  64:         org.htmlparser.scanners.ScriptScanner.STRICT = false;

  65:         org.htmlparser.lexer.Lexer.STRICT_REMARKS = false;

  66:         Parser parser = ParserPool.getInstance().borrowOne();

  67:         parser.setConnection(con);

68:

  69:         NodeFilter filter = new TagNameFilter("img");

  70:         NodeList nodes = parser.extractAllNodesThatMatch(filter);

  71:         ImageTag node = null;

  72:         String imgSrc;

  73:         String suffix;

  74:         if (nodes != null) {

  75:             for (int i = 0; i < nodes.size(); i++) {

  76:                 node = (ImageTag) nodes.elementAt(i);

  77:                 imgSrc = node.getImageURL();

78:

  79:                 if (!imgSet.contains(imgSrc)) {

  80:                     imgSet.add(imgSrc);

  81:                     suffix = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);

  82:                     if (suffix.equalsIgnoreCase(ImageUtil.JPG)

  83:                             || suffix.equalsIgnoreCase(ImageUtil.PNG)

  84:                             || suffix.equalsIgnoreCase(ImageUtil.GIF)

  85:                             || suffix.equalsIgnoreCase(ImageUtil.BMP)) {

  86:                         URL uu = new URL(imgSrc);

  87:                         if (customedSize == 0 || filterSize(uu, suffix)) {

  88:                             ImageUtil.writeImg(uu, path, suffix);

  89:                         }

  90:                     } else {

  91:                         System.err.println(suffix

  92:                                 + ":img format not supported!");

  93:                     }

  94:                 }

  95:             }

  96:         }

  97:     }

98:

  99:     private boolean filterSize(URL u, String suffix) throws IOException {

 100:         InputStream is = u.openStream();

 101:         ImageInputStream stream = ImageIO.createImageInputStream(is);

 102:         ImageReader ir = ImageReaderFactory.getInstance().createImageReader(

 103:                 suffix);

 104:         if (ir != null) {

 105:             ir.setInput(stream, true, false);

 106:             int w = ir.getWidth(0);

 107:             int h = ir.getHeight(0);

 108:             if (w * h < customedSize * 1024 * 3 + 100) {

 109:                 return true;

 110:             } else {

 111:                 return false;

 112:             }

 113:         } else {

 114:             System.err.println(u.getFile() + ":read img header error!");

 115:             return false;

 116:         }

 117:     }

 118:

 119:     /**

 120:      * @param args

 121:      */

 122:     public static void main(String[] args) {

 123:         // TODO Auto-generated method stub

 124:

 125:     }

 126: }

util方法片段：

   1: /**

   2:      * 保存图像

   3:      * @param u        图像的url对象

   4:      * @param path    图像待保存的位置

   5:      * @param fileSuffix    保存的图像格式

   6:      * @throws IOException

   7:      */

   8:     public static void writeImg(URL u, String path, String fileSuffix)

   9:             throws IOException {

10:

  11:         BufferedImage bimg = ImageIO.read(u);

  12:         if (bimg != null) {

  13:             String cd = CommonUtils.formatCurrentDate();

  14:             if (!path.endsWith(FILE_SEP)) {

  15:                 path += FILE_SEP;

  16:             }

  17:             String fileName = path + cd + "_" + System.currentTimeMillis()

  18:                     + "." + fileSuffix;

  19:             ImageIO.write(bimg, fileSuffix, new File(fileName));

  20:         } else {

  21:             System.err.println("read img error!");

  22:         }

  23:     }

上个工具就提到的parserpool，这个工具也会用到，所以也放上代码：

   1: /**

   2:  *

   3:  */

   4: package com.taobao.cd.http.util;

5:

   6: import org.apache.commons.pool.ObjectPool;

   7: import org.apache.commons.pool.impl.StackObjectPool;

   8: import org.htmlparser.Parser;

9:

  10: /**

  11:  * 这是一个对象池，只负责生成空的parser，并管理这些parser

  12:  * 当用完一个parser后就返还给对象池，同时对象池负责清空这个parser

  13:  * @author zunyuan.jy

  14:  *

  15:  * @date 2011-10-28

  16:  */

  17: public class ParserPool {

  18:     /*singleton*/

  19:     private static ParserPool parserPool = new ParserPool();

  20:     /**/

21:

  22:     private ObjectPool pool;

23:

  24:     private ParserPool(){

  25:         pool = new StackObjectPool(new ParserFactory());

  26:     }

  27:     public static synchronized ParserPool getInstance() {

  28:         if(parserPool==null)

  29:             return new ParserPool();

  30:         return parserPool;

  31:     }

  32:     public void returnOne(Parser parser) throws Exception {

  33:         pool.returnObject(parser);

  34:     }

35:

  36:     public Parser borrowOne() throws Exception {

  37:         return (Parser) pool.borrowObject();

  38:     }

39:

  40:     public void addOne(Parser parser) throws Exception {

  41:         pool.addObject();

  42:     }

  43: }

最后附上测试代码：

   1: /**

   2:  *

   3:  */

   4: package com.taobao.cd.http.image;

5:

   6: import org.junit.Test;

7:

   8: import junit.framework.TestCase;

9:

  10: /**

  11:  * @author zunyuan.jy

  12:  *

  13:  * @date 2011-11-2

  14:  */

  15: public class ImageSpiderTest extends TestCase {

16:

  17:     @Test

  18:     public void testCrawl() {

  19:         String path = "D:\\个人工具代码\\img";

  20:         ImageSpider is = new ImageSpider(5);

  21:         try {

  22:             long start = System.nanoTime();

  23:             is.crawl("http://www.163.com", path);

  24:             long end = System.nanoTime();

  25:             System.out.println("time usage:"+(end-start));

  26:         } catch (Exception e) {

  27:             e.printStackTrace();

  28:         }

  29:     }

  30: }

测试结果如下：

com/ad_cookies:img format not supported!
gif?a=&c=860010-0503010000:img format not supported!
time usage:8622310262

前两行是两个出错的图片

最后一行是执行时间

加入了规则后可能时间长一点，因为有过滤规则检查。另外就是过滤的规则我现在只加了一个大小的过滤，而且大小也算的不准，后续有什么好的计算图像的大小的方法和工具欢迎推荐。我这里只是一个粗略的size*1024*3+100来计算BYTE数了~~见笑

P.S. 上面关于parser池的方法里还少一个ParserFactory的定义，这个可以看看ObjectPool的文档或者看我以前的文章就明白了，使用非常简单

P.S. 代码里的所有错误都抛了异常，或者用system.err输出了~需要的可以改进

代码已上传到git，有兴趣的可以加入coding，

git@github.com:changedi/CDLib.git 这是git地址

工具包系列(2)：imageSpider工具——可定制的图像抓取

工具包系列(2)：imageSpider工具——可定制的图像抓取

你可能感兴趣的:(工具包系列(2)：imageSpider工具——可定制的图像抓取)