Java开源爬虫框架WebCollector图片抓取教程

网站中的图片和网页在本质上是相同的：两者都是根据URL从网站获取到的字节数组(byte[])，浏览器会根据HTTP响应头中的Content-Type信息来决定以网页还是图片的形式展示该资源。

爬取图片的效果如下:

实现的代码如下:

package imageDownload;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.util.concurrent.atomic.AtomicInteger;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;

import cn.edu.hfut.dmic.webcollector.model.Page;

import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

import cn.edu.hfut.dmic.webcollector.util.Config;

import cn.edu.hfut.dmic.webcollector.util.FileUtils;

public class DemoImageCrawler extends BreadthCrawler {

// 下载路径

File downloadDir;

// 用于生成图片名称的数字

AtomicInteger imageId;

/**

* 爬行图片

* @param crawlPath

* 爬行的路径

* @param autoParse

* 解析

public DemoImageCrawler(String crawlPath, String downloadPath) {

super(crawlPath, true);

// 创建一个文件

downloadDir = new File(downloadPath);

// 判断是否存在,如果不存在就,执行mkdirs方法

if (!downloadDir.exists()) {

downloadDir.mkdirs();

}

computeImageId();

}

/**

* 访问 Page 路径 CrawlDatums 爬虫数据

@Override

public void visit(Page page, CrawlDatums next) {

// 根据http来判断当前的资源是图片还是html

String contentType = page.getResponse().getContentType();

// 在判断当前的资源是否为Html

if (contentType == null) {

return;

} else if (contentType.contains("html")) {

// 如果有图片的,我就拿图片那部分保存在文件中

Elements imgs = page.select("img[src]");

// 然后在遍历所有html中所有图片

for (Element img : imgs) {

String attr = img.attr("abs:src");

next.add(attr);

}

} // 在判断如果是起始图片就直接下载

else if (contentType.startsWith("image")) {

//进行切割加成名称

String extensionName = contentType.split("/")[1];

String imageFileName = imageId.incrementAndGet() + "." + extensionName;

File imageFile = new File(downloadDir, imageFileName);

try {

FileUtils.writeFile(imageFile,page.getContent());

System.out.println("保存图片" + page.getUrl() + "到" +imageFile.getAbsolutePath());

} catch (FileNotFoundException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

public static void main(String[] args) throws Exception {

DemoImageCrawler demoImageCrawler = new DemoImageCrawler("crawlPath", "downloadPath");

// 添加url

demoImageCrawler.addSeed("可爱大胸妹子夏笑笑傲人美乳无比诱人_妹子图");

// 添加爬取范围

demoImageCrawler.addRegex("http://www.mmjpg.com/mm/*");

// 设置每次爬取都从新开始

demoImageCrawler.setResumable(true);

// 每次开启了30个线程

demoImageCrawler.setThreads(30);

// 大小

Config.MAX_RECEIVE_SIZE = 1000 * 1000 * 10;

// 开发

demoImageCrawler.start(3);

System.out.println(demoImageCrawler);

}

/**

* 计算图片的id

public void computeImageId() {

int maxId = 1;

// 遍历出来所有图片文件

for (File imageFile : downloadDir.listFiles()) {

// 获取图片的名称

String fileName = imageFile.getName();

// 获取到名字,xxx\xxx.jsp,进行切割

String idStr = fileName.split("\\.")[0];

// 获取数字来做名称

Integer id = Integer.valueOf(maxId);

if (id > maxId) {

id = maxId;

}

// 创建一个新的原来数字

imageId = new AtomicInteger(maxId);

}

Java开源爬虫框架WebCollector图片抓取教程

你可能感兴趣的:(Java开源爬虫框架WebCollector图片抓取教程)