Java爬虫框架 WebCollector-2.7.3 爬取网页图片Demo

WebCollector 框架 GitHub 地址:https://github.com/CrawlScript/WebCollector

Demo源码

package com.collector;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import okhttp3.Request;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawling picture from web pages
 *
 * @author he
 */
public class AutoPicCrawler extends BreadthCrawler {

    /** Directory that downloaded images are written to. */
    File downloadDir;

    /** Hard-coded download location used by this demo. */
    private static final String DOWNLOAD_PATH = "E:/toolSource/picFile";

    /** Start URL of the crawl. */
    private static final String SEED_URL = "https://www.csdn.net";

    /** Extension used when the Content-Type carries no usable subtype. */
    private static final String FALLBACK_EXTENSION = "bin";

    /**
     * Process-wide sequence used to keep generated file names distinct when several
     * crawler threads save images within the same millisecond.
     */
    private static final AtomicInteger FILE_SEQUENCE = new AtomicInteger();

    /**
     * Requester that adds a fixed User-Agent and Cookie header to every request.
     */
    public static class MyRequester extends OkHttpRequester {

        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";
        String cookie = "JSESSIONID=asdasdasdasdasdasdasdsadsa";

        /**
         * Builds the request for one crawl task; the original author notes this runs
         * before every request is sent.
         *
         * @param crawlDatum the task the request is built for
         * @return the parent's builder with the User-Agent and Cookie headers added
         */
        @Override
        public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
            return super.createRequestBuilder(crawlDatum)
                    .addHeader("User-Agent", userAgent)
                    .addHeader("Cookie", cookie);
        }

    }

    /**
     * Creates the crawler, prepares the download directory, installs the custom
     * requester and registers the seed URL.
     *
     * @param crawlPath passed through to {@link BreadthCrawler}
     * @param autoParse passed through to {@link BreadthCrawler}
     * @throws IllegalStateException if the download directory does not exist and
     *         cannot be created
     */
    public AutoPicCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        downloadDir = new File(DOWNLOAD_PATH);
        // Fail fast: without the directory every later image write would fail anyway.
        if (!downloadDir.isDirectory() && !downloadDir.mkdirs()) {
            throw new IllegalStateException(
                    "Cannot create download directory: " + downloadDir.getAbsolutePath());
        }
        this.setRequester(new MyRequester());
        this.addSeed(SEED_URL);
        setThreads(50);
        getConf().setTopN(100);
    }

    /**
     * Handles one fetched page: follows 301/302 redirects, queues the image URLs
     * found in HTML pages, and saves image responses into {@link #downloadDir}.
     *
     * @param page the fetched page
     * @param next collector for follow-up crawl tasks
     * @throws RuntimeException wrapping the {@link IOException} if an image cannot
     *         be written
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        int statusCode = page.code();
        if (statusCode == 301 || statusCode == 302) {
            String location = page.location();
            // Guard against a redirect that carries no usable target.
            if (location != null && !location.isEmpty()) {
                next.addAndReturn(location).meta(page.meta());
            }
            return;
        }

        String url = page.url();
        System.out.println("url:" + url);
        String contentType = page.contentType();
        System.out.println("contentType:" + contentType);
        if (contentType == null) {
            return;
        }

        // Media types are case-insensitive, so compare on a normalized copy.
        String normalizedType = contentType.trim().toLowerCase(Locale.ROOT);
        if (normalizedType.contains("html")) {
            queueImages(page, next);
        } else if (normalizedType.startsWith("image")) {
            saveImage(page, normalizedType);
        }
    }

    /** Queues the absolute URL of every {@code <img src>} on an HTML page. */
    private static void queueImages(Page page, CrawlDatums next) {
        Elements imgs = page.select("img[src]");
        for (Element img : imgs) {
            String imgSrc = img.attr("abs:src");
            System.out.println("imgSrc:" + imgSrc);
            // jsoup yields "" when the src cannot be resolved to an absolute URL
            // (e.g. data: URIs); queuing that would create a bogus task.
            if (!imgSrc.isEmpty()) {
                next.add(imgSrc);
            }
        }
    }

    /** Writes an image response to a freshly named file in the download directory. */
    private void saveImage(Page page, String normalizedType) {
        String imageFileName = getTimeCodeName() + "." + extensionOf(normalizedType);
        File imageFile = new File(downloadDir, imageFileName);
        try {
            FileUtils.write(imageFile, page.content());
            System.out.println("保存图片 " + page.url() + " 到 " + imageFile.getAbsolutePath());
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Derives a file extension from a lower-cased Content-Type value.
     *
     * <p>Parameters ({@code "; charset=..."}) and structured-syntax suffixes
     * ({@code "+xml"}) are dropped, and any character outside {@code [a-z0-9.-]} is
     * removed so that server-supplied text cannot inject path separators into the
     * file name. For example {@code "image/jpeg"} gives {@code "jpeg"} and
     * {@code "image/svg+xml; charset=utf-8"} gives {@code "svg"}.
     *
     * @param normalizedType lower-cased Content-Type header value
     * @return the sanitized subtype, or {@link #FALLBACK_EXTENSION} if none remains
     */
    private static String extensionOf(String normalizedType) {
        String mediaType = normalizedType;
        int paramStart = mediaType.indexOf(';');
        if (paramStart >= 0) {
            mediaType = mediaType.substring(0, paramStart);
        }
        int slash = mediaType.indexOf('/');
        String subtype = slash >= 0 ? mediaType.substring(slash + 1) : "";
        int suffixStart = subtype.indexOf('+');
        if (suffixStart >= 0) {
            subtype = subtype.substring(0, suffixStart);
        }
        subtype = subtype.replaceAll("[^a-z0-9.-]", "");
        return subtype.isEmpty() ? FALLBACK_EXTENSION : subtype;
    }

    /**
     * Starts the demo crawl.
     *
     * @param args unused
     * @throws Exception propagated from the crawler
     */
    public static void main(String[] args) throws Exception {
        AutoPicCrawler crawler = new AutoPicCrawler("crawl", true);
        /*start crawl with depth of 4*/
        crawler.start(4);
    }

    /**
     * Creates a file name from the current system time.
     *
     * <p>The result is the timestamp formatted as {@code yyMMddHHmmssSSS} followed
     * by a three-digit code in the range 100-999. The code comes from a shared
     * sequence rather than a random number, so names generated in the same
     * millisecond by different threads do not collide.
     *
     * @return the generated name, without extension
     */
    public static String getTimeCodeName() {
        // SimpleDateFormat is not thread-safe, so a new instance is created per call.
        DateFormat timestampFormat = new SimpleDateFormat("yyMMddHHmmssSSS");
        String timestamp = timestampFormat.format(new Date());
        // floorMod keeps the code in range even after the counter overflows to negative.
        int code = 100 + Math.floorMod(FILE_SEQUENCE.getAndIncrement(), 900);
        return timestamp + code;
    }

}

执行效果

Java爬虫框架 WebCollector-2.7.3 爬取网页图片Demo_第1张图片

你可能感兴趣的:(Java)