java爬图片数据 demo

package com.xcx.spots.test;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import us.codecraft.webmagic.selector.Selectable;
/**
 * WebMagic {@link PageProcessor} for the gallery at {@code http://www.mmonly.cc/ktmh/dmmn/}.
 *
 * <p>The list page contributes the links of its items plus the link to the next list
 * page. Each detail page contributes its own "next" link and publishes the address of
 * its picture under the result field {@code "img"}, which the pipeline configured in
 * {@link #main(String[])} consumes.
 */
public class Danli implements PageProcessor {

    /** Start URL; a page with exactly this URL is treated as the list page. */
    private static final String LIST_URL = "http://www.mmonly.cc/ktmh/dmmn/";

    /** Detail pages: the list URL followed by at least one digit. */
    private static final String DETAIL_URL_REGEX = "http://www.mmonly.cc/ktmh/dmmn/[\\d]+";

    /** Marker that precedes the attribute value inside the serialized {@code <img>} tag. */
    private static final String SRC_ATTR = "src=\"";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * Dispatches the page to the list or detail handler based on its URL. Pages that are
     * neither are skipped so that no empty result reaches the pipeline.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (LIST_URL.equals(page.getUrl().toString())) {
            processListPage(page);
        } else if (page.getUrl().regex(DETAIL_URL_REGEX).match()) {
            // match() is required here: the original compared regex(...) with null,
            // a check that presumably never fails (TODO confirm against the WebMagic
            // API), so every non-list page was treated as a detail page.
            processDetailPage(page);
        } else {
            page.setSkip(true);
        }
    }

    /** Queues every item link of the list page and the link to the following list page. */
    private void processListPage(Page page) {
        // Links of all pictures shown in the list.
        page.addTargetRequests(page.getHtml().$("div.item_t > div > div.ABox > a").links().all());

        // The second-to-last <a> inside #pageNum is the "next page" link.
        addTargetIfPresent(page,
                page.getHtml().$("#pageNum > a:nth-last-child(2)").links().toString());

        // The list page produces no "img" field, so keep it away from the pipeline.
        page.setSkip(true);
    }

    /** Queues the detail page's "next" link and publishes the picture address as "img". */
    private void processDetailPage(Page page) {
        // href of the "next" button inside the detail page.
        addTargetIfPresent(page, page.getHtml().$("#nl > a").links().toString());

        // The picture normally sits inside a <p>; otherwise look for it inside an <a>.
        String imgTag = page.getHtml().$("#big-pic p img").toString();
        if (isMissing(imgTag)) {
            imgTag = page.getHtml().$("#big-pic a img").toString();
        }

        String src = isMissing(imgTag) ? null : extractSrc(imgTag);
        if (src == null) {
            // No usable picture on this page: nothing to hand to the pipeline.
            page.setSkip(true);
            return;
        }
        page.putField("img", src);
    }

    /** Adds {@code url} as a target request unless it is {@code null} or empty. */
    private static void addTargetIfPresent(Page page, String url) {
        if (url != null && !url.isEmpty()) {
            page.addTargetRequest(url);
        }
    }

    /**
     * Tells whether a selector produced no usable text. The original code compared the
     * text with {@code == "null"}, which tests reference identity and therefore never
     * matched; both a {@code null} reference and the literal text are handled here.
     */
    private static boolean isMissing(String text) {
        return text == null || text.isEmpty() || "null".equals(text);
    }

    /**
     * Extracts the value of the {@code src} attribute from a serialized {@code <img>} tag.
     *
     * @param imgTag the tag text, for example {@code <img src="http://host/a.jpg">}
     * @return the attribute value, or {@code null} when the tag has no non-empty
     *     double-quoted {@code src} attribute
     */
    private static String extractSrc(String imgTag) {
        int start = imgTag.indexOf(SRC_ATTR);
        if (start < 0) {
            return null;
        }
        start += SRC_ATTR.length();
        // Stop at the closing quote instead of assuming src is the last attribute.
        int end = imgTag.indexOf('"', start);
        if (end <= start) {
            return null;
        }
        return imgTag.substring(start, end);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Starts the crawl at the list page with five threads and the {@code Demo} pipeline. */
    public static void main(String[] args) {
        Spider.create(new Danli())
                .addUrl(LIST_URL)
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(10000)))
                .addPipeline(new Demo())
                .thread(5)
                .run();
    }

}
package com.xcx.spots.test;

/**

  • @ProjectName: spots

  • @Package: com.xcx.spots.test

  • @ClassName: Demo

  • @Author: nh

  • @Description:

  • @Date: 2020/7/2 13:48

  • @Version: 1.0
    */
    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Task;
    import us.codecraft.webmagic.pipeline.Pipeline;
    /**
     * Pipeline that downloads the picture whose address a page processor stored under
     * the result field {@code "img"}.
     */
    public class Demo implements Pipeline {

        /**
         * Hands the {@code "img"} field to {@code UrlFileDownloadUtil.downloadPicture}.
         * Results without that field (or with an empty value) are ignored instead of
         * failing with a {@link NullPointerException}.
         *
         * @param resultItems fields extracted from one page
         * @param task the running crawl task (unused)
         */
        @Override
        public void process(ResultItems resultItems, Task task) {
            Object img = resultItems.get("img");
            if (img == null) {
                // Not every processed page carries a picture address.
                return;
            }

            String url = img.toString();
            if (url.isEmpty()) {
                return;
            }
            UrlFileDownloadUtil.downloadPicture(url);
        }
    }
    package com.xcx.spots.test;

/**

  • @ProjectName: spots
  • @Package: com.xcx.spots.test
  • @ClassName: UrlFileDownloadUtil
  • @Author: nh
  • @Description:
  • @Date: 2020/7/2 13:48
  • @Version: 1.0
    */
    import java.io.*;
    import java.net.URL;
    import java.util.UUID;

/** Helper for saving a remote picture to the local image directory. */
public class UrlFileDownloadUtil {

    /**
     * Downloads the resource at {@code url} into {@code D:\image\} under a random
     * UUID-based name with a {@code .jpg} extension.
     *
     * <p>This is a best-effort operation: any failure is reported via
     * {@link Exception#printStackTrace()} and not propagated to the caller. Both streams
     * are closed even when the transfer fails part-way.
     *
     * @param url absolute URL of the picture
     */
    public static void downloadPicture(String url) {
        String file = "D:\\image\\";
        String target = file + UUID.randomUUID().toString() + ".jpg";

        // Create the target directory on first use; if this fails, opening the output
        // stream below reports the problem.
        File parent = new File(target).getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }

        // try-with-resources closes both streams on every path, including exceptions.
        try (InputStream inputStream = new URL(url).openStream();
             FileOutputStream outputStream = new FileOutputStream(target)) {

            byte[] bytes = new byte[1024 * 100];
            int length;
            // read() signals end of stream with -1.
            while ((length = inputStream.read(bytes)) != -1) {
                outputStream.write(bytes, 0, length);
            }
            System.out.println("下载完成:" + target);
        } catch (Exception e) {
            // Deliberately broad: one failed download must not abort the caller.
            e.printStackTrace();
        }
    }

}

你可能感兴趣的:(java)