如何用爬虫webmagic采集海量美图(demo附源代码)(二)

完整代码见:  http://www.oschina.net/code/snippet_1397325_35514

如何用爬虫webmagic采集海量美图(demo附源代码)(一)

链接: http://my.oschina.net/u/1397325/blog/233695


5 执行持久化的ImgPipeline.java

import org.apache.http.annotation.ThreadSafe;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import java.io.IOException;


@ThreadSafe
public class ImgPipeline extends FilePersistentBase implements Pipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());
    public ImgPipeline() {
        setPath("/data/webmagic/");
    }

    public ImgPipeline(String path) {
        setPath(path);
    }


    @Override
    public void process(ResultItems resultItems, Task task) {
        String fileStorePath = this.path;
        try {

            String imgShortNameNew="(http://www.meizitu.com/wp-content/uploads/)|(jpg)";
            CloseableHttpClient httpclient = HttpClients.createDefault();

            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {

                if (entry.getValue() instanceof List) {

                    List listOne= (List) entry.getValue();
                    List<String> list = new ArrayList<String>();

                    for(int i=0;i<listOne.size();i++){
                        list.add((String)listOne.get(i));
                    }

                    for(int i=1;i<list.size();i++)
                    {

                        StringBuffer sb = new StringBuffer();
                        StringBuffer imgFileNameNewYuan =sb.append(fileStorePath)
                                .append(list.get(0)) //此处提取文件夹名,即之前采集的标题名
                                .append("\\");
                        //这里先判断文件夹名是否存在,不存在则建立相应文件夹
                        Path target = Paths.get(imgFileNameNewYuan.toString());
                        if(!Files.isReadable(target)){
                            Files.createDirectory(target);
                        }

                        String extName=com.google.common.io
                                       .Files.getFileExtension(list.get(i));
                        StringBuffer imgFileNameNew = imgFileNameNewYuan
                                .append((list.get(i)).replaceAll(imgShortNameNew, "")
                                .replaceAll("[\\pP‘’“”]", ""))
                                .append(".")
                                .append(extName);

                        //这里通过httpclient下载之前抓取到的图片网址,并放在对应的文件中
                        HttpGet httpget = new HttpGet(list.get(i));
                        HttpResponse response = httpclient.execute(httpget);
                        HttpEntity entity = response.getEntity();
                        InputStream in = entity.getContent();

                        File file = new File(imgFileNameNew.toString());

                        try {
                            FileOutputStream fout = new FileOutputStream(file);
                            int l = -1;
                            byte[] tmp = new byte[1024];
                            while ((l = in.read(tmp)) != -1) {
                                fout.write(tmp,0,l);
                            }
                            fout.flush();
                            fout.close();
                        } finally {

                            in.close();
                        }

                    }
                }

                else {
                    System.out.println(entry.getKey() + ":\t" + entry.getValue());
                }
            }
            httpclient.close();
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}


6 客户类ImgSpiderTest.java

import us.codecraft.webmagic.Spider;


public class ImgSpiderTest {

    public static void main(String[] args){

        String fileStorePath = "D:\\webmagic-lib\\data\\imgNew7\\";
        String urlPattern = "http://www.meizitu.com/[a-z]/[0-9]{1,4}.html";
        ImgProcessor imgspider=new ImgProcessor("http://www.meizitu.com/",urlPattern);

        //webmagic采集图片代码演示,相关网站仅做代码测试之用,请勿过量采集
        Spider.create(imgspider)
                .addUrl("http://www.meizitu.com/")
                .addPipeline(new ImgPipeline(fileStorePath))
                .thread(10)       //此处线程数可调节
                .run();

    }
}


以上代码在 JDK 7 环境中测试通过。

你可能感兴趣的:(如何用爬虫webmagic采集海量美图(demo附源代码)(二))