多线程爬虫Java调用wget下载文件,独立线程读取输出缓冲区

写了个抓取appstore的爬虫,需要抓取大量的app。本来是用httpclient下载,但是效果不理想,于是改为直接调用wget下载。但是如果不及时读取子进程的标准输出和错误输出,输出缓冲区写满后wget会被阻塞而卡住;另外wget本身有时也会莫名地卡住。

所以我采用:

一、独立线程读取输出信息;

二、自己实现doWaitFor方法来代替api提供的waitFor()方法,避免子进程卡死。

三、设置超时,杀死wget子进程,没有正确返回的话,重试一次,并把超时时间加倍;

有了以上操作,即使wget卡住,也会因为超时被杀掉并重试一次,不会导致爬虫线程一直卡死,所以绝大部分的app都可以被抓取下来。

import com.google.common.io.Files;

import com.xxx.appstore.service.crawler.CalcMD5Service;

import org.apache.commons.lang.StringUtils;

import org.apache.commons.lang.math.RandomUtils;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;



import java.io.File;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import java.util.concurrent.TimeUnit;





/**
 * Helpers for downloading APK files by shelling out to {@code wget}.
 *
 * <p>The child process is watched with a hard timeout and its stdout/stderr are drained on
 * separate threads, so neither a stalled {@code wget} nor an unread output stream can block
 * the calling thread indefinitely.
 */
public class CrawlerUtils {

    public static final String APK_DOWNLOAD_PATH = "/data/appstore/category/";

    private static final Logger LOGGER = LoggerFactory.getLogger(CrawlerUtils.class);

    /** Number of times wget is launched before giving up. */
    private static final int MAX_ATTEMPTS = 2;

    /** Timeout of the first attempt in seconds; attempt N waits N times this long. */
    private static final int BASE_TIMEOUT_SECONDS = 30;

    /**
     * Downloads a file with wget.
     *
     * <p>wget is started up to {@value #MAX_ATTEMPTS} times. The first attempt is given
     * {@value #BASE_TIMEOUT_SECONDS} seconds and the retry twice that. If every attempt fails,
     * any partially written target file is removed.
     *
     * @param displayName  app name, used (with a random suffix) to derive the file name
     * @param category     category, used to derive the sub-directory
     * @param download_url URL to download
     * @return the path of the downloaded file on success, {@code null} on failure
     */
    public static String downloadFileByWget(String displayName, String category, String download_url) {
        if (StringUtils.isBlank(displayName) || StringUtils.isBlank(category) || StringUtils.isBlank(download_url)) {
            LOGGER.info("downloadFileByWget ERROR, displayName:{}, category:{}, download_url:{}", new Object[]{displayName, category, download_url});
            return null;
        }
        String fileName = CalcMD5Service.encoder(displayName + RandomUtils.nextInt(1000));
        String seed = CalcMD5Service.encoder(category);
        String midPath = StringUtils.left(seed, 10);
        String filePath = APK_DOWNLOAD_PATH + midPath + "/" + fileName + ".apk";
        File file = new File(filePath);
        try {
            Files.createParentDirs(file);
        } catch (IOException e) {
            LOGGER.warn("IOException", e);
            return null;
        }

        int res = -1;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            ProcessBuilder pb = new ProcessBuilder("wget", download_url, "-t", "2", "-T", "10", "-O", filePath);
            LOGGER.info("wget shell: {}", pb.command());
            Process ps = null;
            try {
                ps = pb.start();
            } catch (IOException e) {
                // ps stays null; doWaitFor reports that as a failed attempt.
                LOGGER.error("IOException", e);
            }
            res = doWaitFor(ps, BASE_TIMEOUT_SECONDS * attempt);
            if (res == 0) {
                break;
            }
            LOGGER.warn("Wget download failed...");
            if (Thread.currentThread().isInterrupted()) {
                // The caller asked this thread to stop; do not start another download.
                break;
            }
        }
        if (res != 0) {
            // "wget -O" creates the target even when the transfer fails; the caller never
            // learns the random path, so a leftover partial file would simply be garbage.
            if (file.exists() && !file.delete()) {
                LOGGER.warn("Could not delete partial download {}", filePath);
            }
            return null;
        }
        return filePath;
    }

    /**
     * Waits for a child process to exit, destroying it if it runs longer than the timeout.
     *
     * <p>Used instead of {@link Process#waitFor()} so that a hung child cannot block forever.
     * The exit status is polled once per second and elapsed time is measured with a monotonic
     * clock. If the waiting thread is interrupted, the child is destroyed and the interrupt
     * flag is restored.
     *
     * @param ps      sub process; may be {@code null} if it failed to start
     * @param timeout timeout in seconds
     * @return the process exit value (0 for a normal exit), or -1 if {@code ps} is
     *         {@code null}, the timeout elapsed, or the wait was interrupted
     */
    private static int doWaitFor(Process ps, int timeout) {
        if (ps == null) {
            return -1;
        }
        List<String> stdoutList = new ArrayList<>();
        List<String> erroroutList = new ArrayList<>();
        // Drain both streams on separate threads so the child never blocks on a full pipe.
        new ThreadUtil(ps.getInputStream(), stdoutList).start();
        new ThreadUtil(ps.getErrorStream(), erroroutList).start();

        long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(timeout);
        while (true) {
            // Check the exit status first so a process that finished right at the deadline
            // is still reported as finished rather than timed out.
            try {
                return ps.exitValue();
            } catch (IllegalThreadStateException stillRunning) {
                // Process has not terminated yet; fall through to the timeout check.
            }
            if (System.nanoTime() - deadline >= 0) {
                LOGGER.info("Process wget timeout {}s, destroyed!", timeout);
                ps.destroy();
                return -1;
            }
            try {
                TimeUnit.SECONDS.sleep(1);
            } catch (InterruptedException e) {
                // Restore the flag for the caller and do not leave an orphaned wget behind.
                Thread.currentThread().interrupt();
                LOGGER.warn("Interrupted while waiting for wget, destroyed!");
                ps.destroy();
                return -1;
            }
        }
    }
}

 

import org.apache.commons.io.Charsets;



import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.util.List;





/**
 * Reads an {@link InputStream} line by line (decoded as UTF-8) and appends every line to a
 * caller-supplied list, closing the stream when the end is reached or reading fails.
 *
 * <p>{@link #start()} runs the reader on a new daemon thread; {@link #run()} may also be
 * called directly to read on the current thread. The list is appended to without any
 * synchronization.
 */
public class ThreadUtil implements Runnable {

    /** Character encoding used to decode the stream. */
    private static final java.nio.charset.Charset CHARSET = java.nio.charset.StandardCharsets.UTF_8;

    private final List<String> list;

    private final InputStream inputStream;

    /**
     * @param inputStream stream to drain; it is closed by {@link #run()}
     * @param list        receives one element per line read
     */
    public ThreadUtil(InputStream inputStream, List<String> list) {
        this.inputStream = inputStream;
        this.list = list;
    }

    /** Starts draining the stream on a new daemon thread and returns immediately. */
    public void start() {
        Thread thread = new Thread(this);
        thread.setDaemon(true); // daemon, so a blocked read never keeps the JVM alive
        thread.start();
    }

    /**
     * Reads lines until end of stream, adding each to the list, then closes the stream.
     * An {@link IOException} while reading ends the loop and is reported on stderr.
     */
    @Override
    public void run() {
        // try-with-resources closes the reader and the underlying stream on every exit path.
        try (InputStream in = inputStream;
             BufferedReader br = new BufferedReader(new InputStreamReader(in, CHARSET))) {
            String line;
            while ((line = br.readLine()) != null) {
                list.add(line);
            }
        } catch (IOException e) {
            // No logger is in scope in this class, so the failure is reported on stderr.
            e.printStackTrace();
        }
    }

}

 

你可能感兴趣的:(java)