写了个抓取 App Store 的爬虫,需要抓取大量的 app。本来是用 HttpClient 下载,但是效果不理想,于是改为直接调用 wget 下载。但是如果不及时读取子进程的标准输出和错误输出,管道缓冲区写满后子进程就会被阻塞而卡住;另外 wget 本身有时也会莫名卡住。
所以我采用:
一、独立线程读取输出信息;
二、自己实现doWaitFor方法来代替api提供的waitFor()方法,避免子进程卡死。
三、设置超时,超时后杀死 wget 子进程;如果没有正确返回,则重试一次,并把超时时间加倍。
有了以上操作,wget不会卡死,就算卡住了也会因为超时被干掉再重试一次,所以绝大部分的app可以被抓取下来。
import com.google.common.io.Files; import com.xxx.appstore.service.crawler.CalcMD5Service; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.math.RandomUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; public class CrawlerUtils { public static final String APK_DOWNLOAD_PATH = "/data/appstore/category/"; private static Logger LOGGER = LoggerFactory.getLogger(CrawlerUtils.class); /** * 使用wget下载文件 * * @param displayName appName * @param category 分类 * @param download_url 下载地址 * @return 成功返回文件路径,失败返回null */ public static String downloadFileByWget(String displayName, String category, String download_url) { if (StringUtils.isBlank(displayName) || StringUtils.isBlank(category) || StringUtils.isBlank(download_url)) { LOGGER.info("downloadFileByWget ERROR, displayName:{}, category:{}, download_url:{}", new Object[]{displayName, category, download_url}); return null; } String fileName = CalcMD5Service.encoder(displayName + RandomUtils.nextInt(1000)); String seed = CalcMD5Service.encoder(category); String midPath = StringUtils.left(seed, 10); String filePath = APK_DOWNLOAD_PATH + midPath + "/" + fileName + ".apk"; File file = new File(filePath); try { Files.createParentDirs(file); } catch (IOException e) { LOGGER.warn("IOException", e); return null; } int retry = 2; int res = -1; int time = 1; while (retry-- > 0) { ProcessBuilder pb = new ProcessBuilder("wget", download_url, "-t", "2", "-T", "10", "-O", filePath); LOGGER.info("wget shell: {}", pb.command()); Process ps = null; try { ps = pb.start(); } catch (IOException e) { LOGGER.error("IOException", e); } res = doWaitFor(ps, 30 * time++); if (res != 0) { LOGGER.warn("Wget download failed..."); } else { break; } } if (res != 0) { return null; } return filePath; } /** * @param ps sub process * @param timeout 超时时间,SECONDS * @return 正常结束返回0 */ private 
static int doWaitFor(Process ps, int timeout) { int res = -1; if (ps == null) { return res; } List<String> stdoutList = new ArrayList<>(); List<String> erroroutList = new ArrayList<>(); boolean finished = false; int time = 0; ThreadUtil stdoutUtil = new ThreadUtil(ps.getInputStream(), stdoutList); ThreadUtil erroroutUtil = new ThreadUtil(ps.getErrorStream(), erroroutList); //启动线程读取缓冲区数据 stdoutUtil.start(); erroroutUtil.start(); while (!finished) { time++; if (time >= timeout) { LOGGER.info("Process wget timeout 30s, destroyed!"); ps.destroy(); break; } try { res = ps.exitValue(); finished = true; } catch (IllegalThreadStateException e) { try { TimeUnit.SECONDS.sleep(1); } catch (InterruptedException e1) { } } } return res; } }
import org.apache.commons.io.Charsets; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.List; public class ThreadUtil implements Runnable { // 设置读取的字符编码 private String character = Charsets.UTF_8.displayName(); private List<String> list; private InputStream inputStream; public ThreadUtil(InputStream inputStream, List<String> list) { this.inputStream = inputStream; this.list = list; } public void start() { Thread thread = new Thread(this); thread.setDaemon(true);//将其设置为守护线程 thread.start(); } public void run() { BufferedReader br = null; try { br = new BufferedReader(new InputStreamReader(inputStream, character)); String line = null; while ((line = br.readLine()) != null) { list.add(line); } } catch (IOException e) { e.printStackTrace(); } finally { try { //释放资源 inputStream.close(); if (br != null) { br.close(); } } catch (IOException e) { e.printStackTrace(); } } } }