Java下载文件 爬虫 超时处理解决方案

import java.util.List;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.SocketTimeoutException;

import java.net.URL;

import java.util.ArrayList;

import java.util.logging.Logger;

import java.util.regex.Matcher;

import java.util.regex.Pattern;



/**
 * Fetch helpers that bound every network attempt with a hard timeout and a
 * limited number of retries.
 *
 * <p>Each attempt runs on a fresh worker thread ({@link DownloadThread} or
 * {@link GetHtmlThread}). The calling thread waits at most {@link #timeOutMs}
 * milliseconds for the worker; if the worker is still alive after that, it is
 * interrupted and a new attempt is started, up to {@link #retry} retries.
 */
public class Main {

    /** Pause in milliseconds that {@link GetHtmlThread} sleeps before opening its connection. */
    public static final int sleepMsPerConnection = 1000;

    /** Maximum time in milliseconds the caller waits for a single attempt. */
    public static final int timeOutMs = 20000;

    /** Number of additional attempts made after the first one times out. */
    public static final int retry = 2;

    /**
     * Downloads {@code urlStr} into the file at {@code filePath}, retrying on timeout.
     *
     * @param urlStr   URL to download
     * @param filePath destination file path
     * @throws RuntimeException if every attempt timed out, or if the calling
     *                          thread was interrupted while waiting
     */
    private static void download(String urlStr, String filePath) {
        // One initial attempt plus `retry` retries.
        for (int attempt = 0; attempt <= retry; attempt++) {
            if (attempt > 0) {
                System.out.println("retry");
            }
            if (finishedInTime(new DownloadThread(urlStr, filePath))) {
                return;
            }
        }
        throw timeoutFailure();
    }

    /**
     * Fetches the text behind {@code urlStr}, retrying on timeout.
     *
     * @param urlStr URL to read
     * @return the {@code html} field of the worker that finished in time
     * @throws RuntimeException if every attempt timed out, or if the calling
     *                          thread was interrupted while waiting
     */
    private static String getHtml(String urlStr) {
        // One initial attempt plus `retry` retries.
        for (int attempt = 0; attempt <= retry; attempt++) {
            if (attempt > 0) {
                System.out.println("retry");
            }
            GetHtmlThread worker = new GetHtmlThread(urlStr);
            if (finishedInTime(worker)) {
                return worker.html;
            }
        }
        throw timeoutFailure();
    }

    /**
     * Starts {@code worker} and waits up to {@link #timeOutMs} milliseconds for it to end.
     *
     * @param worker a thread that has not been started yet
     * @return {@code true} if the worker terminated within the timeout;
     *         {@code false} if it was still running and has been interrupted
     * @throws RuntimeException if the calling thread is interrupted while waiting
     */
    private static boolean finishedInTime(Thread worker) {
        worker.start();
        try {
            worker.join(timeOutMs);
        } catch (InterruptedException e) {
            // The caller itself was asked to stop: give up the worker, keep the
            // caller's interrupt status visible, and abort instead of retrying.
            worker.interrupt();
            Thread.currentThread().interrupt();
            throw new RuntimeException("interrupted while waiting for worker thread", e);
        }
        if (!worker.isAlive()) {
            return true;
        }
        // In practice interrupt() does not terminate a thread that is blocked
        // in stream I/O; it only sets the interrupt flag, which the worker has
        // to check itself (see the article "How to interrupt a Java thread").
        worker.interrupt();
        return false;
    }

    /** Builds the exception thrown once the initial attempt and all retries have timed out. */
    private static RuntimeException timeoutFailure() {
        // `retry` retries were actually performed, so report exactly that number.
        return new RuntimeException("still timeout after retry " + retry + " times");
    }

}



import java.io.BufferedReader;

import java.io.InputStreamReader;

import java.net.URL;



/**
 * Worker thread that reads the text behind a URL line by line and publishes
 * it through {@link #html}.
 *
 * <p>Read {@link #html} only after the thread has terminated. The field stays
 * {@code null} when the thread was interrupted before the whole page had been
 * read. Any other failure prints the stack trace and terminates the JVM with
 * exit status 1.
 */
public class GetHtmlThread extends Thread {

    /** Page text, each line terminated by {@code '\n'}; {@code null} until a read completes. */
    public String html;

    private final String urlStr;

    /**
     * @param urlStr URL whose content should be read
     */
    public GetHtmlThread(String urlStr) {
        this.urlStr = urlStr;
    }

    /**
     * Sleeps for {@code Main.sleepMsPerConnection} milliseconds, then reads the
     * URL content into {@link #html}.
     */
    @Override
    public void run() {
        try {
            // Pause before every connection.
            Thread.sleep(Main.sleepMsPerConnection);
            URL url = new URL(urlStr);
            StringBuilder sb = new StringBuilder();
            // NOTE: no charset is given, so bytes are decoded with the platform default.
            BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    // interrupt() cannot break a blocked read; honour it between
                    // lines so an abandoned worker stops instead of reading on.
                    if (Thread.currentThread().isInterrupted()) {
                        return;
                    }
                    sb.append(line);
                    sb.append('\n');
                }
            } finally {
                // Close on every path, including a failed or abandoned read.
                br.close();
            }
            this.html = sb.toString();
        } catch (InterruptedException ignored) {
            // Interrupted during the initial sleep: end quietly and leave html null.
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

}



import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import java.net.URL;



/**
 * Worker thread that copies the content behind a URL into a local file.
 *
 * <p>If the thread is interrupted, copying stops before the next chunk is read
 * and the destination file is left incomplete. Any failure prints the stack
 * trace and terminates the JVM with exit status 1.
 */
public class DownloadThread extends Thread {

    private final String urlStr;

    private final String filePath;

    /**
     * @param urlStr   URL to download
     * @param filePath path of the file to write (created or overwritten)
     */
    public DownloadThread(String urlStr, String filePath) {
        this.urlStr = urlStr;
        this.filePath = filePath;
    }

    /** Opens the URL and the destination file, copies the data, and closes both streams. */
    @Override
    public void run() {
        try {
            URL url = new URL(urlStr);
            InputStream is = url.openStream();
            try {
                File pdfFile = new File(filePath);
                FileOutputStream os = new FileOutputStream(pdfFile);
                try {
                    copyStream(is, os);
                } finally {
                    // Close on every path, including a failed or abandoned copy.
                    os.close();
                }
            } finally {
                is.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Copies bytes from {@code inputStream} to {@code outputStream} in 1024-byte
     * chunks until the input is exhausted or the current thread is interrupted,
     * then flushes the output. The caller still has to close both streams.
     *
     * @param inputStream  source to read from
     * @param outputStream sink to write to
     * @throws IOException if reading, writing or flushing fails
     */
    private void copyStream(InputStream inputStream, OutputStream outputStream)
            throws IOException {
        byte[] b = new byte[1024];
        int len;
        // interrupt() cannot break a blocked read; checking the flag between
        // chunks lets an abandoned worker stop writing to the destination file.
        // isInterrupted() leaves the flag set for whoever inspects it afterwards.
        while (!Thread.currentThread().isInterrupted()
                && (len = inputStream.read(b)) > 0) {
            outputStream.write(b, 0, len);
        }
        outputStream.flush();
    }

}

你可能感兴趣的:(java)