文章来源:http://www.oschina.net/code/snippet_1447924_45939
多线程部分根据网络代码改编,自己添加Jsoap 模块支持代理,jsoup-1.8.1.jar(需要的jar包在网上)
1、java代码:
package constant; public class Constant { public static final String proxyHost = "*.*.*.*";//代理IP地址 public static final String proxyPort = "8080";//代理端口 public static final String AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79"; public static final String IMAGE_PATH = "D:\\IMAGE";//图片存放地址 }2、java代码:
package downloadImg; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; import java.util.List; import constant.Constant; import proxy.UseProxy; public class DownloadImage implements Runnable { private int imageCount = 0; private File image = null; private URL imageUrl = null; private List<String> images = null; private BufferedInputStream inputStream = null; private BufferedOutputStream outputStream = null; public DownloadImage(List<String> image) { this.images = image; } @Override public void run() { // TODO Auto-generated method stub SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd_HHmmssSSS"); try { while (!images.isEmpty()) { new UseProxy(); imageUrl = new URL(images.remove(0)); imageUrl.openConnection().setConnectTimeout(12000); imageUrl.openConnection().setReadTimeout(12000); inputStream = new BufferedInputStream(imageUrl.openStream()); image = new File(Constant.IMAGE_PATH + "\\" /*+ dateFormat.format(new Date())*/+ getFileName(imageUrl)); if (!image.getParentFile().exists()) { image.getParentFile().mkdirs(); } outputStream = new BufferedOutputStream(new FileOutputStream(image)); byte[] buf = new byte[2048]; int length = inputStream.read(buf); while (length != -1) { outputStream.write(buf, 0, length); length = inputStream.read(buf); } next(); } // wait(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { System.out.println("链接解析失败---" + imageUrl); e.printStackTrace(); } finally { try { next(); } catch (IOException e) { e.printStackTrace(); } } } private String getFileName(URL url) { String fileName = url.getFile(); return fileName.substring(fileName.lastIndexOf('/') + 1); } public void next() throws IOException { if (inputStream != null) { inputStream.close(); } if (outputStream != null) { outputStream.close(); } image = null; // images = null; imageUrl = null; inputStream = null; outputStream = null; System.gc(); System.out.println("DownloadImage >>> " + ++imageCount); } }3、java代码:
package myjsoap; import java.util.List; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import proxy.UseProxy; import constant.Constant; import downloadImg.DownloadImage; public class DealUrl implements Runnable{ // 已解析url队列 private List<String> visited = null; // 未解析url队列 private List<String> hrefs = null; // 图片链接队列 private List<String> images = null; //已解析链接数 private int analyze = 0; private int count = 0; public DealUrl(List<String> hrefs, List<String> visited, List<String> images) { this.hrefs = hrefs; this.visited = visited; this.images = images; } public void run() { while (!hrefs.isEmpty()) { // 把当前要解析的url字符串从hrefs移到visited String urlTmp = hrefs.remove(0); if (visited.indexOf(urlTmp) != -1) continue; visited.add(urlTmp); Document doc = getUrlDoc((String) visited.get(visited.size() - 1)); if (doc == null) continue; System.out.println("已解析第 " + ++analyze + " 个连接。。。"+urlTmp); Elements hrefLinks = doc.select("a[href]"); Elements imgLinks = doc.select("img[src]"); if (hrefLinks != null) for (Element link : hrefLinks) { String newUrl = link.attr("abs:href"); if (newUrl.indexOf("ququ") != -1) hrefs.add(newUrl); // System.out.println(++count + " >>> " + // link.attr("abs:href")); } if (imgLinks == null) continue; for (Element link : imgLinks) { String temImgUrl = link.attr("abs:src"); if (temImgUrl.indexOf(".jpg") != -1 && images.indexOf(temImgUrl) == -1) { images.add(link.attr("abs:src")); System.out.println("img:"+link.attr("abs:src")); } } new Thread(new DownloadImage(images)).start(); } System.gc(); } public Document getUrlDoc(String url){ Document doc = null; try { new UseProxy();//不是代理上网的可以注释掉 Connection conneciton = Jsoup.connect(url); conneciton.userAgent(Constant.AGENT); doc = conneciton.get(); } catch (Exception e) { System.out.println("connect fail!"); return null; } return doc; } }
package proxy; import java.util.Properties; import constant.Constant; public class UseProxy { public UseProxy() { Properties prop = System.getProperties(); prop.setProperty("http.proxyHost", Constant.proxyHost); prop.setProperty("http.proxyPort", Constant.proxyPort); } }5、java代码:
package start; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import myjsoap.DealUrl; public class MyRobot { private List<String> hrefs = Collections.synchronizedList(new ArrayList<String>()); private List<String> visited = Collections.synchronizedList(new ArrayList<String>()); private List<String> images = Collections.synchronizedList(new ArrayList<String>()); public MyRobot(String href) { hrefs.add(href); } public void run() throws InterruptedException { ExecutorService pool = Executors.newFixedThreadPool(2); pool.execute(new DealUrl(hrefs, visited, images)); Thread.sleep(8000); pool.execute(new DealUrl(hrefs, visited, images)); pool.shutdown(); } public static void main(String[] args) throws InterruptedException { MyRobot robot = new MyRobot("http://500ququ.com/"); robot.run(); } }