(官网似乎已改版,此代码没用了)
1、在pom文件中配置依赖,或者直接添加 jsoup 1.6.3 的jar包:
<dependencies> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.6.3</version> </dependency> </dependencies>
2、 抓取长颈鹿但丁图片URL:
package com.sxit.jsoup; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * 功能:抓取长颈鹿但丁图片 * 类名:jsoupPic * 作者:smile * 时间:Nov 11, 2012:2:17:57 PM */ public class jsoupPic { public static List<String> getDocument() { List<String> list = new ArrayList<String>(); try { Connection con = null; // 分页后缀 String[] a = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" }; // 遍历语录漫画下面四个项目(哲理、职场、爱情、恶搞) for (int i = 1; i <= 4; i++) { // 单个项目的第一页url String url = "http://www.danding.com.cn/pic_fl_" + i + ".html"; // 建立连接 con = Jsoup.connect(url); // 获取页面document Document doc = con.get(); // 获取末尾是jpg的img标签元素 Elements e = doc.select("img[src$=.jpg]"); // 遍历第一页jpg图片的路径 for (int j = 0; j < e.size(); j++) { Element ei = e.get(j); // System.out.println("第"+i+"页图片地址为----------->>>>> // http://www.danding.com.cn/"+ei.attr("src")); list.add("http://www.danding.com.cn/" + ei.attr("src")); } int flag = 0; while (flag == 0) { // 当前页是否存在下一页 boolean isExist = true; isExist = isExistsNextPage(doc); int k = 0; while (isExist) { // System.out.println("----------------->>>存在下一页"); // 下一页的url地址 url = "http://www.danding.com.cn/pic_fl_" + i + a[k] + ".html"; doc = traverse(url, list); isExist = isExistsNextPage(doc); k++; } flag = 1; } } } catch (IOException e) { e.printStackTrace(); } return list; } /** * 判断是否有下一页 * * @param doc * @return */ public static boolean isExistsNextPage(Document doc) { // 判断当前页是否还有下一页 Elements e = doc.select(":containsOwn(下一页)"); if (e.size() > 
0) { // 有下一页 return true; } else return false; } /** * 遍历document * * @param list * @param doc * @throws IOException */ public static Document traverse(String src, List<String> list) throws IOException { Connection con = Jsoup.connect(src); Document doc = con.get(); // 获取末尾是jpg的标签元素 Elements e = doc.select("img[src$=.jpg]"); for (int j = 0; j < e.size(); j++) { Element ei = e.get(j); list.add("http://www.danding.com.cn/" + ei.attr("src")); } return doc; } }
3、批量下载到本地:
package com.sxit.jsoup; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.HttpURLConnection; import java.net.URL; import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; /** * 功能:批量下载 * 类名:ThreadPoolManage * 作者:smile * 时间:Nov 11, 2012:6:02:49 PM */ public class ThreadPoolManage { final ExecutorService exec = Executors.newFixedThreadPool(20); private String filePath; private List<String> list; // 结束的倒数锁 final CountDownLatch stop = new CountDownLatch(20); public static void main(String[] args) { new ThreadPoolManage("D://xxooThread"); } public ThreadPoolManage(String filePath) { list = jsoupPic.getDocument(); // 启20个线程跑,每个线程只跑总数的1/20,第一个线程跑1-->list.size()/20, // 第二个线程从list.size()/20+1-->2*list.size()/20,最后一个线程则跑(n-1)*list.size()/20+1-->list.size() for (int i = 1; i <= 20; i++) { this.exec.submit(new ImageThread(i, filePath, list, stop)); } try { // 等待stop变为0 stop.await(); } catch (InterruptedException e) { e.printStackTrace(); } // 等所有线程跑完最后关闭ExecutorService exec.shutdown(); } } class ImageThread implements Runnable { private String filePath; private List<String> list; private int index; private final CountDownLatch stop; public ImageThread(int index, String filePath, List<String> ilistst, CountDownLatch stop) { this.index = index; this.filePath = filePath; this.list = ilistst; this.stop = stop; } public void run() { String picUrl = ""; InputStream is = null; OutputStream os = null; URL url = null; HttpURLConnection con = null; // 判断保存路径是否存在 不存在则新建文件夹 File f = new File(filePath); File temp = null; if (!f.exists()) { f.mkdir(); } if (list != null) { // 每次需要跑的数目 int count = list.size() / 20; int start = (index - 1) * count + 1; int end = 0; if (index != 20) { end = index * count; } else { end = 
list.size() - 1; } for (int i = start; i <= end; i++) { picUrl = list.get(i); try { url = new URL(picUrl); con = (HttpURLConnection) url.openConnection(); // 设置连接超时 con.setConnectTimeout(100 * 1000); // 设置读取超时 con.setReadTimeout(100 * 1000); is = new BufferedInputStream(con.getInputStream()); os = new BufferedOutputStream(new FileOutputStream(new File(filePath + "/" + i + ".jpg"))); byte[] b = new byte[1024]; int length = 0; while ((length = is.read(b)) != -1) { os.write(b, 0, length); } os.flush(); System.out.println(index + "号线程----------------->>>>>>>保存完第" + i + "张"); } catch (Exception e) { System.out.println(index + "号线程跑到第" + start + "张图片+++++++++++++抛出异常,异常信息为:" + e.getMessage()); // 抛出异常捕获,继续执行 continue; } } try { if (is != null) { is.close(); } if (os != null) { os.close(); } } catch (IOException e) { e.printStackTrace(); } finally { // 当前线程完成,减1 this.stop.countDown(); } } } }
4、源码如下