利用Jsoup爬取网站的图片,保存到本地

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

学习的过程中可能会用到其他的API,下面用一个类做一个简单的测试:

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
/**
 * Small jsoup demo: fetches an HTML page, selects every {@code <img>} whose
 * {@code src} ends in {@code .jpg}, and downloads each image into a local
 * directory under a sequentially numbered file name.
 *
 * <p>Not thread-safe: the file-name counter is a plain static field.
 */
public class JsoupTest {

    /** Page that is crawled when no URL is given on the command line. */
    private static final String DEFAULT_PAGE_URL = "http://tieba.baidu.com/p/4549504175";

    /** Directory the downloaded images are written to. */
    private static final String IMAGE_DIR = "f:/img";

    /** Downloaded files are named {@code <prefix><counter><suffix>}. */
    private static final String FILE_PREFIX = "haha";
    private static final String FILE_SUFFIX = ".jpg";

    /** Size of the copy buffer in bytes. */
    private static final int BUFFER_SIZE = 1024;

    /** Number of download attempts so far; used to build unique file names. */
    private static int count = 0;

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} overrides the default page URL
     * @throws IOException kept for signature compatibility
     */
    public static void main(String[] args) throws IOException {
        JsoupTest jsoupTest = new JsoupTest();
        String url = args.length > 0 ? args[0] : DEFAULT_PAGE_URL;
        // 1. Simple use of jsoup
        jsoupTest.getHtmlElements(url);
    }

    /**
     * Downloads the image at {@code destUrl} into the image directory.
     *
     * <p>Every call consumes one counter value, whether or not the download
     * succeeds. Failures are reported on {@code System.err} and never thrown
     * to the caller.
     *
     * @param destUrl absolute {@code http} or {@code https} URL of the image
     */
    public void saveToFile(String destUrl) {
        // Claim the index up front so that failed attempts also use up a number.
        final int index = count++;
        HttpURLConnection connection = null;
        try {
            URL url = new URL(destUrl);
            String protocol = url.getProtocol();
            // Checked explicitly instead of relying on a ClassCastException from the cast below.
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
                System.err.println("Unsupported protocol, skipping: " + destUrl);
                return;
            }
            connection = (HttpURLConnection) url.openConnection();
            connection.connect();

            // try-with-resources closes each stream independently, even if the other close fails.
            try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream())) {
                System.out.println(stripSchemeAndExtension(destUrl));

                File dir = new File(IMAGE_DIR);
                if (!dir.isDirectory() && !dir.mkdirs()) {
                    System.err.println("Cannot create image directory: " + dir.getAbsolutePath());
                    return;
                }
                // Build the target from the same directory object that was just created.
                File file = new File(dir, FILE_PREFIX + index + FILE_SUFFIX);
                System.out.println(file.getAbsolutePath());

                try (FileOutputStream out = new FileOutputStream(file)) {
                    byte[] buf = new byte[BUFFER_SIZE];
                    int size;
                    while ((size = in.read(buf)) != -1) {
                        out.write(buf, 0, size);
                    }
                    out.flush();
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to download " + destUrl + ": " + e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Returns {@code address} without its {@code scheme://} prefix and without
     * the part from the last dot onwards. Used only for the progress output.
     * Never throws for a non-null argument.
     */
    private static String stripSchemeAndExtension(String address) {
        int schemeEnd = address.indexOf("://");
        int start = schemeEnd >= 0 ? schemeEnd + 3 : 0;
        int lastDot = address.lastIndexOf('.');
        int end = lastDot > start ? lastDot : address.length();
        return address.substring(start, end);
    }

    /**
     * Fetches the page at {@code url} and downloads every image whose
     * {@code src} attribute ends in {@code .jpg}.
     *
     * @param url address of the HTML page to parse
     */
    private void getHtmlElements(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            // Select the img elements whose src ends in .jpg
            Elements jpgImages = doc.select("img[src$=.jpg]");
            for (Element image : jpgImages) {
                // Resolve relative and protocol-relative sources against the page URL.
                String absolute = image.absUrl("src");
                // absUrl yields "" when it cannot resolve; pass the raw value so the failure is reported.
                String target = absolute.isEmpty() ? image.attr("src") : absolute;
                saveToFile(target);
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch page " + url + ": " + e);
        }
    }
}

参考资料

1.http://www.open-open.com/jsoup/parsing-a-document.htm

2.http://blog.csdn.net/withiter/article/details/15339579

3.http://blog.csdn.net/csh159/article/details/7310009

你可能感兴趣的:(java)