使用HttpClient和Jsoup爬取某网的妹子图片

工具:

 - HttpClient     模拟发送请求,获取网站Html数据
 - Jsoup          解析Html数据,获取图片链接
 - Firebug        查看页面信息,寻找爬取规律

代码:

package ren.hz.spider.mzitu;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class GetInfoFromMz {

    /** Entry page listing every album. */
    private static final String BASE_URL = "http://www.mzitu.com/all";
    /** Local directory images are saved under, one sub-directory per album. */
    private static final String SAVE_DIR = "E:/mzitu/";

    /**
     * Crawls the album-list page, follows every album link, and downloads
     * every image of every album page to {@code E:/mzitu/<album title>/}.
     *
     * @return always {@code null}; return type kept for caller compatibility
     */
    public String getInfo() {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try {
            String html = fetchHtml(httpClient, BASE_URL);
            if (html == null) {
                return null;
            }
            Document document = Jsoup.parse(html);
            // All album links live inside <div class="all">.
            Elements as = document.select("div.all").select("a");
            // Print how many album links were found.
            System.out.println(as.size());
            for (Element a : as) {
                // Anchor text (trimmed) becomes the save directory name;
                // href is the album's index URL.
                crawlAlbum(httpClient, a.text().trim(), a.attr("href"));
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one catch suffices.
            e.printStackTrace();
        } finally {
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Downloads every page of a single album.
     *
     * @param title album title, used as the directory name under {@link #SAVE_DIR}
     * @param link  album index URL
     * @throws IOException on transport errors talking to the site
     */
    private void crawlAlbum(CloseableHttpClient httpClient, String title, String link) throws IOException {
        String path = SAVE_DIR + title;
        String albumHtml = fetchHtml(httpClient, link);
        if (albumHtml == null) {
            return;
        }
        Document document = Jsoup.parse(albumHtml);
        // Per the page source, the 11th <span> (index 10) holds the album's
        // maximum page number. (The original comment claimed "the 21st span",
        // contradicting the code; the index 10 actually used is kept.)
        Elements spans = document.select("span");
        if (spans.size() <= 10) {
            // Page layout differs from expectation — skip this album.
            return;
        }
        int maxPage;
        try {
            maxPage = Integer.parseInt(spans.get(10).text().trim());
        } catch (NumberFormatException e) {
            // Span did not contain a page count — skip instead of aborting
            // the whole crawl with an uncaught exception.
            return;
        }
        for (int i = 1; i <= maxPage; i++) {
            // Page 1 uses the bare album URL; later pages append "/<n>".
            String url = (i == 1) ? link : link + "/" + i;
            System.out.println(url);
            String pageHtml = fetchHtml(httpClient, url);
            if (pageHtml == null) {
                continue;
            }
            // The full-size image sits in <div class="main-image"><img src=...>.
            String imgUrl = Jsoup.parse(pageHtml).select("div.main-image").select("img").attr("src");
            if (!imgUrl.isEmpty()) {
                saveImage(httpClient, imgUrl, path);
            }
        }
    }

    /**
     * Executes a GET request and returns the response body as a string,
     * or {@code null} when the HTTP status is not 200.
     * The response is always closed (the original leaked every response).
     *
     * @throws IOException on transport errors
     */
    private String fetchHtml(CloseableHttpClient httpClient, String url) throws IOException {
        CloseableHttpResponse response = httpClient.execute(new HttpGet(url));
        try {
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            return EntityUtils.toString(response.getEntity());
        } finally {
            response.close();
        }
    }

    /**
     * Streams one image to disk under {@code dir}, creating directories as
     * needed. The file name is the last path segment of the image URL.
     * Both streams are closed via try-with-resources (the original leaked the
     * InputStream and leaked the FileOutputStream whenever a write threw).
     *
     * @throws IOException on transport or disk errors
     */
    private void saveImage(CloseableHttpClient httpClient, String imgUrl, String dir) throws IOException {
        File file = new File(dir + "/" + imgUrl.substring(imgUrl.lastIndexOf("/")));
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        CloseableHttpResponse response = httpClient.execute(new HttpGet(imgUrl));
        try (InputStream in = response.getEntity().getContent();
             FileOutputStream out = new FileOutputStream(file)) {
            byte[] buffer = new byte[1024];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            out.flush();
        } finally {
            response.close();
        }
    }

    public static void main(String[] args) {
        new GetInfoFromMz().getInfo();
    }

}

你可能感兴趣的:(HttpClient)