简易爬虫--360图片爬取

爬取网站http://image.so.com/ 360的图片搜索

先随便输入要搜索的图片内容进入结果页,可以发现图片是随着页面下拉滚动动态加载的。在浏览器中按 F12 打开开发者工具,切换到"网络(Network)"面板进行抓包
这里写图片描述
抓到后点击查看,先查看请求路径与请求方式
简易爬虫--360图片爬取_第1张图片
然后翻到最后看请求参数,get请求也可以直接在url地址上查看请求参数
简易爬虫--360图片爬取_第2张图片
最后分析返回的json参数,拿到所需要的资源
简易爬虫--360图片爬取_第3张图片
也可以通过https://www.bejson.com/ JSON转换器查看JSON串获取内容
简易爬虫--360图片爬取_第4张图片
如果不确定资源是否正确可以复制图片连接在浏览器上打开
简易爬虫--360图片爬取_第5张图片
好了,简单分析之后上代码。首先是 pom.xml:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.da</groupId>
    <artifactId>Spider1</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>war</packaging>

    <dependencies>
        <!-- HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>

        <!-- fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
    </dependencies>

</project>

依赖文件只需要json解析的工具和httpclient即可

主程序:

package com.da.test;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.da.util.HttpClientUtil;
import com.da.util.UrlFileDownloadUtil;

/**
 * Demo entry point: queries the 360 image-search JSON endpoint for a fixed keyword and
 * downloads the thumbnail of every returned item.
 */
public class Demo1 {
    /** JSON endpoint that the search page calls while the user scrolls. */
    private static final String SEARCH_URL = "http://image.so.com/j";
    /** Search keyword, sent as both the "q" and the "correct" request parameter. */
    private static final String KEYWORD = "蘑菇头表情包";

    /**
     * Runs one search request and hands the collected thumbnail URLs and file names to
     * {@code UrlFileDownloadUtil.downloadPicture}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Map<String, String> param = new HashMap<>();
        param.put("q", KEYWORD);
        param.put("src", "srp");
        param.put("correct", KEYWORD);
        param.put("pn", "60");
        param.put("sn", "0");

        // doGet returns "" on any failure; JSON.parseObject("") would yield null and the
        // chained lookup would then fail with a NullPointerException.
        String body = HttpClientUtil.doGet(SEARCH_URL, param);
        if (body == null || body.isEmpty()) {
            System.err.println("Empty response from " + SEARCH_URL + ", nothing to download");
            return;
        }

        JSONObject root = JSON.parseObject(body);
        JSONArray items = (root == null) ? null : root.getJSONArray("list");
        if (items == null) {
            System.err.println("Response contains no \"list\" array, nothing to download");
            return;
        }

        List<String> urls = new ArrayList<>(items.size());
        List<String> names = new ArrayList<>(items.size());
        for (int i = 0; i < items.size(); i++) {
            JSONObject item = items.getJSONObject(i);
            String href = item.getString("thumb");
            String id = item.getString("id");
            if (href == null || id == null) {
                // Without a URL or an id there is nothing to fetch or no name to save it under.
                continue;
            }
            String type = item.getString("imgtype");
            if (type == null || type.equalsIgnoreCase("other")) {
                // Fall back to the extension found in the thumbnail URL.
                type = href.substring(href.lastIndexOf(".") + 1);
            }
            // Both lists are appended together so that indexes always line up.
            urls.add(href);
            names.add(id + "." + type.toLowerCase(Locale.ROOT));
        }
        UrlFileDownloadUtil.downloadPicture(urls, names);
    }
}

q 与 correct 是你需要搜索的内容,pn 是每次请求返回的数据条数

工具类:

package com.da.util;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * Static helpers for simple HTTP GET and POST requests, built on Apache HttpClient with a
 * pooled connection manager.
 *
 * <p>Every method is best-effort: a failure is printed to standard error and an empty
 * string is returned, so callers must treat {@code ""} as "no usable response".
 */
public class HttpClientUtil {
    /** Timeout in milliseconds used for connect, socket read and pool lease. */
    private static final int MAX_TIMEOUT = 5000;
    /** Charset used to encode form parameters and to decode response bodies. */
    private static final String CHARSET = "UTF-8";

    private static final PoolingHttpClientConnectionManager connMgr;
    private static final RequestConfig requestConfig;
    /** Single client reused for all requests; it is never closed, which keeps the pool alive. */
    private static final CloseableHttpClient httpClient;

    static {
        // Connection pool and its size limits.
        connMgr = new PoolingHttpClientConnectionManager();
        connMgr.setMaxTotal(200);
        connMgr.setDefaultMaxPerRoute(connMgr.getMaxTotal());

        RequestConfig.Builder configBuilder = RequestConfig.custom();
        // Connect timeout.
        configBuilder.setConnectTimeout(MAX_TIMEOUT);
        // Socket read timeout.
        configBuilder.setSocketTimeout(MAX_TIMEOUT);
        // Timeout for leasing a connection from the pool.
        configBuilder.setConnectionRequestTimeout(MAX_TIMEOUT);
        // Optional: check that a connection is usable before submitting a request.
        // configBuilder.setStaleConnectionCheckEnabled(true);
        // Optional: route requests through a proxy.
        // configBuilder.setProxy(new HttpHost("119.249.48.235", 80));
        requestConfig = configBuilder.build();

        httpClient = HttpClients.custom().setConnectionManager(connMgr).build();
    }

    /**
     * Sends a GET request with the given query parameters.
     *
     * @param url   target URL
     * @param param query parameters to append, or {@code null} for none
     * @return the response body decoded as UTF-8 when the status code is 200; otherwise
     *         (non-200 status or any error) an empty string
     */
    public static String doGet(String url, Map<String, String> param) {
        String resultString = "";
        try {
            URIBuilder builder = new URIBuilder(url);
            if (param != null) {
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            URI uri = builder.build();

            HttpGet httpGet = new HttpGet(uri);
            httpGet.setConfig(requestConfig);

            // try-with-resources closes the response on every path.
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() == 200) {
                    resultString = EntityUtils.toString(response.getEntity(), CHARSET);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultString;
    }

    /**
     * Sends a GET request without extra query parameters.
     *
     * @param url target URL
     * @return see {@link #doGet(String, Map)}
     */
    public static String doGet(String url) {
        return doGet(url, null);
    }

    /**
     * Sends a POST request whose body is the given parameters encoded as a URL-encoded form.
     *
     * @param url   target URL
     * @param param form fields, or {@code null} to send no body
     * @return the response body decoded as UTF-8 (whatever the status code), or an empty
     *         string on any error
     */
    public static String doPost(String url, Map<String, String> param) {
        try {
            HttpPost httpPost = new HttpPost(url);
            httpPost.setConfig(requestConfig);
            if (param != null) {
                List<NameValuePair> paramList = new ArrayList<>(param.size());
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    paramList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                // Explicit charset: without it non-ASCII form values are not encoded as UTF-8.
                httpPost.setEntity(new UrlEncodedFormEntity(paramList, CHARSET));
            }
            return executeForBody(httpPost);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Sends a POST request without a body.
     *
     * @param url target URL
     * @return see {@link #doPost(String, Map)}
     */
    public static String doPost(String url) {
        return doPost(url, null);
    }

    /**
     * Sends a POST request with a JSON body ({@code Content-Type: application/json}).
     *
     * @param url  target URL
     * @param json request body
     * @return the response body decoded as UTF-8 (whatever the status code), or an empty
     *         string on any error
     */
    public static String doPostJson(String url, String json) {
        try {
            HttpPost httpPost = new HttpPost(url);
            httpPost.setConfig(requestConfig);
            httpPost.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
            return executeForBody(httpPost);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /** Executes the POST and returns its body as UTF-8 text, always closing the response. */
    private static String executeForBody(HttpPost httpPost) throws IOException {
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            return EntityUtils.toString(response.getEntity(), CHARSET);
        }
    }
}

HttpClient 的工具类是网上随便找的,这类轮子一大把,没必要自己去写

package com.da.util;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

/**
 * Downloads remote files (for example images) into a local directory.
 */
public class UrlFileDownloadUtil {
    /** Directory used by {@link #downloadPicture(List, List)}. */
    private static final String DEFAULT_BASE_DIR = "E:\\spider\\";

    /**
     * Downloads every URL in {@code urlList} into the default directory, saving the i-th
     * URL under the i-th entry of {@code names}.
     *
     * @param urlList URLs to download
     * @param names   target file names, parallel to {@code urlList}
     * @throws IllegalArgumentException if {@code names} has fewer entries than {@code urlList}
     */
    public static void downloadPicture(List<String> urlList, List<String> names) {
        downloadPicture(urlList, names, DEFAULT_BASE_DIR);
    }

    /**
     * Downloads every URL in {@code urlList} into {@code baseDir}, saving the i-th URL
     * under the i-th entry of {@code names}. The directory is created when it is missing.
     * A failure on one item is printed to standard error and the remaining items are
     * still processed.
     *
     * @param urlList URLs to download
     * @param names   target file names, parallel to {@code urlList}
     * @param baseDir directory that receives the files
     * @throws IllegalArgumentException if {@code names} has fewer entries than {@code urlList}
     */
    public static void downloadPicture(List<String> urlList, List<String> names, String baseDir) {
        if (names.size() < urlList.size()) {
            // Fail before any download instead of with an index error half-way through.
            throw new IllegalArgumentException(
                    "names has " + names.size() + " entries but urlList has " + urlList.size());
        }

        File targetDir = new File(baseDir);
        if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
            System.err.println("Cannot create download directory: " + targetDir);
            return;
        }

        byte[] buffer = new byte[1024 * 50];
        for (int i = 0; i < urlList.size(); i++) {
            String address = urlList.get(i);
            String name = names.get(i);
            if (address == null || name == null) {
                System.err.println("Skipping entry " + i + ": missing URL or file name");
                continue;
            }
            File target = new File(targetDir, name);
            try {
                URL url = new URL(address);
                // Both streams are closed even when the copy fails half-way. The input
                // is opened first, so an unreachable URL leaves no empty file behind.
                try (DataInputStream in = new DataInputStream(url.openStream());
                        FileOutputStream out = new FileOutputStream(target)) {
                    int length;
                    while ((length = in.read(buffer)) != -1) {
                        out.write(buffer, 0, length);
                    }
                }
                System.out.println("已经下载:" + target.getPath());
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

将 baseDir 改为你想保存图片的本地目录即可

你可能感兴趣的:(爬虫)