爬取网站http://image.so.com/ 360的图片搜索
先随便输入要搜索的图片内容进入网页,发现它的加载方式是拖动(滚动)加载图片。在浏览器中按F12打开调试模式,选择网络(Network)面板抓包
抓到后点击查看,先查看请求路径与请求方式
然后翻到最后看请求参数,get请求也可以直接在url地址上查看请求参数
最后分析返回的json参数,拿到所需要的资源
也可以通过https://www.bejson.com/ JSON转换器查看JSON串获取内容
如果不确定资源是否正确,可以复制图片链接在浏览器上打开
好了,XJB分析之后上代码:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.da</groupId>
    <artifactId>Spider1</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>war</packaging>
    <dependencies>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
    </dependencies>
</project>
依赖文件只需要json解析的工具和httpclient即可
主程序:
package com.da.test;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.da.util.HttpClientUtil;
import com.da.util.UrlFileDownloadUtil;
public class Demo1 {
public static void main(String[] args) {
String url = "http://image.so.com/j";
Map param = new HashMap();
param.put("q", "蘑菇头表情包");
param.put("src", "srp");
param.put("correct", "蘑菇头表情包");
param.put("pn", "60");
param.put("sn", "0");
String string = HttpClientUtil.doGet(url, param);
// System.out.println(string);
JSONArray jsonArray = JSON.parseArray(JSON.parseObject(string).get("list").toString());
// System.out.println(jsonArray.size());
List urls = new ArrayList<>();
List names = new ArrayList<>();
for (int i = 0; i < jsonArray.size(); i++) {
String href = jsonArray.getJSONObject(i).get("thumb").toString();
urls.add(href);
String id = jsonArray.getJSONObject(i).get("id").toString();
String type = jsonArray.getJSONObject(i).get("imgtype").toString();
if (type.equalsIgnoreCase("other")) {
type = href.substring(href.lastIndexOf(".") + 1);
}
names.add(id + "." + type.toLowerCase());
}
UrlFileDownloadUtil.downloadPicture(urls, names);
}
}
q与correct是你需要搜索的内容,pn是返回多少条数据
工具类:
package com.da.util;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
/**
 * Static helpers for simple GET / form POST / JSON POST requests on top of Apache
 * HttpClient, backed by one pooled connection manager.
 *
 * <p>All methods return the response body as a string, or an empty string when the
 * request fails; failures are printed via {@code printStackTrace()} and not rethrown.
 */
public class HttpClientUtil {
    /** Timeout in milliseconds used for connect, socket read and pool lease. */
    private static final int MAX_TIMEOUT = 5000;
    private static final PoolingHttpClientConnectionManager connMgr;
    private static final RequestConfig requestConfig;
    /** Built once on top of the shared pool and reused instead of building a client per call. */
    private static final CloseableHttpClient HTTP_CLIENT;

    static {
        // Connection pool and its size limits.
        connMgr = new PoolingHttpClientConnectionManager();
        connMgr.setMaxTotal(200);
        connMgr.setDefaultMaxPerRoute(connMgr.getMaxTotal());

        RequestConfig.Builder configBuilder = RequestConfig.custom();
        // Connect timeout.
        configBuilder.setConnectTimeout(MAX_TIMEOUT);
        // Read (socket) timeout.
        configBuilder.setSocketTimeout(MAX_TIMEOUT);
        // Timeout for leasing a connection from the pool.
        configBuilder.setConnectionRequestTimeout(MAX_TIMEOUT);
        requestConfig = configBuilder.build();

        HTTP_CLIENT = HttpClients.custom().setConnectionManager(connMgr).build();
    }

    /**
     * Sends a GET request with the given query parameters.
     *
     * @param url   target URL
     * @param param query parameters appended to the URL, may be {@code null}
     * @return the body decoded as UTF-8 when the status code is 200, otherwise an empty string
     */
    public static String doGet(String url, Map<String, String> param) {
        String resultString = "";
        try {
            URIBuilder builder = new URIBuilder(url);
            if (param != null) {
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            URI uri = builder.build();

            HttpGet httpGet = new HttpGet(uri);
            httpGet.setConfig(requestConfig);

            // try-with-resources closes the response on every path.
            try (CloseableHttpResponse response = HTTP_CLIENT.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() == 200) {
                    resultString = EntityUtils.toString(response.getEntity(), "UTF-8");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultString;
    }

    /**
     * Sends a GET request without query parameters.
     *
     * @param url target URL
     * @return see {@link #doGet(String, Map)}
     */
    public static String doGet(String url) {
        return doGet(url, null);
    }

    /**
     * Sends a POST request with the parameters encoded as a URL-encoded form.
     *
     * @param url   target URL
     * @param param form fields, may be {@code null} for an empty body
     * @return the response body decoded as UTF-8, or an empty string on failure
     */
    public static String doPost(String url, Map<String, String> param) {
        String resultString = "";
        try {
            HttpPost httpPost = new HttpPost(url);
            if (param != null) {
                List<NameValuePair> paramList = new ArrayList<>();
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    paramList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                // Explicit charset: without it the form is encoded as ISO-8859-1,
                // which corrupts non-Latin values.
                httpPost.setEntity(new UrlEncodedFormEntity(paramList, "UTF-8"));
            }
            resultString = executePost(httpPost);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultString;
    }

    /**
     * Sends a POST request with an empty body.
     *
     * @param url target URL
     * @return see {@link #doPost(String, Map)}
     */
    public static String doPost(String url) {
        return doPost(url, null);
    }

    /**
     * Sends a POST request whose body is the given JSON text
     * ({@code ContentType.APPLICATION_JSON}).
     *
     * @param url  target URL
     * @param json request body
     * @return the response body decoded as UTF-8, or an empty string on failure
     */
    public static String doPostJson(String url, String json) {
        String resultString = "";
        try {
            HttpPost httpPost = new HttpPost(url);
            httpPost.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
            resultString = executePost(httpPost);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultString;
    }

    /** Applies the shared request config, executes the POST and returns the body as UTF-8 text. */
    private static String executePost(HttpPost httpPost) throws IOException {
        httpPost.setConfig(requestConfig);
        // The response is closed even when reading the entity throws; a failed execute()
        // never reaches close(), so there is no null response to dereference.
        try (CloseableHttpResponse response = HTTP_CLIENT.execute(httpPost)) {
            return EntityUtils.toString(response.getEntity(), "utf-8");
        }
    }
}
httpclient的轮子网上随便找的,大把没必要自己去写
package com.da.util;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
/**
 * Helper that downloads a list of remote files to a fixed local directory.
 */
public class UrlFileDownloadUtil {
    /**
     * Downloads each URL in {@code urlList} and stores it under the base directory
     * using the file name at the same index in {@code names}.
     *
     * <p>A failure on one entry (malformed URL or I/O error) is printed and the
     * remaining entries are still processed.
     *
     * @param urlList URLs of the pictures to download
     * @param names   target file names; {@code names.get(i)} is used for {@code urlList.get(i)}
     * @throws IllegalArgumentException if {@code names} has fewer entries than {@code urlList}
     */
    public static void downloadPicture(List<String> urlList, List<String> names) {
        // Fail before any download starts instead of running out of names midway.
        if (names.size() < urlList.size()) {
            throw new IllegalArgumentException(
                    "names has " + names.size() + " entries but urlList has " + urlList.size());
        }
        String baseDir = "E:\\spider\\";
        byte[] buffer = new byte[1024 * 50];
        for (int i = 0; i < urlList.size(); i++) {
            String target = baseDir + names.get(i);
            try {
                URL url = new URL(urlList.get(i));
                // try-with-resources closes both streams even when the copy fails.
                try (DataInputStream dataInputStream = new DataInputStream(url.openStream());
                        FileOutputStream fileOutputStream = new FileOutputStream(new File(target))) {
                    int length;
                    // read() signals end of stream with -1.
                    while ((length = dataInputStream.read(buffer)) != -1) {
                        fileOutputStream.write(buffer, 0, length);
                    }
                }
                System.out.println("已经下载:" + target);
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
baseDir 设置下载位置即可