pom.xml
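<?xml version="1.0" encoding="UTF-8"?>
<!-- Reconstructed from tag-stripped text: element values are original, element names follow the standard Maven quickstart layout. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.spider</groupId>
  <artifactId>spider</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>spider</name>
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.0.3</version>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.53</version>
    </dependency>
    <!-- NOTE(review): these coordinates equal the project's own; listed as a dependency in the source text - verify. -->
    <dependency>
      <groupId>com.spider</groupId>
      <artifactId>spider</artifactId>
      <version>1.0-SNAPSHOT</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <!-- NOTE(review): only a single "6" survived in the source text; the element name is assumed, and a matching <target>, if any, was lost. -->
          <source>6</source>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>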
代码:
package com.spider;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.*;
import java.net.URL;
import java.util.Date;
/**
* @author 赵鑫
* @Time: 2018/7/14
* @Email:[email protected]
*/
public class JD {
public static void main(String[] args) throws Exception{
for (int i=0;i<3;i++){
BufferedWriter bw1=new BufferedWriter(new FileWriter("spider/comment/第"+(i+1)+"页评论内容"+".txt"));
BufferedWriter bw2=new BufferedWriter(new FileWriter("第"+(i+1)+"页评论图片连接"+".txt"));
// 1.用Jsoup解析网页
String url="https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv73104&productId=7437788&score=0&sortType=5&page="+i+"&pageSize=10&isShadowSku=0&fold=1";
CloseableHttpResponse indexRes = sendGet(url);
// 获取json内容,将其转换为字符串
String indexHtml = EntityUtils.toString(indexRes.getEntity(), "UTF-8");
// 截取成json字符串
String json2=indexHtml.substring(indexHtml.indexOf('(')+1,indexHtml.lastIndexOf(')'));
// 获取评论
JSONArray array = JSON.parseObject(json2).getJSONArray("comments");
for (Object item : array) {
//获取评论中的内容
System.out.println(JSON.parseObject(item.toString()).getString("content"));
bw1.write(JSON.parseObject(item.toString()).getString("content"));
JSONArray array1 = JSON.parseObject(item.toString()).getJSONArray("images");
System.out.println(array1.size());
for (Object item1 : array1) {
String s=JSON.parseObject(item1.toString()).getString("imgUrl");
System.out.println(s);
Download("http:"+s);
bw2.write(JSON.parseObject(item1.toString()).getString("imgUrl")+"\n");
}
}
bw1.close();
bw2.close();
}
}
//发送get请求,获取响应结果
public static CloseableHttpResponse sendGet(String url) throws IOException {
//创建httpClient客户端
CloseableHttpClient httpClient = HttpClients.createDefault();
//创建请求对象,发送请求
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36");
httpGet.setHeader("Cookie", "__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; unpl=V2_ZzNtbUBTFkV9XxUDLEkMA2IGQVpLBBRAd19ABi4QWgIwVkZZclRCFXwURlRnGloUZAEZWUVcQBRFCEdkexhdBGYBGlhLVXNILGYFAX5SCQBXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543032744.1543334183.4; __jdc=122270672; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd4zd4|tuiguang|35da9fbffaa744b68bfd3f7cd876fde5|1543334183097; PCSYCityID=412; _gcl_au=1.1.1618589019.1543334587; wlfstk_smdl=kpyxn7dgfu7ntzeriqf1nuoyf1pvmmz6; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; logintype=wx; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; npin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; pinId=KCcQw4HqqMKLC3rBXJmYjQ; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sB2NQRwVbUAJGSkkcCBliBBdXQVECWB9VS19SblEUWlkMB1tKeRpdBW4fElJBW1tLHkgSXAxsBhBiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; JSESSIONID=3AD0CE01D03F107D0B4F45BED45F806D.s1; shshshsID=52730810bd55258389660fdba736586f_15_1543337522651; __jdb=122270672.17.1011906297|4.1543334183; thor=__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; PCSYCityID=412; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; jwotest_product=99; 
unpl=V2_ZzNtbRVUQBAmD0EGexlVBmJQQlsSBEQUcQxCXHxKXVFnBEFZclRCFXwURlRnGlQUZwEZXkJcRhJFCEdkexhdBGYBGlhLVXNILGYFAX1BDFlXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543342650.1543440108.7; __jdc=122270672; thor=EBE4F58722D3C53DD96909AC59EFA6D1BC94658FBAC118C866693EA1EBF5169985038D47FF0615A96C39E195C704E4269C0AE2B142F2A6CF58BFF0E3C588B282CFE4B6DB95B893DBB7528C8A117BF09C2BB8B1A6955DEA1B2D00A191464B5CC90B094977CD8D55F54EAE17D856F65E1A4577319BD2627227472617F7462C4E24; pinId=KCcQw4HqqMKLC3rBXJmYjQ; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd7iam|tuiguang|f224b66c11824ba7ab6055596b0e16b5|1543440132756; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sUWJRElFbDAFGHkgRXRliAxMCQVFSXEtVGl8GYFYbWloIUwkceRpdBW4fElJBW1pLH0sSXwZsABRiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; _gcl_au=1.1.816281936.1543440149; shshshsID=d687c8924c94ae61b9d19c6054056eef_4_1543440174304; __jdb=122270672.7.1011906297|7.1543440108");
httpGet.setHeader("Connection", "keep-alive");
CloseableHttpResponse response = httpClient.execute(httpGet);
return response;
}
//下载图片
private static void Download(String listImgSrc) throws Exception {
try {
String url=listImgSrc;
String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
URL uri = new URL(url);
InputStream in = uri.openStream();
FileOutputStream fo = new FileOutputStream(new File("spider/img/"+imageName));//文件输出流
byte[] buf = new byte[1024];
int length = 0;
System.out.println("开始下载:" + url);
while ((length = in.read(buf, 0, buf.length)) != -1) {
fo.write(buf, 0, length);
}
in.close();
fo.close();
System.out.println(imageName + "下载完成");
Date overdate = new Date();
} catch (Exception e) {
System.out.println("下载失败");
}
}
}