JAVA 爬取京东评论和图片

pom.xml



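<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.spider</groupId>
  <artifactId>spider</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>spider</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.0.3</version>
      <scope>compile</scope>
    </dependency>

    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
    </dependency>

    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.53</version>
    </dependency>

    <!-- NOTE: these are the project's own coordinates; Maven normally rejects a
         self-referencing dependency, so remove this entry if the build fails on it. -->
    <dependency>
      <groupId>com.spider</groupId>
      <artifactId>spider</artifactId>
      <version>1.0-SNAPSHOT</version>
    </dependency>


  </dependencies>

  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>

        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>6</source>
          <target>6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>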

代码:

package com.spider;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.*;
import java.net.URL;
import java.util.Date;
/**
 * @author 赵鑫
 * @Time: 2018/7/14
 * @Email:[email protected]
 */
public class JD {
    public static void main(String[] args) throws Exception{
        for (int i=0;i<3;i++){
        BufferedWriter bw1=new BufferedWriter(new FileWriter("spider/comment/第"+(i+1)+"页评论内容"+".txt"));
        BufferedWriter bw2=new BufferedWriter(new FileWriter("第"+(i+1)+"页评论图片连接"+".txt"));
//     1.用Jsoup解析网页
        String url="https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv73104&productId=7437788&score=0&sortType=5&page="+i+"&pageSize=10&isShadowSku=0&fold=1";
        CloseableHttpResponse indexRes = sendGet(url);
//        获取json内容,将其转换为字符串
        String indexHtml = EntityUtils.toString(indexRes.getEntity(), "UTF-8");
//        截取成json字符串
        String json2=indexHtml.substring(indexHtml.indexOf('(')+1,indexHtml.lastIndexOf(')'));
//        获取评论
        JSONArray array = JSON.parseObject(json2).getJSONArray("comments");
        for (Object item : array) {
            //获取评论中的内容
            System.out.println(JSON.parseObject(item.toString()).getString("content"));
            bw1.write(JSON.parseObject(item.toString()).getString("content"));
            JSONArray array1 = JSON.parseObject(item.toString()).getJSONArray("images");
            System.out.println(array1.size());
            for (Object item1 : array1) {
                String s=JSON.parseObject(item1.toString()).getString("imgUrl");
                System.out.println(s);
                Download("http:"+s);
                bw2.write(JSON.parseObject(item1.toString()).getString("imgUrl")+"\n");
            }
        }
        bw1.close();
        bw2.close();
        }
    }
    //发送get请求,获取响应结果
    public static CloseableHttpResponse sendGet(String url) throws IOException {
        //创建httpClient客户端
        CloseableHttpClient httpClient = HttpClients.createDefault();
        //创建请求对象,发送请求
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36");
        httpGet.setHeader("Cookie", "__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; unpl=V2_ZzNtbUBTFkV9XxUDLEkMA2IGQVpLBBRAd19ABi4QWgIwVkZZclRCFXwURlRnGloUZAEZWUVcQBRFCEdkexhdBGYBGlhLVXNILGYFAX5SCQBXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543032744.1543334183.4; __jdc=122270672; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd4zd4|tuiguang|35da9fbffaa744b68bfd3f7cd876fde5|1543334183097; PCSYCityID=412; _gcl_au=1.1.1618589019.1543334587; wlfstk_smdl=kpyxn7dgfu7ntzeriqf1nuoyf1pvmmz6; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; logintype=wx; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; npin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; pinId=KCcQw4HqqMKLC3rBXJmYjQ; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sB2NQRwVbUAJGSkkcCBliBBdXQVECWB9VS19SblEUWlkMB1tKeRpdBW4fElJBW1tLHkgSXAxsBhBiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; JSESSIONID=3AD0CE01D03F107D0B4F45BED45F806D.s1; shshshsID=52730810bd55258389660fdba736586f_15_1543337522651; __jdb=122270672.17.1011906297|4.1543334183; thor=__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; PCSYCityID=412; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; jwotest_product=99; 
unpl=V2_ZzNtbRVUQBAmD0EGexlVBmJQQlsSBEQUcQxCXHxKXVFnBEFZclRCFXwURlRnGlQUZwEZXkJcRhJFCEdkexhdBGYBGlhLVXNILGYFAX1BDFlXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543342650.1543440108.7; __jdc=122270672; thor=EBE4F58722D3C53DD96909AC59EFA6D1BC94658FBAC118C866693EA1EBF5169985038D47FF0615A96C39E195C704E4269C0AE2B142F2A6CF58BFF0E3C588B282CFE4B6DB95B893DBB7528C8A117BF09C2BB8B1A6955DEA1B2D00A191464B5CC90B094977CD8D55F54EAE17D856F65E1A4577319BD2627227472617F7462C4E24; pinId=KCcQw4HqqMKLC3rBXJmYjQ; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd7iam|tuiguang|f224b66c11824ba7ab6055596b0e16b5|1543440132756; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sUWJRElFbDAFGHkgRXRliAxMCQVFSXEtVGl8GYFYbWloIUwkceRpdBW4fElJBW1pLH0sSXwZsABRiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; _gcl_au=1.1.816281936.1543440149; shshshsID=d687c8924c94ae61b9d19c6054056eef_4_1543440174304; __jdb=122270672.7.1011906297|7.1543440108");
        httpGet.setHeader("Connection", "keep-alive");
        CloseableHttpResponse response = httpClient.execute(httpGet);
        return response;
    }
    //下载图片
    private static void Download(String listImgSrc) throws Exception {
        try {
            String url=listImgSrc;
            String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
            URL uri = new URL(url);
            InputStream in = uri.openStream();
            FileOutputStream fo = new FileOutputStream(new File("spider/img/"+imageName));//文件输出流
            byte[] buf = new byte[1024];
            int length = 0;
            System.out.println("开始下载:" + url);
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                fo.write(buf, 0, length);
            }
            in.close();
            fo.close();
            System.out.println(imageName + "下载完成");
            Date overdate = new Date();
        } catch (Exception e) {
            System.out.println("下载失败");
        }
    }
}

 

你可能感兴趣的:(JAVA 爬取京东评论和图片)