京东简单爬虫

先在浏览器开发者工具中找到视频的请求 URL,发现该 URL 需要携带参数 vid(推测为视频 ID)。
在商品页面源码中搜索 vid,可以找到对应的 infoVideoId 和 mainVideoId。
提取这两个 ID,拼接出视频接口的 URL,然后模拟请求,
即可从响应中的 playUrl 取得视频地址并下载视频。

package com.example.shares.utils;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.io.FileUtils;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CrawlingJinDong {
    //获取String类型页面
    public static String getHtmlStr(String url) throws IOException {
        WebClient webClient = new WebClient();
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(false);
        webClient.getOptions().setUseInsecureSSL(true);
        HtmlPage htmlPage = webClient.getPage(url);
        String html =htmlPage.asXml();
        webClient.close();
        return html;
    }

    //写入文件
    public static void writerIntoFile(String html,String path) throws IOException {
       //新建文件,并放入文件写入流中
        File  file = new File(path);
        FileWriter fileWriter = new FileWriter(file);
        //把String类型的html写入文件写入流中
        fileWriter.write(html);
        fileWriter.close();
    }

    //读出文件
    public static String readFile(String html,String path) throws IOException {
        //通过缓冲取读取文件读入流中的文件
        BufferedReader bufferedReader = new BufferedReader(new FileReader(path));
        StringBuffer sb = new StringBuffer();
        //将缓冲区的文件读出到字符串中
        String contentLine = bufferedReader.readLine();
        while(contentLine!=null){
            sb.append(contentLine);
            sb.append("\r\n");
            contentLine=bufferedReader.readLine();
        }
        bufferedReader.close();
        return sb.toString();//转为字符串输出
    }


    public static void main(String args[]) throws IOException {
        String url="https://item.jd.com/100003311437.html";
        String htmlFilePath = "D:/picture/jidong/taideng.html";
        String resourcesFile = "D:/picture/jidong/";
        String html = getHtmlStr(url);
        writerIntoFile(html,htmlFilePath);
        String html1 = readFile(html,htmlFilePath);

        System.out.println("======================  开始爬取  ========================");

        //获取页面标题
        String str1="
(.+?)
"
;//正则表达式 Pattern pattern = Pattern.compile(str1,Pattern.DOTALL); Matcher matcher = pattern.matcher(html1); while(matcher.find()){ System.out.println(matcher.group(1).replaceFirst("","").trim()); } //获取视频 //"infoVideoId":"126016285","mainVideoId":"99177800" //先查找请求中的响应VideoId String str2 ="infoVideoId\":\"(\\d+?)\",\"mainVideoId\":\"(\\d+?)\""; Pattern pattern1 = Pattern.compile(str2); Matcher matcher1 = pattern1.matcher(html1); String infoVideoId=null,mainVideoId=null; while(matcher1.find()) { infoVideoId = matcher1.group(1); mainVideoId = matcher1.group(2); } System.out.println("infoVideoId:"+infoVideoId+" \nmainVideoId:"+mainVideoId); //进行请求模拟,获取请求响应头 String infoStr=HttpClientUtil.get("https://cd.jd.com/tencent/video_v2?vid="+infoVideoId,"utf-8"); String mainStr=HttpClientUtil.get("https://cd.jd.com/tencent/video_v3?vid="+mainVideoId,"utf-8"); System.out.println("请求响应:"+"infoStr"+infoStr+"\n"+mainStr); //在请求响应头中查找MP4对应的url,并存入list中 String str3="\"playUrl\":\"(.+?)\""; Pattern pattern2 = Pattern.compile(str3); Matcher matcher2 = pattern2.matcher(infoStr); Matcher matcher3 = pattern2.matcher(mainStr); List<String> list = new ArrayList<>(); while(matcher2.find()) { list.add(matcher2.group(1)); System.out.println("matcher2"+matcher2.group(1)); } while(matcher3.find()) { list.add(matcher3.group(1)); System.out.println("matcher3"+matcher3.group(1)); } //开始准备视频下载 System.out.println("------------------------开始准备下载----------------------"); for(int i=0;i<list.size();i++){ System.out.print("正在下载 ====="+list.get(i)); URL urlMp4 = new URL(list.get(i)); URLConnection con = urlMp4.openConnection(); con.setConnectTimeout(10*1000); InputStream inputStream = con.getInputStream(); FileUtils.copyInputStreamToFile(inputStream, new File(resourcesFile + i + ".mp4")); } } }

你可能感兴趣的:(java爬虫)