Jsoup——抖音视频抓取(一)

楔子

好奇害死猫。搞了一夜

抓取抖音小视频

工具

Fiddler 和安卓手机。关于如何用 Fiddler 对安卓手机应用抓包,网上的图文教程已经讲得很清楚,请自行搜索。

1:获取个人关注

Jsoup——抖音视频抓取(一)_第1张图片
上图的地址是https://api.amemv.com/aweme/v1/user/following/list/?user_id=93267622308&max_time=1522535483&count=20&retry_type=no_retry&iid=29648784234&device_id=41459906457&ac=wifi&channel=meizu&aid=1128&app_name=aweme&version_code=179&version_name=1.7.9&device_platform=android&ssmix=a&device_type=m1+metal&device_brand=Meizu&language=zh&os_api=22&os_version=5.1&uuid=869014028487941&openudid=a60b54dc77755f2f&manifest_version_code=179&resolution=1080*1920&dpi=480&update_version_code=1792&_rticket=1522535483501&ts=1522535482&as=a105b0fcfaa35aacf04715&cp=0535a754a50dc7cae1ihza&mas=008b2f7d28109a62d85dfa44f97aa3f6daac8cec2c0c66c68686ac

Jsoup——抖音视频抓取(一)_第2张图片
每位用户的信息很多,此处只关注需要的字段。

个人关注信息列表

package cn.zhuzi.douyin;

import java.io.IOException;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import cn.zhuzi.douyin.bean.FollPerson;
import cn.zhuzi.douyin.bean.MyCareBean;

import com.alibaba.fastjson.JSON;

/**
 * Fetches the "following" list of a Douyin user and prints the nicknames of
 * the followed accounts.
 *
 * @author grq
 */
public class MyCare {
    /**
     * Address of the "following list" API as captured from the app.
     * Per the original author's note, this address changes dynamically, so
     * the idea is to fetch it once and keep the followed-user information as
     * text.
     */
    static String care_url = "https://api.amemv.com/aweme/v1/user/following/list/?user_id=93267622308&max_time=1522542279&count=20&retry_type=no_retry&iid=29648784234&device_id=41459906457&ac=wifi&channel=meizu&aid=1128&app_name=aweme&version_code=179&version_name=1.7.9&device_platform=android&ssmix=a&device_type=m1+metal&device_brand=Meizu&language=zh&os_api=22&os_version=5.1&uuid=869014028487941&openudid=a60b54dc77755f2f&manifest_version_code=179&resolution=1080*1920&dpi=480&update_version_code=1792&_rticket=1522542279863&ts=1522542279&as=a165723c573c4a96706506&cp=2dc7a95e7205cf62e1btnv&mas=006ce86afb4332bfcd460be343eb756f146c0cac6c0cec6c9c862c";

    /**
     * Requests {@link #care_url}, prints the raw JSON response, then prints
     * the followings array as JSON followed by one nickname per line.
     * An {@link IOException} from the request is printed, not rethrown.
     */
    public static void getMyCare() {
        try {
            // ignoreContentType(true) is required; without it jsoup throws
            // UnsupportedMimeTypeException: Unhandled content type. Must be
            // text/*, application/xml, ...
            //
            // Read the raw response body instead of document.body().html():
            // the response is JSON, and passing it through the HTML parser
            // and serializer escapes characters such as '&' and reformats
            // the text, which corrupts values (e.g. URLs) before parsing.
            String careStr = Jsoup.connect(care_url).ignoreContentType(true).execute().body();
            System.out.println(careStr);

            MyCareBean myCare = JSON.parseObject(careStr, MyCareBean.class);
            FollPerson[] followings = myCare == null ? null : myCare.getFollowings();
            if (followings == null) {
                // Empty body or a response without the expected field.
                System.err.println("No followings found in response from " + care_url);
                return;
            }

            System.out.println(JSON.toJSONString(followings));
            for (FollPerson follPerson : followings) {
                System.out.println(follPerson.getNickname());
            }

        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    /**
     * Prints the current time in milliseconds, then runs {@link #getMyCare()}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(System.currentTimeMillis());
        getMyCare();
    }

}

2:获取主页

主页信息是 https://www.douyin.com/share/user/+uid (如下图)
Jsoup——抖音视频抓取(一)_第3张图片
Jsoup——抖音视频抓取(一)_第4张图片
Jsoup——抖音视频抓取(一)_第5张图片

上图恰好和主页列表数量一样,推测 主页列表地址是 https://www.douyin.com/aweme/v1/aweme/post/?user_id=58900737309&count=21&max_cursor=0&aid=1128

2.1:获取主页作品信息

Jsoup——抖音视频抓取(一)_第6张图片

package cn.zhuzi.douyin;

import java.io.IOException;
import java.util.List;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import cn.zhuzi.douyin.bean.FollPerson;
import cn.zhuzi.douyin.bean.MyCareBean;
import cn.zhuzi.douyin.bean.PserWork;
import cn.zhuzi.douyin.bean.WorkList;

import com.alibaba.fastjson.JSON;

/**
 * Fetches information about a followed user: the share page
 * ({@link #getPserWork()}) and the list of published works ({@link #main}).
 */
public class CarePerson {
    /**
     * Prefix of the works-list address; the user id is appended right after
     * it. The related share page looks like
     * https://www.douyin.com/share/user/58900737309
     */
    public static String perHost = "https://www.douyin.com/aweme/v1/aweme/post/?user_id=";

    /** Query-string suffix appended after the user id. */
    public static String fexHost = "&count=21&max_cursor=0&aid=1128";

    /**
     * Base address of a user's share page; the user id is appended to it.
     * The full works-list address, for comparison, is
     * https://www.douyin.com/aweme/v1/aweme/post/?user_id=58900737309&count=21&max_cursor=0&aid=1128
     */
    public static String baseUrlForPer = "https://www.douyin.com/share/user/";

    /** Following list whose first entry is used by {@link #getPserWork()}. */
    public MyCareBean myCareBean;

    public String getPerHost() {
        return perHost;
    }

    public void setPerHost(String perHost) {
        // perHost is static; assign through the class rather than "this".
        CarePerson.perHost = perHost;
    }

    public MyCareBean getMyCareBean() {
        return myCareBean;
    }

    public void setMyCareBean(MyCareBean myCareBean) {
        this.myCareBean = myCareBean;
    }

    public static String getBaseUrlForPer() {
        return baseUrlForPer;
    }

    public static void setBaseUrlForPer(String baseUrlForPer) {
        CarePerson.baseUrlForPer = baseUrlForPer;
    }

    /**
     * Downloads the share page of the first followed user in
     * {@link #myCareBean} and prints its body HTML to standard error.
     * An {@link IOException} from the request is printed, not rethrown.
     *
     * @throws IllegalStateException if {@link #myCareBean} is not set or
     *                               contains no followings
     */
    public void getPserWork() {
        FollPerson[] followings = myCareBean == null ? null : myCareBean.getFollowings();
        if (followings == null || followings.length == 0) {
            throw new IllegalStateException("myCareBean has no followings; set it before calling getPserWork()");
        }

        String uid = followings[0].getUid();
        Connection connect = Jsoup.connect(baseUrlForPer + uid);

        try {
            Document document = connect.ignoreContentType(true).get();
            System.err.println(document.body().html());
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    /**
     * Requests the works list of the first user in a hard-coded following
     * list and prints each work as JSON.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String mycatString = "[{'nickname':'一珺、','uid':'52616983119'},{'nickname':'Imperia_小然然','uid':'61141281259'},{'nickname':'光哥','uid':'58900737309'}]";
        // The element type must be declared: with a raw List, get(0) is an
        // Object and getUid() does not compile.
        List<FollPerson> parseArray = JSON.parseArray(mycatString, FollPerson.class);

        String url = CarePerson.perHost + parseArray.get(0).getUid() + CarePerson.fexHost;
        try {
            // The response is JSON, so read the raw body. Going through
            // Document.body().html() would HTML-escape characters such as
            // '&' and reformat the text before it reaches the JSON parser.
            String json = Jsoup.connect(url).ignoreContentType(true).execute().body();
            PserWork parsework = JSON.parseObject(json, PserWork.class);
            List<WorkList> aweme_list = parsework == null ? null : parsework.getAweme_list();
            if (aweme_list == null) {
                // Empty body or a response without the expected field.
                System.err.println("No works found in response from " + url);
                return;
            }
            for (WorkList workList : aweme_list) {
                System.out.println(JSON.toJSON(workList));
            }

        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}

3:根据URL下载具体的内容

此处以 https://www.douyin.com/share/video/6536877257548369155 为例子

代码如下

package cn.zhuzi.douyin;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import cn.zhuzi.douyin.bean.Mp4Parent;

import com.alibaba.fastjson.JSON;

/**
 * Downloads the video behind a Douyin share page: loads the page, extracts
 * the JSON object embedded in its last script element, reads the first play
 * address from it and saves the video to a local file.
 */
public class DownFromDetailUrl {

    /**
     * Runs the download for one hard-coded share URL. Problems with the page
     * layout are reported on standard error; an {@link IOException} is
     * printed, not rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String url = "https://www.douyin.com/share/video/6536877257548369155";
            Connection connect = Jsoup.connect(url);
            Document pageContext = connect.followRedirects(true).get();
            Elements elementsByTag = pageContext.getElementsByTag("script");
            if (elementsByTag.isEmpty()) {
                System.err.println("No script element found in " + url);
                return;
            }

            // The video data is taken from the last script element of the page.
            String eleStr = elementsByTag.get(elementsByTag.size() - 1).toString();
            int arrayStart = eleStr.indexOf("[{");
            int arrayEnd = eleStr.lastIndexOf("}]");
            System.out.println(arrayStart);
            System.out.println(arrayEnd);
            if (arrayStart < 0 || arrayEnd < arrayStart) {
                // Without this check a missing marker yields an empty or
                // unrelated substring and a confusing failure further down.
                System.err.println("No embedded JSON array found in the last script element of " + url);
                return;
            }
            // Skip the leading '[' and stop after the last '}' so that only
            // the object text between the array brackets is parsed.
            String mp4urlText = eleStr.substring(arrayStart + 1, arrayEnd + 1);

            Mp4Parent mp4Parent = JSON.parseObject(mp4urlText, Mp4Parent.class);
            if (mp4Parent == null
                    || mp4Parent.getVideo() == null
                    || mp4Parent.getVideo().getPlay_addr() == null
                    || mp4Parent.getVideo().getPlay_addr().getUrl_list() == null
                    || mp4Parent.getVideo().getPlay_addr().getUrl_list().isEmpty()) {
                System.err.println("No play address found in the page data of " + url);
                return;
            }
            String string = mp4Parent.getVideo().getPlay_addr().getUrl_list().get(0);
            System.out.println(string);
            // Resulting url, for example:
            // https://aweme.snssdk.com/aweme/v1/playwm/?video_id=380e29ed5af54d22896d933c81980c31&line=0

            // Download the video. maxBodySize(0) lifts jsoup's default limit
            // on the response body; with the default, larger videos are
            // silently truncated.
            // NOTE(review): the timeout may bound the whole transfer in
            // recent jsoup versions - raise it if large downloads time out.
            Response document = Jsoup.connect(string).ignoreContentType(true).maxBodySize(0).timeout(8000).execute();
            BufferedInputStream stream = document.bodyStream();
            // copyInputStreamToFile closes the source stream when done.
            FileUtils.copyInputStreamToFile(stream, new File("c://heheda.mp4"));

        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}

后记

之前看过一个笑话:“强扭的瓜不甜。”“没关系,扭下来我就开心。”不知道为什么,为了这个没什么用的东西搞了一夜,但是搞完了还是有点开心。明天整理代码。
Jsoup——抖音视频抓取(一)_第7张图片

你可能感兴趣的:(java,jsoup)