【阿楠的blog】抖音用户信息爬取和FFmpeg常用命令总结

一、最近在尝试抓抖音用户数据，抓到了一个share_url（下面附有代码），抓取的过程中发现抖音的账号、点赞数、作品数都是以字符集的形式显示，目的是防止爬虫抓取，细节，细节，细节（重要的事情说三遍）字符集的形式后面文章会讲。

数据抓取

访问过去看了下，
https://www.iesdouyin.com/share/user/102064772608

有数据的。那就拿数据。

然后发现它对数字做了字符集映射。

搞他。

下载 .ttf 字体文件：s3a.bytecdn.cn/ies/resource/falcon/douyin_falcon/static/font/iconfont_da2e2ef.ttf，然后使用 FontCreator 软件打开，看到这个图片我们就明白了字体与数字的关系。

既然我们看到 num_ 对应数字 1，num_8 对应数字 7，那这些 num_8 之类的名字是怎么得到的，与网页源码中 ‘&#xe60a;’ 这样的字符编码有啥关系？

2.这个时候，需要大家先安装 fontTools（pip install fontTools），然后使用 fontTools 打开 ttf 文件并转化成 xml 文件，

采用下面代码

# Dump the icon font to XML so its tables can be inspected by hand.
from fontTools.ttLib import TTFont
font_1 = TTFont('douyin.ttf')  # presumably the .ttf downloaded above, saved locally as douyin.ttf
font_1.saveXML('font_1.xml')  # writes the XML dump next to the script as font_1.xml

我们看到了font_1.xml，

这就能看出来了

import re
def getDouyinNum(douIDNumCode):
    """Map obfuscated icon-font entity fragments to the digits they render as.

    Each item of ``douIDNumCode`` is an entity fragment such as
    ``' &#xe602; '``. It is normalised to a ``'0xe602'``-style key and looked
    up through the code point -> glyph name -> digit tables below.

    Args:
        douIDNumCode: Iterable of entity fragments (for example the list
            returned by ``re.findall``), or ``''`` when nothing was captured.

    Returns:
        ``''`` when ``douIDNumCode`` is the empty string (kept for backward
        compatibility). Otherwise a dict mapping every recognised
        ``'0x....'`` key to its digit as a one-character string. Fragments
        that are not in the table are skipped.
    """
    # Code point (as '0x....') -> glyph name used inside the font file.
    mapCode2Font = {
        '0xe602': 'num_',
        '0xe603': 'num_1',
        '0xe604': 'num_2',
        '0xe605': 'num_3',
        '0xe606': 'num_4',
        '0xe607': 'num_5',
        '0xe608': 'num_6',
        '0xe609': 'num_7',
        '0xe60a': 'num_8',
        '0xe60b': 'num_9',
        '0xe60c': 'num_4',
        '0xe60d': 'num_1',
        '0xe60e': 'num_',
        '0xe60f': 'num_5',
        '0xe610': 'num_3',
        '0xe611': 'num_2',
        '0xe612': 'num_6',
        '0xe613': 'num_8',
        '0xe614': 'num_9',
        '0xe615': 'num_7',
        '0xe616': 'num_1',
        '0xe617': 'num_3',
        '0xe618': 'num_',
        '0xe619': 'num_4',
        '0xe61a': 'num_2',
        '0xe61b': 'num_5',
        '0xe61c': 'num_8',
        '0xe61d': 'num_9',
        '0xe61e': 'num_7',
        '0xe61f': 'num_6',
    }
    # Glyph name -> digit the glyph actually draws (the names are shuffled
    # on purpose, e.g. 'num_' draws 1 and 'num_1' draws 0).
    mapFont2Num = {
        'num_': 1,
        'num_1': 0,
        'num_2': 3,
        'num_3': 2,
        'num_4': 4,
        'num_5': 5,
        'num_6': 6,
        'num_7': 9,
        'num_8': 7,
        'num_9': 8,
    }
    if douIDNumCode == '':
        return ''
    map1 = {}
    for fragment in douIDNumCode:
        # ' &#xe602; ' -> '0xe602'
        key = fragment.replace(' &#', '0').replace('; ', '')
        glyph = mapCode2Font.get(key)
        if glyph is None:
            # Not one of the known digit glyphs (empty capture, other
            # entity, ...): skip it instead of aborting with KeyError.
            continue
        map1[key] = str(mapFont2Num[glyph])
    return map1

def req(share_id, timeout=10):
    """Fetch the Douyin share page of one user.

    Args:
        share_id: User id that is formatted into the share URL.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the script forever.

    Returns:
        The ``requests`` response object for the share page.
    """
    # Imported here because, in this file, the module-level
    # ``import requests`` only appears after the ``__main__`` guard that
    # calls this function, which would otherwise raise NameError.
    import requests

    url2 = 'https://www.iesdouyin.com/share/user/{}'.format(share_id)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/74.0.3729.169 Safari/537.36"
    }
    # NOTE(review): verify=False disables TLS certificate verification. It
    # is kept to preserve existing behaviour; drop it unless it is needed.
    res = requests.get(url2, verify=False, headers=headers, timeout=timeout)
    return res

def parse(res):
    """Print the Douyin ID found in a share page with its digits decoded.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.

    Raises:
        IndexError: If the ``抖音ID：`` marker is not found in the page.
    """
    page = res.text
    description_header = re.findall('抖音ID：     (.*?)   ', page, re.S)
    # Collect the icon-font entity fragments (' &#x....; '). This is the
    # exact shape getDouyinNum() normalises and the loop below rebuilds.
    # A bare '(.*?)' pattern would only ever match empty strings.
    description_back = re.findall(r' &#x[0-9a-f]+; ', page)
    mapDict = getDouyinNum(douIDNumCode=description_back)
    douyin_id = description_header[0]
    for code, digit in mapDict.items():
        # code is '0x....'; dropping the leading '0' gives back 'x....'.
        douyin_id = douyin_id.replace(' &#' + code[1:] + '; ', digit)
    print(douyin_id)


# Script entry point for the regex-based example: fetch one share page and
# print the ID that parse() extracts from it.
if __name__ == '__main__':
    res =  req(102064772608)  # id of the example user's share page
    parse(res)

import re
import requests
from lxml import etree
'''
                         抖音用户基本信息 -> 请求share来获取数据 
'''

def _format_count(digits, unit):
    """Return ``digits`` as a display string, honouring a trailing 'w' unit.

    ``unit`` is the list of text nodes of the ``span.num`` element; when its
    last entry is ``'w'`` the joined digits are divided by 10 and suffixed
    with ``'w'``.
    """
    # assumes counts with a 'w' unit carry exactly one decimal digit - TODO confirm
    if unit and unit[-1].strip() == 'w':
        return str(float(digits) / 10) + 'w'
    return digits


def handle_decode(input_data):
    """Decode the icon-font digits of a share page and extract user info.

    Args:
        input_data: Raw HTML text of the share page, with the obfuscated
            digits still written as ``' &#x....; '`` entity fragments.

    Returns:
        Dict with the keys ``nick_name``, ``douyin_id``, ``describe``,
        ``follow_count``, ``fans`` and ``like``; ``job`` is present only
        when the page contains a verify-info element.

    Raises:
        IndexError: If the nickname, signature or follow-count node is
            missing from the page.
        ValueError: If a count has a ``'w'`` unit but no numeric digits.
    """
    # Entity fragments of the custom icon font, grouped by the digit each
    # one renders as (three code points per digit).
    regex_list = [
        {'name': [' &#xe603; ', ' &#xe60d; ', ' &#xe616; '], 'value': 0},
        {'name': [' &#xe602; ', ' &#xe60e; ', ' &#xe618; '], 'value': 1},
        {'name': [' &#xe605; ', ' &#xe610; ', ' &#xe617; '], 'value': 2},
        {'name': [' &#xe604; ', ' &#xe611; ', ' &#xe61a; '], 'value': 3},
        {'name': [' &#xe606; ', ' &#xe60c; ', ' &#xe619; '], 'value': 4},
        {'name': [' &#xe607; ', ' &#xe60f; ', ' &#xe61b; '], 'value': 5},
        {'name': [' &#xe608; ', ' &#xe612; ', ' &#xe61f; '], 'value': 6},
        {'name': [' &#xe60a; ', ' &#xe613; ', ' &#xe61c; '], 'value': 7},
        {'name': [' &#xe60b; ', ' &#xe614; ', ' &#xe61d; '], 'value': 8},
        {'name': [' &#xe609; ', ' &#xe615; ', ' &#xe61e; '], 'value': 9},
    ]

    # Substitute the real digit for every obfuscated fragment before the
    # HTML is parsed. The fragments are plain literals, so str.replace is
    # enough.
    for entry in regex_list:
        for fragment in entry['name']:
            input_data = input_data.replace(fragment, str(entry['value']))

    html = etree.HTML(input_data)
    info1 = "//div[@class='personal-card']/div[@class='info1']"
    info2 = "//div[@class='personal-card']/div[@class='info2']"
    follow_info = info2 + "/p[@class='follow-info']"
    digit_nodes = "//i[@class='icon iconfont follow-num']/text()"

    douyin_info = {}
    # Nickname
    douyin_info['nick_name'] = html.xpath(info1 + "//p[@class='nickname']/text()")[0]
    # Douyin ID
    douyin_id = html.xpath(info1 + "/p[@class='shortid']//text()")
    douyin_info['douyin_id'] = ''.join(douyin_id).replace('抖音ID：', '').replace(' ', '')

    # Job / verification info: optional, only some accounts have it.
    try:
        douyin_info['job'] = html.xpath(
            info2 + "/div[@class='verify-info']/span[@class='info']/text()")[0].strip()
    except IndexError:
        pass
    # Signature / description
    douyin_info['describe'] = html.xpath(
        info2 + "/p[@class='signature']/text()")[0].replace('\n', ',')
    # Following count
    # NOTE(review): only the first <i> text node is used here, whereas fans
    # and likes join all of them - confirm multi-digit counts are not cut.
    douyin_info['follow_count'] = html.xpath(
        follow_info + "//span[@class='focus block']" + digit_nodes)[0].strip()
    # Fans
    fans_value = ''.join(html.xpath(
        follow_info + "//span[@class='follower block']" + digit_nodes))
    unit = html.xpath(
        follow_info + "//span[@class='follower block']/span[@class='num']/text()")
    douyin_info['fans'] = _format_count(fans_value, unit)
    # Likes
    like = ''.join(html.xpath(
        follow_info + "//span[@class='liked-num block']" + digit_nodes))
    unit = html.xpath(
        follow_info + "//span[@class='liked-num block']/span[@class='num']/text()")
    douyin_info['like'] = _format_count(like, unit)

    return douyin_info


def handle_douyin_info(url, timeout=10):
    """Download a Douyin share page and return the decoded user info.

    Args:
        url: Share-page URL of the user.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot block the caller forever.

    Returns:
        The dict produced by ``handle_decode`` for the page's HTML text.
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    response = requests.get(url=url, headers=header, timeout=timeout)
    return handle_decode(response.text)

# Script entry point for the lxml-based example: decode one user's share
# page and print the resulting info dict.
if __name__ == '__main__':
    url = 'https://www.iesdouyin.com/share/user/102064772608'
    print(handle_douyin_info(url))

反爬虫字体混淆

代码例子。猫眼的字体反爬虫已经升级了，不再是简单的顺序关系，这样取到的值依然是错的，需要根据各个字体里面的字体定义 TTGlyph.contour 的值去判断具体数字。

import requests
import re
import os
from fontTools.ttLib import TTFont
 
 
class MaoYan(object):
    """Scrape rating, vote count and box office from one Maoyan film page.

    The page renders its digits with a per-request web font. The font file
    is downloaded, cached under ``FONT_DIR`` and used to translate the
    ``&#x....;`` entities back to digits.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # Local cache directory for downloaded font files and their XML dumps.
    FONT_DIR = './fonts'

    def __init__(self):
        self.url = 'https://maoyan.com/films/42964'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }
        # Set by create_font(); stays None until a font has been loaded.
        self.font = None

    def get_html(self, url):
        """Send a GET request to ``url`` and return the raw body as bytes."""
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        return response.content

    def create_font(self, font_file):
        """Load ``font_file`` into ``self.font``, downloading it if needed.

        The font is cached in ``FONT_DIR`` and an XML dump is written next
        to it for manual inspection.
        """
        # Create the cache directory on first use; listing a directory that
        # does not exist would raise FileNotFoundError.
        os.makedirs(self.FONT_DIR, exist_ok=True)
        font_path = self.FONT_DIR + '/' + font_file
        if not os.path.exists(font_path):
            # Not cached yet: download the font file.
            print('不在字体库中, 下载:', font_file)
            url = 'http://vfile.meituan.net/colorstone/' + font_file
            new_file = self.get_html(url)
            with open(font_path, 'wb') as f:
                f.write(new_file)

        # Open the font file and keep it on the instance.
        self.font = TTFont(font_path)
        self.font.saveXML(font_path + '.xml')

    def modify_data(self, data):
        """Replace font entities in ``data`` with digits and return it.

        The digit assigned to a glyph is its position in the font's glyph
        order after dropping the first two entries.
        """
        # NOTE(review): the surrounding article states that Maoyan no longer
        # uses a simple sequential order, so this positional mapping can
        # yield wrong digits; the glyph contours would have to be compared.
        print(data)
        gly_list = self.font.getGlyphOrder()
        # The first two glyphs are not digits; drop them.
        gly_list = gly_list[2:]
        for number, gly in enumerate(gly_list):
            # Convert the glyph name ('uniXXXX') to the form used in the
            # page source ('&#xxxxx;').
            gly = gly.replace('uni', '&#x').lower() + ';'
            if gly in data:
                data = data.replace(gly, str(number))
        return data

    def start_crawl(self):
        """Fetch the film page, decode its obfuscated numbers, print them.

        Raises:
            IndexError: If any of the patterns below finds no match.
        """
        html = self.get_html(self.url).decode('utf-8')

        # Name of the .woff font file referenced by the page.
        font_file = re.findall(r'vfile\.meituan\.net\/colorstone\/(\w+\.woff)', html)[0]
        print(font_file)
        self.create_font(font_file)

        # NOTE(review): the three patterns below look as if their HTML tags
        # were stripped (e.g. '\s+(.*?)\s+' matches almost any token);
        # verify them against the live page before relying on the output.
        # Star rating
        star = re.findall(r'\s+(.*?)\s+', html)[0]
        star = self.modify_data(star)

        # Number of people who rated
        people = ''.join(re.findall(r'''(.*?万)(人评分)''', html)[0])
        people = self.modify_data(people)

        # Cumulative box office
        ticket_number = ''.join(re.findall(r'''(.*?)(亿)''', html)[0])
        ticket_number = self.modify_data(ticket_number)

        print('用户评分: %s 星' % star)
        print('评分人数: %s' % people)
        print('累计票房: %s' % ticket_number)
 
 
# Script entry point for the Maoyan example: build the scraper and run one crawl.
if __name__ == '__main__':
    maoyan = MaoYan()
    maoyan.start_crawl()

二、FFmpeg

熟练使用FFmpeg的常用命令，可以验证音视频处理的中间数据转换过程是否正确。以下是平时使用的一些命令总结。

视频

MP4转H264

ffmpeg -i input.mp4 -c:v libx264 -profile:v baseline output.h264

H264转MP4，缩放

ffmpeg -i input.h264 -c:v libx264 -profile:v baseline -vf scale=640x360 output.mp4

jpg转YUV420P

ffmpeg -i input.jpg -c:v rawvideo -pix_fmt yuv420p output.yuv

YUV420P(800x600)转jpg

ffmpeg -pix_fmt yuv420p -s 800x600 -i input.yuv output.jpg

保存RTSP为H264

ffmpeg -i rtsp://192.168.1.2/test.h264 ./output.h264

音频

wav转PCM(16bit, 16KHz, 单声道)

ffmpeg -i input.wav -f s16le -ar 16000 -ac 1 output.pcm

PCM(16bit, 16KHz, 单声道)转AAC

ffmpeg -f s16le -ar 16000 -ac 1 -i input.raw output.aac

FFplay

视频

播放mp4

ffplay input.mp4

播放YUV420P(800x600)

ffplay -pix_fmt yuv420p -s 800x600 output.yuv

低延时播放rtsp

ffplay -probesize 1000 rtsp://192.168.8.106/video2

音频

播放AAC

ffplay input.aac

播放PCM(16bit, 16KHz, 单声道)音频文件

ffplay -f s16le -ar 16000 -ac 1 input.pcm

【阿楠的blog】抖音用户信息爬取和FFmpeg常用命令总结

反爬虫字体混淆

二、FFmpeg

视频

音频

FFplay

视频

音频

你可能感兴趣的:(ffmpeg)