一、最近在尝试抓抖音用户数据,抓到了一个share_url(下面附有代码),抓取的过程中发现抖音的账号、点赞数、作品数都是以字符集的形式显示,目的是防止爬虫抓取,细节,细节,细节(重要的事情说三遍)字符集的形式后面文章会讲。
数据抓取
访问过去看了下,
https://www.iesdouyin.com/share/user/102064772608
有数据的。那就拿数据。
然后发现它对数字做了字符集映射。
搞他。
下载 .ttf 字体文件:s3a.bytecdn.cn/ies/resource/falcon/douyin_falcon/static/font/iconfont_da2e2ef.ttf,然后使用 FontCreator 软件打开,看到这个图片我们就明白了字体与数字的关系。
既然我们看到num_对应数字1,num_8对应数字7,那么这些num_8之类的字形名是怎么得到的,与页面源码中形如‘&#xe60a;’的字符实体有啥关系?
2. 这个时候,需要大家先执行 pip install fontTools 安装 fontTools,再用它打开 ttf 文件并转化成 xml 文件,
采用下面代码
# Load the font file and export it as XML so its tables can be inspected.
from fontTools.ttLib import TTFont
# 'douyin.ttf' is a relative path, resolved against the current working directory.
font_1 = TTFont('douyin.ttf')
# Write the XML export to 'font_1.xml' (also a relative path).
font_1.saveXML('font_1.xml')
我们看到了font_1.xml,
这就能看出来了
import re
def _normalize_font_code(raw_code):
    """Reduce one captured icon-font code to the ``'0x....'`` table key form.

    Accepts ``'0xe603'``, ``' xe603'``, ``' xe603; '`` and the HTML entity
    form ``' &#xe603; '``; surrounding whitespace, a leading ``&#`` and a
    trailing ``;`` are all dropped.
    """
    code = raw_code.strip().lower()
    if code.startswith('&#'):
        code = code[2:]
    code = code.rstrip(';').strip()
    if code.startswith('x'):
        code = '0' + code
    return code


def getDouyinNum(douIDNumCode):
    """Map Douyin icon-font code points to the digits they render as.

    Args:
        douIDNumCode: Iterable of code strings captured from the page (see
            ``_normalize_font_code`` for the accepted spellings), or ``''``.

    Returns:
        ``''`` when ``douIDNumCode`` is the empty string; otherwise a dict
        mapping each normalized code (``'0x....'``) to its digit as a string.

    Raises:
        KeyError: If a code is not one of the known icon-font code points.
    """
    # Code point -> glyph name.
    mapCode2Font = {
        '0xe602': 'num_',
        '0xe603': 'num_1',
        '0xe604': 'num_2',
        '0xe605': 'num_3',
        '0xe606': 'num_4',
        '0xe607': 'num_5',
        '0xe608': 'num_6',
        '0xe609': 'num_7',
        '0xe60a': 'num_8',
        '0xe60b': 'num_9',
        '0xe60c': 'num_4',
        '0xe60d': 'num_1',
        '0xe60e': 'num_',
        '0xe60f': 'num_5',
        '0xe610': 'num_3',
        '0xe611': 'num_2',
        '0xe612': 'num_6',
        '0xe613': 'num_8',
        '0xe614': 'num_9',
        '0xe615': 'num_7',
        '0xe616': 'num_1',
        '0xe617': 'num_3',
        '0xe618': 'num_',
        '0xe619': 'num_4',
        '0xe61a': 'num_2',
        '0xe61b': 'num_5',
        '0xe61c': 'num_8',
        '0xe61d': 'num_9',
        '0xe61e': 'num_7',
        '0xe61f': 'num_6',
    }
    # Glyph name -> digit actually drawn by that glyph (not the name's suffix).
    mapFont2Num = {
        'num_': 1,
        'num_1': 0,
        'num_2': 3,
        'num_3': 2,
        'num_4': 4,
        'num_5': 5,
        'num_6': 6,
        'num_7': 9,
        'num_8': 7,
        'num_9': 8,
    }
    # Kept for backward compatibility: an empty string yields an empty string.
    if douIDNumCode == '':
        return ''
    map1 = {}
    for raw_code in douIDNumCode:
        code = _normalize_font_code(raw_code)
        map1[code] = str(mapFont2Num[mapCode2Font[code]])
    return map1
def req(share_id):
    """Fetch the public share page of one Douyin user.

    Args:
        share_id: User id that is formatted into the share URL.

    Returns:
        The ``requests`` response object for the share page.
    """
    # Imported here because no ``requests`` import precedes this function.
    import requests

    url2 = 'https://www.iesdouyin.com/share/user/{}'.format(share_id)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/74.0.3729.169 Safari/537.36"
    }
    # TLS verification is left at its default (enabled); the previous
    # ``verify=False`` accepted any certificate. The timeout keeps a stalled
    # connection from blocking forever.
    res = requests.get(url2, headers=headers, timeout=10)
    return res
def parse(res):
    """Print the Douyin ID found in a share page with icon-font digits decoded.

    Args:
        res: Response-like object whose ``text`` attribute holds the page HTML.

    Raises:
        IndexError: If the ``抖音ID`` pattern does not match the page.
        KeyError: If ``getDouyinNum`` meets a code it does not know.
    """
    # NOTE(review): both patterns appear to have lost the HTML markup that
    # originally surrounded the capture groups; they are kept verbatim here.
    # Confirm them against the live page before relying on this function.
    description_header = re.findall('抖音ID: (.*?) ',res.text,re.S)
    description_back = re.findall('(.*?)',res.text,re.S)
    mapDict = getDouyinNum(douIDNumCode=description_back)
    douyin_id = description_header[0]
    for code, digit in mapDict.items():
        # ``code`` is '0x....'; on the page the same code point is written as
        # the entity '&#x....;', optionally padded by one space on each side.
        # The '&#' prefix is optional so the bare ' x....; ' form still matches.
        token = re.escape(code[1:] + ';')
        douyin_id = re.sub(' ?(?:&#)?' + token + ' ?', digit, douyin_id)
    print(douyin_id)
if __name__ == '__main__':
    # Fetch one sample share page and print its decoded Douyin ID.
    parse(req(102064772608))
import re
import requests
from lxml import etree
'''
抖音用户基本信息 -> 请求share来获取数据
'''
# Icon-font code point -> digit it renders as. Derived from the code-point and
# glyph tables in getDouyinNum: every digit is reachable through three code
# points.
_ICON_FONT_DIGITS = {
    'e603': '0', 'e60d': '0', 'e616': '0',
    'e602': '1', 'e60e': '1', 'e618': '1',
    'e605': '2', 'e610': '2', 'e617': '2',
    'e604': '3', 'e611': '3', 'e61a': '3',
    'e606': '4', 'e60c': '4', 'e619': '4',
    'e607': '5', 'e60f': '5', 'e61b': '5',
    'e608': '6', 'e612': '6', 'e61f': '6',
    'e60a': '7', 'e613': '7', 'e61c': '7',
    'e60b': '8', 'e614': '8', 'e61d': '8',
    'e609': '9', 'e615': '9', 'e61e': '9',
}

# One icon-font entity, together with any whitespace padding around it.
_ICON_ENTITY_RE = re.compile(r'\s*&#x(e6[01][0-9a-f]);\s*', re.IGNORECASE)


def _decode_icon_font(text):
    """Replace every known icon-font entity in ``text`` with its digit."""
    def _to_digit(match):
        # Unknown code points are left untouched.
        return _ICON_FONT_DIGITS.get(match.group(1).lower(), match.group(0))

    return _ICON_ENTITY_RE.sub(_to_digit, text)


def _joined_text(text_nodes):
    """Concatenate XPath text nodes and drop all whitespace between them."""
    return ''.join(''.join(text_nodes).split())


def _format_count(digits, unit_texts):
    """Return ``digits`` as shown on the page, rescaled when the unit is 'w'.

    When the last unit text is ``'w'`` the joined digits are divided by 10 and
    suffixed with ``'w'``.
    NOTE(review): presumably the page renders one decimal place outside the
    icon glyphs, which is why the digits are divided by 10 -- confirm.
    """
    if digits and unit_texts and unit_texts[-1].strip() == 'w':
        return str(float(digits) / 10) + 'w'
    return digits


def handle_decode(input_data):
    """Extract a user's basic profile fields from a Douyin share page.

    Args:
        input_data: HTML text of the share page, icon-font entities included.

    Returns:
        Dict with ``nick_name``, ``douyin_id``, ``describe``, ``follow_count``,
        ``fans`` and ``like``; ``job`` is present only when the page has it.

    Raises:
        IndexError: If the nickname element is missing from the page.
    """
    # Decode the custom-font digits before parsing, while they are still
    # spelled as '&#x....;' entities in the raw text.
    input_data = _decode_icon_font(input_data)
    html = etree.HTML(input_data)

    info1 = "//div[@class='personal-card']/div[@class='info1']"
    info2 = "//div[@class='personal-card']/div[@class='info2']"
    stats = info2 + "/p[@class='follow-info']"
    digits_path = "//i[@class='icon iconfont follow-num']/text()"
    unit_path = "/span[@class='num']/text()"

    douyin_info = {}
    # Nickname
    douyin_info['nick_name'] = html.xpath(info1 + "//p[@class='nickname']/text()")[0]
    # Douyin ID
    douyin_id = html.xpath(info1 + "/p[@class='shortid']//text()")
    douyin_info['douyin_id'] = ''.join(douyin_id).replace('抖音ID:', '').replace(' ', '')
    # Verified job / title: optional, the key is omitted when absent.
    try:
        douyin_info['job'] = html.xpath(
            info2 + "/div[@class='verify-info']/span[@class='info']/text()")[0].strip()
    except IndexError:
        pass
    # Signature; a profile without one yields an empty string.
    signature = html.xpath(info2 + "/p[@class='signature']/text()")
    douyin_info['describe'] = signature[0].replace('\n', ',') if signature else ''
    # Following count. Joined like fans/likes below so that a count spread
    # over several <i> nodes is not cut down to its first node.
    douyin_info['follow_count'] = _joined_text(
        html.xpath(stats + "//span[@class='focus block']" + digits_path))
    # Fans
    fans_value = _joined_text(
        html.xpath(stats + "//span[@class='follower block']" + digits_path))
    unit = html.xpath(stats + "//span[@class='follower block']" + unit_path)
    douyin_info['fans'] = _format_count(fans_value, unit)
    # Likes
    like = _joined_text(
        html.xpath(stats + "//span[@class='liked-num block']" + digits_path))
    unit = html.xpath(stats + "//span[@class='liked-num block']" + unit_path)
    douyin_info['like'] = _format_count(like, unit)
    return douyin_info
def handle_douyin_info(url):
    """Download a Douyin share page and return its decoded profile fields.

    Args:
        url: Address of the user's share page.

    Returns:
        The dict produced by ``handle_decode`` for the page text.
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url=url, headers=header, timeout=10)
    return handle_decode(response.text)
if __name__ == '__main__':
    # Decode and print the profile of one sample user.
    print(handle_douyin_info('https://www.iesdouyin.com/share/user/102064772608'))
反爬虫字体混淆
代码例子。猫眼的字体反爬虫已经升级了,不再是简单的顺序关系,这样取到的值依然是错的。需要根据各个字体里面的字形定义 TTGlyph.contour 的值去判断具体数字。
import requests
import re
import os
from fontTools.ttLib import TTFont
class MaoYan(object):
    """Scrape rating, rater count and box office from one Maoyan film page.

    The page obfuscates digits with a per-request web font; the font is
    downloaded into ``./fonts`` and its glyph order is used to decode them.
    """

    def __init__(self):
        self.url = 'https://maoyan.com/films/42964'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def get_html(self, url):
        """Send a GET request and return the raw response body as bytes."""
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content

    def create_font(self, font_file):
        """Load ``font_file`` into ``self.font``, downloading it if needed.

        The font is cached under ``./fonts`` and an XML dump is written next
        to it as ``<font_file>.xml``.
        """
        # The cache directory may not exist on a first run.
        os.makedirs('./fonts', exist_ok=True)
        file_list = os.listdir('./fonts')
        if font_file not in file_list:
            # Not cached yet: fetch the font and store it.
            print('不在字体库中, 下载:', font_file)
            url = 'http://vfile.meituan.net/colorstone/' + font_file
            new_file = self.get_html(url)
            with open('./fonts/' + font_file, 'wb') as f:
                f.write(new_file)
        self.font = TTFont('./fonts/' + font_file)
        self.font.saveXML('./fonts/' + font_file + '.xml')

    def modify_data(self, data):
        """Replace font-obfuscated digits in ``data`` with real digits.

        Each glyph name after the first two (``uniXXXX``) is matched in
        ``data`` as the entity ``&#xxxxx;`` (or as bare ``xxxx;``) and
        replaced by the glyph's index, so the whole entity is consumed and no
        ``&#x`` prefix is left behind.

        NOTE(review): decoding by position assumes the font's glyph order is
        the digit order 0-9. That does not hold if the site shuffles the
        order per font -- confirm against the downloaded font.

        Args:
            data: Text captured from the page.

        Returns:
            ``data`` with every recognized glyph reference replaced.
        """
        print(data)
        # The first two glyph-order entries are not digits; drop them.
        gly_list = self.font.getGlyphOrder()[2:]
        for number, gly in enumerate(gly_list):
            code = gly.replace('uni', '').lower()
            # Full entity first so its '&#x' prefix is removed as well, then
            # the bare 'xxxx;' form for text that never had the prefix.
            data = data.replace('&#x' + code + ';', str(number))
            data = data.replace(code + ';', str(number))
        return data

    def start_crawl(self):
        """Fetch the film page, decode its figures and print them.

        Raises:
            IndexError: If any of the page patterns fails to match.
        """
        html = self.get_html(self.url).decode('utf-8')
        # Name of the web-font file referenced by the page.
        font_file = re.findall(r'vfile\.meituan\.net\/colorstone\/(\w+\.woff)', html)[0]
        print(font_file)
        self.create_font(font_file)
        # NOTE(review): the three patterns below appear to have lost the HTML
        # markup that originally anchored them; they are kept verbatim here.
        # Confirm them against the live page.
        # Star rating
        star = re.findall(r'\s+(.*?)\s+', html)[0]
        star = self.modify_data(star)
        # Number of raters
        people = ''.join(re.findall(r'''(.*?万)(人评分)''', html)[0])
        people = self.modify_data(people)
        # Cumulative box office
        ticket_number = ''.join(re.findall(r'''(.*?)(亿)''', html)[0])
        ticket_number = self.modify_data(ticket_number)
        print('用户评分: %s 星' % star)
        print('评分人数: %s' % people)
        print('累计票房: %s' % ticket_number)
if __name__ == '__main__':
    # Run the crawl for the film configured in MaoYan.__init__.
    MaoYan().start_crawl()
二、FFmpeg
熟练使用FFmpeg的常用命令,可以验证音视频处理的中间数据转换过程是否正确。以下是平时使用的一些命令总结。
视频
MP4转H264
ffmpeg -i input.mp4 -c:v libx264 -profile:v baseline output.h264
H264转MP4,缩放
ffmpeg -i input.h264 -c:v libx264 -profile:v baseline -vf scale=640x360 output.mp4
jpg转YUV420P
ffmpeg -i input.jpg -c:v rawvideo -pix_fmt yuv420p output.yuv
YUV420P(800x600)转jpg
ffmpeg -pix_fmt yuv420p -s 800x600 -i input.yuv output.jpg
保存RTSP为H264
ffmpeg -i rtsp://192.168.1.2/test.h264 ./output.h264
音频
wav转PCM(16bit, 16KHz, 单声道)
ffmpeg -i input.wav -f s16le -ar 16000 -ac 1 output.pcm
PCM(16bit, 16KHz, 单声道)转AAC
ffmpeg -f s16le -ar 16000 -ac 1 -i input.raw output.aac
FFplay
视频
播放mp4
ffplay input.mp4
播放YUV420P(800x600)
ffplay -pix_fmt yuv420p -s 800x600 output.yuv
低延时播放rtsp
ffplay -probesize 1000 rtsp://192.168.8.106/video2
音频
播放AAC
ffplay input.aac
播放PCM(16bit, 16KHz, 单声道)音频文件
ffplay -f s16le -ar 16000 -ac 1 input.pcm