# Python font anti-scraping (字体反爬): decoding digits hidden behind an obfuscated web font

import re

import base64

import requests

import urllib.request as down

from fontTools.ttLib import TTFont  # 字体解析库

from xml.etree.ElementTree import parse

from difflib import SequenceMatcher  # 序列匹配器

def similarity(a, b):
    """
    Compare two sequences and report how alike they are.

    Example:
        similarity([1, 2, 3, 4, 5], [1, 2, 3]) returns 0.75

    :param a: first sequence
    :param b: second sequence
    :return: similarity ratio, a float between 0 and 1.0
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    # ratio() is SequenceMatcher's own measure of sequence similarity.
    score = matcher.ratio()
    return score

def get_movie_infos() -> list:
    """
    Request the movie data and save the font file referenced by the response.

    Reads two module-level names: ``url`` (the endpoint to query) and
    ``file_name`` (base name of the saved font; ``.woff`` is appended).

    :return: the value stored at ``movieList -> data -> list`` in the JSON reply
    :raises requests.HTTPError: if the server answers with an error status
    :raises ValueError: if ``fontStyle`` contains no quoted font URL
    """
    # Copy these values from your own browser session.
    cookies = {
        '_lxsdk_cuid': 'xxxxxxxxxxxxxxxxxxx-412f2c3d-1aeaa0-18787e3c85cc8',
        '_lxsdk_s': '1xxxxxxxxxd-efe-fce-1c8%7C%7C4',
        '_lxsdk': '18787e3c85cc8-047db80ef7d2d98-412f2c3d-1aeaa0-18787e3c85cc8',
        '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
    }

    # Copy these values from your own browser session.
    headers = {
        'User-Agent': '用自己的T T',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-FOR-WITH': '这里应该是出发验证码的参数,同一个请求太多可能会验证码',
        'Connection': 'keep-alive',
        'Referer': 'https://piaofang.maoyan.com/dashboard',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }

    # Copy these values from your own browser session.
    params = (
        ('orderType', '0'),
        ('uuid', '18787e3c85cc8-047db80ef7d2d98-412f2c3d-1aeaa0-18787e3c85cc8'),
        ('timeStamp', '1681613813837'),
        ('User-Agent', '1234xxxxxx'),
        ('index', '838'),
        ('channelId', '40009'),
        ('sVersion', '2'),
        ('signKey', '12xxxxxxx'),
    )

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, params=params,
                            cookies=cookies, timeout=10)
    # Fail with a clear HTTP error instead of an obscure JSON decoding error.
    response.raise_for_status()
    res = response.json()

    # The font URL is the LAST quoted string in fontStyle, so the text is
    # searched reversed and the first hit is flipped back afterwards.
    font_style = re.findall(r'"([\s\S]*?)"', res['fontStyle'][::-1])
    if not font_style:
        raise ValueError('no quoted font URL found in fontStyle')
    font_url = 'http:' + font_style[0][::-1]  # font file URL
    print(f'字体文件链接:{font_url}')

    down.urlretrieve(font_url, file_name + '.woff')
    data = res['movieList']['data']['list']
    return data

def get_font_map() -> dict:
    """
    Open the saved font, dump it as XML and build the glyph lookup table.

    Reads the module-level ``file_name``: the font is loaded from
    ``file_name + '.woff'`` and its XML dump is written to
    ``file_name + '.xml'``.

    :return: dict mapping glyph name (e.g. "uniE359") to the hex string of
             its code point (e.g. "0xe359"); the glyph named "x" is left out
    """
    font_path = file_name + '.woff'
    xml_path = file_name + '.xml'
    with TTFont(font_path) as font:
        font.saveXML(xml_path)  # keep an XML copy of the font
        cmap = font.getBestCmap()  # code point -> glyph name
    return {
        glyph_name: hex(code_point)
        for code_point, glyph_name in cmap.items()
        if glyph_name != 'x'
    }

def read_pt_from_xml() -> dict:
    """
    Collect the ``on`` attribute values of every glyph in the font's XML dump.

    Reads ``file_name + '.xml'`` (``file_name`` is a module-level name) and
    looks at the first ``glyf`` element directly under the root. The glyphs
    named "glyph00000" and "x" are skipped.

    :return: dict mapping glyph name to the list of ``on`` values of its
             grandchild elements, in document order; empty when the XML has
             no ``glyf`` element
    """
    root = parse(file_name + '.xml').getroot()

    # Bound up front so a font dump without a glyf table yields {} instead
    # of an UnboundLocalError.
    nums_on_dict = {}
    glyf = root.find('glyf')
    # Compare with None explicitly: an Element without children is falsy.
    if glyf is not None:
        for glyph in glyf:
            name = glyph.get('name')
            if name in ('glyph00000', 'x'):
                continue
            nums_on_dict[name] = [
                point.get('on') for part in glyph for point in part
            ]

    print(f'on列表:{nums_on_dict}')
    return nums_on_dict

def matching() -> dict:
    """
    Match the downloaded font's glyphs against the reference ``on`` lists.

    A glyph counts as a given digit when its ``on`` list has a similarity of
    at least 0.9 with any reference list of that digit. Reads the
    module-level ``nums_matching``, ``new_font_maps`` and ``my_font_maps``.

    :return: dict mapping the obfuscated code (the hex code with "0x"
             replaced by "&#x") to the digit as a string
    """
    num_code = {}
    for digit, templates in nums_matching.items():
        for glyph_name, glyph_on in new_font_maps.items():
            # One sufficiently similar reference list is enough.
            matched = any(
                similarity(template, glyph_on) >= 0.9 for template in templates
            )
            if matched:
                entity = my_font_maps[glyph_name].replace('0x', '&#x')
                num_code[entity] = str(digit)
    print(f'解析完成的数字对应关系:{num_code}')
    return num_code

def handle_font():
    """
    Decode the obfuscated box-office figure of each movie and print a summary.

    Reads the module-level ``datas`` (movie records) and ``num_code_map``
    (obfuscated code -> digit string). One list is printed per distinct movie
    name, in order of first appearance; when a name occurs more than once the
    last record wins.

    :return: None; the results are only printed
    :raises KeyError: if a record lacks an expected field or a code is not in
                      ``num_code_map``
    """
    # Each obfuscated digit is terminated by ';'. The raw string avoids the
    # invalid "\S" escape of a plain string literal.
    code_pattern = re.compile(r'([\S]*?);')

    # One record per movie name instead of several parallel dicts.
    movies = {}
    for data in datas:
        movies[data['movieInfo']['movieName']] = (
            code_pattern.findall(data['boxSplitUnit']['num']),
            data['sumBoxDesc'],
            data['splitBoxRate'],
            data['showCount'],
            data['avgShowView'],
        )

    for rank, (name, record) in enumerate(movies.items(), start=1):
        number_codes, day_one, rate, show_count, viewer_avg = record

        # Swap every obfuscated code for its digit. A code that carries the
        # decimal point keeps the point in front of the decoded digit.
        num = []
        for code in number_codes:
            if '.' in code:
                num.append('.' + num_code_map[code.replace('.', '')])
            else:
                num.append(num_code_map[code])

        infos = ['排行:' + str(rank),
                 '片名', name,
                 '上映首日', day_one,
                 '票房', ''.join(num) + '万',
                 '票房占比', rate,
                 '场均人数', viewer_avg + '人',
                 '排片场次', show_count]
        print(infos)

if __name__ == '__main__':

    # Reference table: digit -> list of known "on" flag sequences for that

    # digit. matching() compares each glyph of the downloaded font against

    # these lists to work out which digit the glyph draws.

    nums_matching = {

        0: [['1', '0', '1', '0', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '1', '0', '1', '0',

             '1',

             '0', '0', '1', '0', '1', '0', '0', '1', '0', '0', '1', '0'],

            ['1', '0', '1', '0', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '1', '0', '1', '0',

             '1',

             '0', '0', '0', '0', '0', '0', '0', '1', '0']],

        1: [['1', '1', '1', '0', '0', '1', '1', '0', '1', '0', '0', '1', '1']],

        2: [['1', '1', '1', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0', '1', '0', '1', '0',

             '1',

             '1', '0', '0', '0', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '1'],

            ['1', '1', '1', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0', '1', '1', '0',

             '0',

             '0', '0', '1', '0', '0', '1', '0', '0', '1', '0', '1', '1', '1', '0', '1']],

        3: [['1', '0', '0', '1', '0', '0', '1', '0', '1', '0', '1', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1',

             '0',

             '1', '0', '1', '0', '0', '1', '1', '0', '1', '0', '1', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1',

             '0',

             '1', '0', '1', '0', '0', '1'],

            ['1', '0', '0', '1', '0', '0', '0', '0', '1', '0', '1', '1', '1', '0', '0', '1', '0', '0', '0', '0', '1',

             '1',

             '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0', '0', '1']],

        4: [['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']],

        5: [['1', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '0', '0',

             '1',

             '0', '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '1']],

        6: [['1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0', '0', '0',

             '1',

             '0', '0', '1', '0', '1', '0', '1', '0', '1', '0', '1', '1', '0', '1', '0', '0', '1', '0', '0', '1', '0',

             '1',

             '0', '1', '0', '1', '0']],

        7: [['1', '1', '1', '1', '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '1', '0', '1', '0', '0', '1']],

        8: [['1', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '1', '0', '0',

             '1',

             '0', '0', '1', '0', '1', '0', '0', '1', '0', '1', '0', '0', '1', '0', '0', '1', '0', '1', '0', '0', '1',

             '0',

             '0', '0', '1', '0', '1', '0'],

            ['1', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1',

             '0',

             '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0']],

        9: [['1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '0', '1', '0', '1', '0', '1', '0', '0', '1',

             '0',

             '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '1', '0', '1', '1', '0', '0', '1', '0',

             '1',

             '0', '1', '0', '1', '0', '0', '1', '0'],

            ['1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '0', '1', '0', '1', '0', '1', '0', '0', '1',

             '0',

             '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '1', '0', '1', '1', '0', '0', '1', '0',

             '1',

             '0', '1', '0', '1', '0', '0', '1', '0'],

            ['1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '0', '1', '0', '1', '0', '1', '0', '0', '1',

             '0',

             '0', '1', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '1', '0', '0', '1', '0',

             '0',

             '0', '0'],

            ['1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '1', '0', '0', '0', '0', '1', '0',

             '0',

             '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0']]

    }

    # The endpoint is stored base64-encoded; it decodes to

    # https://piaofang.maoyan.com/dashboard-ajax

    url = 'aHR0cHM6Ly9waWFvZmFuZy5tYW95YW4uY29tL2Rhc2hib2FyZC1hamF4'

    url = base64.b64decode(url).decode('utf-8')

    # Base name of the files written while scraping (.woff and .xml are appended)

    file_name = 'maoyan'

    # The functions below read the names assigned in this block as globals,

    # so the order of the following calls matters.

    # Request the movie data and save the font file

    datas = get_movie_infos()

    # Glyph name -> hex code mapping of the saved font

    my_font_maps = get_font_map()

    # "on" lists of the glyphs in the current page's font file

    new_font_maps = read_pt_from_xml()

    # Dict of obfuscated code -> digit

    num_code_map = matching()

    # Decode the font-obfuscated figures and print the real data

    handle_font()

# 你可能感兴趣的:(python)  -- site footer ("You may also be interested in: python"), not part of the script