Qidian novel font obfuscation: parsing the font stream with Python's TTFont

Qidian novel detail-page obfuscation: read counts and click counts

Target Qidian URL:

Step 1

A plain static request reveals the key problem: the click/word counts render as garbled glyphs, something like
𘜄𘜈𘜂𘛾𘜆万字 (the trailing 万字 means "×10,000 characters")
[Screenshot: the garbled count as rendered on the detail page]
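In the raw HTML these glyphs are just decimal character references. A quick, hedged sketch to see them (the book URL is the one used in the demo at the end of this post; the minimal User-Agent header is a stand-in, and in practice you may need the fuller header set from the source below):

import re
import requests

# The obfuscated numbers appear in the page source as runs of decimal
# character references such as &#100192;&#100181;...
html = requests.get('https://book.qidian.com/info/1010734492',
                    headers={'User-Agent': 'Mozilla/5.0'}).text
print(re.findall(r'(?:&#\d+;)+', html)[:4])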

Step 2

Find the corresponding obfuscated font file and open it in the Baidu online font editor:
[Screenshot: the font's glyphs shown in the font editor]
The glyphs in the font are exactly the same as the garbled text on the page. However, repeated requests show that the font file, and the code points assigned to each glyph, are different every time. Since the mapping changes on every request, the font file has to be downloaded and parsed each time to look up the current glyph mapping.
Here we use Python:
from fontTools.ttLib import TTFont

world = TTFont('wwOMhmLd.ttf')

print(world.getGlyphOrder())  # glyph names defined in the font file
# ['.notdef', 'period', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

print(world.getBestCmap())  # code point -> glyph name mapping
# {100181: 'two', 100183: 'zero', 100184: 'three', 100185: 'eight', 100186: 'seven', 100187: 'six', 100188: 'period', 100189: 'four', 100190: 'five', 100191: 'nine', 100192: 'one'}

Step 3

From here it is straightforward: read the static page and replace each garbled code point via the cmap above, mapping glyph names to digits with a dict like the one below (see the decoding sketch after it):

        number_dict = {
            '.notdef': "我在找我的派大星",
            'period': '.',
            'zero': '0',
            'one': '1',
            'two': '2',
            'three': '3',
            'four': '4',
            'five': '5',
            'six': '6',
            'seven': '7',
            'eight': '8',
            'nine': '9',
        }
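To make the two mappings concrete, here is a minimal, self-contained decoding sketch. The cmap entries are taken from the sample getBestCmap() output in step 2; the input string and the function name are hypothetical, and digits is a subset of the number_dict above:

# cmap: code point -> glyph name, as returned by font.getBestCmap()
cmap = {100181: 'two', 100183: 'zero', 100184: 'three', 100192: 'one'}
# glyph name -> digit (subset of number_dict above)
digits = {'period': '.', 'zero': '0', 'one': '1', 'two': '2', 'three': '3'}

def decode_numbers(garbled):
    # '&#100192;&#100181;&#100184;' decodes to '123'
    result = ''
    for part in garbled.split(';'):
        code = part.replace('&#', '').strip()
        if code:
            glyph_name = cmap[int(code)]   # e.g. 100192 -> 'one'
            result += digits[glyph_name]   # e.g. 'one' -> '1'
    return result

print(decode_numbers('&#100192;&#100181;&#100184;'))  # -> 123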

PS:

The Python font-handling package is fontTools (from fontTools.ttLib import TTFont).
Out of the box this reads from a file on disk, so I lightly modified the TTFont source code
and named the result XYYTTFont: it parses the response content (the raw byte stream) directly, so there is no need to save the font to a file first. You can request the font URL and feed the stream straight in, which avoids filename collisions when running multiple processes later.
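For what it's worth, stock fontTools can already parse an in-memory stream: TTFont accepts any binary file-like object, so wrapping the downloaded bytes in io.BytesIO avoids both the temp file and the source patch. A minimal sketch, assuming font_content holds the downloaded .woff/.ttf bytes:

import io
from fontTools.ttLib import TTFont

# font_content: raw bytes of the downloaded font (e.g. response.content)
font = TTFont(io.BytesIO(font_content))
print(font.getBestCmap())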

Git repo for the Qidian detail-page font decryption:

Stars welcome.

I know you're lazy, so the source is below. Some modules (e.g. my_font_content) still have to be fetched from the Git repo, and the repo directory contains other files as well.

# -*- coding:utf-8 -*-
# Chance favors the prepared mind.
# author : pyl owo,
# time : 2020/9/21
import json
import random
import re

from fake_useragent import UserAgent
from fontTools.ttLib import TTFont
from lxml import etree
import requests
from my_font_content import XYYTTFont


# Fetch a proxy (implementation omitted)
def get_proxy():
    pass


# Unified request helper
def unify_requests(method="GET", url="", headers=None, proxies=None, data=None, verify=False, cookies=None):
    if method == "GET":
        response = requests.get(url, headers=headers, proxies=proxies, data=data, cookies=cookies, timeout=5)
        return response
    else:
        response = requests.post(url, headers=headers, proxies=proxies, data=data, verify=verify, cookies=cookies, timeout=5)
        return response
class SFQingNovel:
    def __init__(self, use_proxy=True):
        self.proxy = get_proxy() if use_proxy else None
        self.headers = {
            # Note: the original also had a hard-coded lowercase 'user-agent'
            # entry, which silently overrode this random one; it was removed.
            'User-Agent': UserAgent().random,
            "Proxy-Tunnel": str(random.randint(1, 10000)),
            'authority': 'book.qidian.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://www.qidian.com/',
            'accept-language': 'zh-CN,zh;q=0.9',
            # 'Cookie': 'newstatisticUUID=1600686041_884209914; _csrfToken=nXOEpjuFF7PUkwPJoOkBd7dTo2BV5jSkPu3suGGs'
        }
        # self.novel_url_pre = "https://t.shuqi.com/cover/"

    # Fetch all detail info for one novel
    def get_novel_info(self, novel_url, **kwargs):
        response = unify_requests(url=novel_url, headers=self.headers, proxies=self.proxy)
        search_result = self.parse_novel_info(response, novel_url, **kwargs)
        return search_result

    def get_id(self, novel_url, **kwargs):
        return novel_url.split('/')[-1]

    def get_info(self, info_response, **kwargs):
        # glyph name -> digit ('.notdef' is a placeholder for the undefined glyph)
        number_dict = {
            '.notdef': "薛忆阳",
            'period': '.',
            'zero': '0',
            'one': '1',
            'two': '2',
            'three': '3',
            'four': '4',
            'five': '5',
            'six': '6',
            'seven': '7',
            'eight': '8',
            'nine': '9',
        }
        response = info_response

        # Extract the font download URL from the page's @font-face rule
        # @font-face.*?src: url.*?src: url(.*?) format('woff'),
        content = re.search(re.compile(r"@font-face.*?src: url.*?src: url(.*?)format.*?,", re.S), response.text)
        # the captured group looks like: ('https://qidian.gtimg.com/qd_anti_spider/yMxThZoL.woff')
        font_url = content.groups()[0].strip("( | )").strip("'")
        # print(font_url)

        font_content = unify_requests(url=font_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'}, proxies=self.proxy).content

        # Debug helpers: save the font to disk and dump it to XML
        # with open('qidian_lx.woff', 'wb') as f:
        #     f.write(font_content)
        #
        # font1 = TTFont('qidian_lx.woff')
        # font1.saveXML('qidian_lx.xml')

        # Extract the decimal character references from the page source:
        # NOTE: the HTML tags inside the original regex were lost when this
        # article was published online; the four-group pattern below is a
        # best-effort reconstruction of the spans that hold the obfuscated
        # word/click/recommendation counts.
        data = re.findall(re.compile(
            r'<em.*?>.*?<span.*?>(.*?)</span>.*?<span.*?>(.*?)</span>.*?'
            r'<span.*?>(.*?)</span>.*?<span.*?>(.*?)</span>', re.S), response.text)[0]
        four_list = []
        # Walk the four captured groups
        for d in data:
            # each group renders as e.g. 𘜸𘜹𘜹𘜶𘜽𘜽, i.e. '&#100188;&#100189;...' in the source
            one_list = []
            d = d.split(';')  # split on the semicolons
            for x in d:
                res = x.replace('&#', '')
                if res:  # skip empty fragments
                    a = int(res)  # decimal code point as int
                    one_list.append(a)
            four_list.append(one_list)

        # Parse the font directly from the response bytes (no temp file)
        map_dict = XYYTTFont(font_content).getBestCmap()
        # print(map_dict)
        result_list = []
        # Walk the list holding the four groups
        for one in four_list:
            two_string = ""
            # Decode each code point in the group
            for a in one:
                if a in map_dict:
                    number = map_dict[a]  # code point -> glyph name
                    number = number_dict[number]  # glyph name -> digit
                    two_string += number
            result_list.append(two_string)
        return result_list

    def get_int_num(self, numstr):
        # Original heuristic kept as-is: a dotted value like '361.89' (万)
        # becomes 36189 * 100, i.e. it assumes two decimal places.
        if '.' in numstr:
            return int(numstr.replace('.', '')) * 100
        else:
            return int(numstr)

    # Parse the novel detail response
    def parse_novel_info(self, response_info, novel_url='', **kwargs) -> dict:
        try:
            # print(novel_url)
            response_data = etree.HTML(response_info.text)
            info_list = self.get_info(response_info, **kwargs)
        except Exception as e:
            print(e)
            return {}
        else:
            # info_book_dict = info_dict.get('book', {})
            novel_dict = dict()
            novel_dict['all_recommend_str'] = self.get_int_num(info_list[2])  # total recommendations (book_interact)
            novel_dict['month_recommend_str'] = None  # monthly recommendations
            novel_dict['week_recommend_str'] = self.get_int_num(info_list[3])  # weekly recommendations
            novel_dict['all_read_int'] = None  # total reads
            novel_dict['month_read_int'] = None  # monthly reads
            novel_dict['week_read_int'] = None  # weekly reads
            novel_dict['all_words_number_int'] = self.get_int_num(info_list[0])  # total word count
            novel_dict['book_status_str'] = response_data.xpath('//p[@class="tag"]/span/text()')[0]  # book status (serializing / finished / none) bookCP
            novel_dict['book_property_str'] = response_data.xpath('//p[@class="tag"]/span/text()')[1]  # book property (free / VIP / limited-free)
            novel_dict['author_type_str'] = "".join(response_data.xpath('//div[@class="author-photo"]/span/text()'))  # author type (gold / contracted / independent; default none)
            novel_dict['book_type_str'] = '|'.join(response_data.xpath('//p[@class="tag"]/a/text()'))  # book categories (fantasy, sci-fi, romance, ...), '|'-separated
            novel_dict['book_update_time'] = ''.join(response_data.xpath('//li[@class="update"]/div/p[@class="cf"]/em/text()'))  # last update date (Y-m-d)
            novel_dict['book_zong_zhang_jie_int'] = ''  # total chapters (current count if not yet finished)
            novel_dict['book_zui_xin_zhang_jie_name_str'] = ''.join(response_data.xpath('//li[@class="update"]/div/p[@class="cf"]/a/text()'))  # latest chapter title
            novel_dict['book_introduce_text'] = ''.join(response_data.xpath('//div[@class="book-intro"]/p//text()')).replace(' ', '').replace('\u3000', '').replace('\r', '').replace('\n', '').replace('\t', '')  # book introduction
            novel_dict['book_lable_str'] = '|'.join(response_data.xpath('//p[@class="tag"]/a/text()'))  # book labels, '|'-separated
            novel_dict['book_cover_image_str'] = "https:" + "".join(response_data.xpath('//div[@class="book-information cf"]/div[@class="book-img"]/a/img/@src')).replace('\n', '')  # cover image URL
            novel_dict['book_detail_url_str'] = novel_url  # detail page URL
            novel_dict['book_detail_id_int'] = None  # detail ID (numeric)
            novel_dict['book_detail_id_str'] = None  # detail ID (string)
            novel_dict['book_zhan_dian_str'] = None  # site section (male / female / none)
            novel_dict['book_publish_str'] = '起点中文网'  # publisher (defaults to the hosting platform)
            novel_dict['book_commeds_int'] = None  # comment count (Pinglunfont)
            novel_dict['author_grade_float'] = None  # author rating
            novel_dict['author_id_str'] = None  # author ID (string)  ## new
            novel_dict['author_page_url_str'] = "https:" + ''.join(response_data.xpath('//a[@class="writer"]/@href'))  # author homepage URL (userId)
            author_info_data = response_data.xpath('//ul[@class="work-state cf"]/li/em/text()')
            novel_dict['author_book_number_int'] = author_info_data[0]  # author's total books
            novel_dict['author_likes_int'] = None  # author's total likes
            novel_dict['author_all_words_number_str'] = author_info_data[1]  # author's cumulative word count
            novel_dict['author_produce_days_str'] = author_info_data[2]  # author's cumulative writing days
            novel_dict['author_fens_number_int'] = None  # author's fan count
            novel_dict['author_head_image_url_str'] = "https:" + "".join(response_data.xpath('//div[@class="author-photo"]/a/img/@src'))  # author avatar URL
            # novel_dict[''] = ''
            return novel_dict


# Unified entry point (search_novels)
search_novel_info = SFQingNovel(use_proxy=True).get_novel_info

if __name__ == "__main__":
    result = search_novel_info('https://book.qidian.com/info/1010734492')
    print(result)
