20200113 Python Crawler: Maoyan Font Anti-Scraping

After working through 58.com's font anti-scraping, my confidence had grown a bit, so I went looking for more sites that obfuscate text with font files. Maoyan was a site a client had asked for, so I got to work.

 

Target URL

Maoyan classic films

https://maoyan.com/films?sourceId=2&yearId=15&showType=3&offset=0

This is clearly the same font-based encryption as the Autohome forums, so I followed my earlier procedure (a rough sketch of this static approach follows the list):
Step 1: find the font file and download it.
Step 2: open the downloaded font file with the Font Creator tool.
Step 3: read off each character's unicode code in order and write out the matching list of characters.
Step 4: turn the ordered unicode codes into the unicode form used on the page.
Step 5: replace the text in the page that uses those unicode forms.
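For a site with a single fixed font, those five steps reduce to something like the following minimal sketch; the file name, the glyph-name format, and the digit order here are all illustrative assumptions, not taken from the original code:

from fontTools.ttLib import TTFont

# Steps 1-2: the font was downloaded and inspected in Font Creator by hand.
font = TTFont('static.woff')  # hypothetical local copy of the site's font

# Step 3: glyph names arrive in order, e.g. 'uniE893'; the first two entries
# are usually placeholders rather than real characters.
glyph_names = font.getGlyphOrder()[2:]

# Characters read off manually in Font Creator, in the same order (made up here):
chars = ['1', '5', '8', '0', '3', '2', '6', '7', '9', '4']

# Steps 4-5: turn 'uniE893' into the page entity '&#xe893;' and substitute.
mapping = {'&#x' + name[3:].lower() + ';': ch for name, ch in zip(glyph_names, chars)}

def decode(html):
    for entity, ch in mapping.items():
        html = html.replace(entity, ch)
    return html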
After blindly hammering through those simple steps, the result came out completely wrong, as I learned firsthand, and I couldn't help asking why. After puzzling over it for a long time without getting anywhere, I went looking online; fortunately there are plenty of Maoyan crawlers out there, and I borrowed from their ideas:

Goal

Collect the movie data for 2019, 2018 and 2017. The fields include title, release date, region, duration, rating, box office, total number of ratings (not in this version of the source), and actors; the remaining fields are left for readers to fill in (my time and ability are limited).

Analysis

Why is Maoyan different from Autohome? The crucial point is that Maoyan serves multiple font files: every request returns a different one, which is exactly why the mapping never lined up. So I simply requested the page twice, downloaded both fonts, and compared them, as shown below:

[Figure 1: comparison of the two downloaded font files]
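You can confirm this rotation without Font Creator by fetching the same page twice and comparing the woff URLs embedded in the source; a quick sketch (the film id and the string slicing are illustrative):

import requests

url = 'https://maoyan.com/films/1279731'  # any film detail page
ua = {'User-Agent': 'Mozilla/5.0'}
for _ in range(2):
    html = requests.get(url, headers=ua).text
    start = html.find('vfile.meituan.net')
    print(html[start:html.find('.woff', start) + 5])  # a different file name each time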

You can see that although it is the same character both times, its code has changed completely and no longer matches the first font at all. So how do we solve this? Even though the assigned codes change, does the structure of the same glyph change as well? (That question turned out to be the key, and it is a direction worth considering from the start.) In fact you can answer it by inspecting the internal data structure of the two font files, as follows:

First, look through the page source by hand, find the font file, and download it directly, as shown below:

[Figure 2: locating the woff URL in the page source]

The woff I downloaded this time came from: vfile.meituan.net/colorstone/c4e1abb1a992f1f2bf87821129b3a40d2280.woff

 

 
from fontTools.ttLib import TTFont
font = TTFont('test.woff')  # open the downloaded font file
font.saveXML('test.xml')    # convert the font to XML and save it locally, mainly to make the internal structure easy to inspect

Open test.xml and you will see a structure of HTML-like tags. The tags we need are shown below:

[Figure 3: the tag structure inside test.xml]

 

First look at the cmap_format_4 tag, whose map entries hold all of the code assignments. Note that the first two entries are not codes for 0-9 and need to be removed. See below:

[Figure 4: the map entries inside cmap_format_4]
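Rather than eyeballing the XML, fontTools can read the same cmap table directly; a small sketch over the downloaded file:

from fontTools.ttLib import TTFont

font = TTFont('test.woff')
# character code -> glyph name; Maoyan's digit glyphs sit in the private use area
for code, name in sorted(font.getBestCmap().items()):
    print(hex(code), name)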

Now look at the TTGlyph tags under glyf: each one holds the outline of a single character. Here, likewise, the first and the last entries are not 0-9 glyphs and need to be removed. A small insight of my own at this point: open another font file's structure and you will find that the glyph outlines themselves do not change; only the character codes assigned to them do.

[Figure 5: a TTGlyph outline in test.xml]
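That observation is easy to verify in code: dump each glyph's outline points from two different downloads and check that identical point lists reappear under different names; a sketch, assuming two locally saved copies named font_a.woff and font_b.woff:

from fontTools.ttLib import TTFont

def glyph_points(path):
    # map each glyph name to its list of (x, y) outline points
    font = TTFont(path)
    glyf = font['glyf']
    return {name: list(glyf[name].getCoordinates(glyf)[0])
            for name in font.getGlyphOrder()[2:]}

a = glyph_points('font_a.woff')
b = glyph_points('font_b.woff')
for name_a, pts_a in a.items():
    # the same outline should reappear in the other font under a different name
    matches = [name_b for name_b, pts_b in b.items() if pts_b == pts_a]
    print(name_a, '->', matches)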

I am short on time, so here is the source as it stands; please bear with it (I will polish it later):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : caiji.py
# @Author: itlaolang
# @Date  : 2020-01-07 16:51
# @Contact : [email protected]
# @Software : PyCharm
# @Desc  : scrape Maoyan movie data
import csv
import json
import time  # only needed if the throttling sleep at the bottom is enabled
from xml.dom.minidom import parse

import requests
import xmltodict
from bs4 import BeautifulSoup
from fontTools.ttLib import TTFont


def printtree(p):
    """Recursively walk a DOM tree (unused debugging helper)."""
    if not p.hasChildNodes():
        return
    for x in p.childNodes:
        printtree(x)

def th(xml_path):
    """Parse a downloaded font's XML dump and return {character code: contour points}."""
    xmldoc = parse(xml_path)
    text_json = json.loads(json.dumps(xmltodict.parse(xmldoc.toxml()), indent=4))

    cmap_list = []
    glyph_list = []
    for k, v in text_json.get('ttFont').items():
        if k == 'cmap':
            cmap_list = v.get('cmap_format_4')
        if k == 'glyf':
            glyph_list = v.get('TTGlyph')

    # glyph name -> character code ('0xe137' -> 'xe137', matching the page's &#xe137;)
    ysbm = {}
    for table in cmap_list:
        if table['@platEncID'] == '3':
            for idx, m in enumerate(table['map']):
                if idx != 0:  # the first map entry is not a digit glyph
                    ysbm[m['@name']] = m['@code'][1:]

    # glyph name -> list of contour points
    temp1 = {}
    for idx, glyph in enumerate(glyph_list):
        if idx != 0:  # the first glyph is not a digit
            for k, v in glyph.items():
                if k == 'contour':
                    points = []
                    if type(v) == dict:  # a single contour
                        for pt in v.get('pt'):
                            points.append(pt)
                    else:                # several contours
                        for contour in v:
                            for pt in contour.get('pt'):
                                points.append(pt)
                    temp1[glyph['@name']] = points

    # character code -> contour points
    data2 = {}
    for name, code in ysbm.items():
        data2[code] = temp1[name]
    return data2
def test():
    """Parse the reference font dump (test.xml) and return {digit: contour points}.

    The digit order below was read off manually in Font Creator for this
    particular reference font.
    """
    xmldoc = parse("test.xml")
    text_json = json.loads(json.dumps(xmltodict.parse(xmldoc.toxml()), indent=4))

    cmap_list = []
    glyph_list = []
    for k, v in text_json.get('ttFont').items():
        if k == 'cmap':
            cmap_list = v.get('cmap_format_4')
        if k == 'glyf':
            glyph_list = v.get('TTGlyph')

    # glyph name -> character code
    ysbm = {}
    for table in cmap_list:
        if table['@platEncID'] == '3':
            for idx, m in enumerate(table['map']):
                if idx != 0:  # the first map entry is not a digit glyph
                    ysbm[m['@name']] = m['@code'][1:]

    # glyph name -> list of contour points
    temp1 = {}
    for idx, glyph in enumerate(glyph_list):
        if idx != 0:  # the first glyph is not a digit
            for k, v in glyph.items():
                if k == 'contour':
                    points = []
                    if type(v) == dict:  # a single contour
                        for pt in v.get('pt'):
                            points.append(pt)
                    else:                # several contours
                        for contour in v:
                            for pt in contour.get('pt'):
                                points.append(pt)
                    temp1[glyph['@name']] = points

    # digits of the reference font, in cmap order, read off in Font Creator
    j = ["3", "1", "0", "5", "2", "6", "7", "9", "8", "4"]
    data2 = {}
    for numb, k in enumerate(ysbm):
        data2[j[numb]] = temp1[k]
    return data2

def tihuan(s, ss):
    """Match obfuscated character codes to real digits by comparing glyph outlines.

    The observation above is that the outlines are identical across Maoyan's
    font files (only the codes rotate), so two glyphs match when their full
    point lists are equal.
    """
    data = {}
    for code, points in s.items():
        for digit, ref_points in ss.items():
            if points == ref_points:
                data[code] = digit
                break
    print(data)
    return data





def xie(aa):
    """Append one movie record as a row to rul18.csv."""
    with open("rul18.csv", "a+", newline='') as csvfile:
        writer = csv.writer(csvfile)
        # To write a header row first:
        # writer.writerow(["title", "genre", "info", "release", "rating", "box_office", "actors"])
        writer.writerows([aa[:7]])
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    # copied from a logged-in browser session; it will expire
    'Cookie':'__mta=252426550.1578386784263.1578880752527.1578880757029.106; uuid_n_v=v1; uuid=305C6350312A11EAAFF74B96B91393FE697BD1F626FD454B9C9BE17C4573B069; _csrf=912fb7d87b6b2be10fa6c006c7f8ff3dca2e8046d150df287ae7b43373a62fe9; _lxsdk_cuid=16f7f2efd26c8-06db984d1a3a07-1d376b5b-13c680-16f7f2efd26c8; mojo-uuid=3628b7cca1fbe628e02e7217d561824a; lt=uWHePaJz08QdlIPJYXQc4EwhOp8AAAAAyQkAAHvmgHwm3_XWExp-YRStdbxmhB4H3lUFAlMqeZT2cVPczJr6q_RBKT-rbRuf6-2kdg; lt.sig=DWBPDQrL5UqBOS-B49b5VJfzr7c; theme=moviepro; _lxsdk=305C6350312A11EAAFF74B96B91393FE697BD1F626FD454B9C9BE17C4573B069; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1578663920,1578668029,1578827974,1578855616; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; mojo-session-id={"id":"1cb15fec3630c1c5d56b4c9512f97fa1","time":1578880176906}; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578880909; __mta=252426550.1578386784263.1578880757029.1578880909045.107; mojo-trace-id=77; _lxsdk_s=16f9c978a6c-233-4ff-215%7C2796253285%7C102'
}
def xq(xqurl):
    """Scrape one film detail page, de-obfuscate the digits, and save the record."""
    html = requests.get(xqurl, headers=headers).text

    # Pull the woff URL out of the inline @font-face declaration
    woff = html[html.find("embedded-opentype") + 39:html.find("'woff'") - 10]
    wofflink = 'http://' + woff

    r = requests.get(wofflink, headers=headers)
    with open('tt.woff', "wb") as f:
        f.write(r.content)
    font1 = TTFont('tt.woff')  # read the woff file
    font1.saveXML('tt.xml')    # convert it to xml
    data = tihuan(th('tt.xml'), test())

    # Replace every obfuscated entity (e.g. &#xe137;) with the real digit
    for k, v in data.items():
        html = html.replace("&#" + k + ";", v)

    soup = BeautifulSoup(html, "html.parser")
    pianming = soup.find("h3", attrs={"class": "name"}).text.strip()  # title
    # the three summary lines: genre / region & duration / release date
    leixing = soup.find_all("li", attrs={"class": "ellipsis"})[0].text.strip()
    shany = soup.find_all("li", attrs={"class": "ellipsis"})[1].text.strip()
    dalu = soup.find_all("li", attrs={"class": "ellipsis"})[2].text.strip()
    print(pianming)

    # rating and box office live inside the obfuscated "stonefont" elements
    if soup.find("span", attrs={"class": "stonefont"}) is not None:
        pingfen1 = soup.find("span", attrs={"class": "stonefont"}).text
    else:
        pingfen1 = '暂无'  # "not available"
    if soup.find("div", attrs={"class": "movie-index-content box"}) is not None:
        piaofang = soup.find("div", attrs={"class": "movie-index-content box"}).text
    else:
        piaofang = '暂无'

    # collect the cast into one comma-separated string
    yy = ''
    for i in soup.find_all("li", attrs={"class": "celebrity actor"}):
        yy = yy + "," + i.find('a', attrs={"class": "name"}).text.strip()

    xie([pianming, leixing, shany, dalu, pingfen1, piaofang, yy])


# Walk the list pages (30 films per page) and scrape each film's detail page
d = 0
for ii in range(6, 68):
    print(ii)
    url = "https://maoyan.com/films?sourceId=2&yearId=13&showType=3&sortId=3&offset=" + str(ii * 30)
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, "html.parser")
    for i in soup.find_all("div", attrs={"class": "movie-item"}):
        xqurl = "https://maoyan.com" + i.find('a').get('href')
        d = d + 1
        print("Film link #{0}: {1}".format(d, xqurl))
        xq(xqurl)
        # time.sleep(4)  # uncomment to throttle requests

Source file download link: https://download.csdn.net/download/qq_31151511/12099755

 
