字体反爬(猫眼电影)四 完成爬取(附源代码)

基准字体文件(需先下载到本地,代码中作为参照字体 font1 打开):3e983e35f4c5ff7f92fe911dd2273ca52280.woff
基准字体文件下载地址:http://vfile.meituan.net/colorstone/3e983e35f4c5ff7f92fe911dd2273ca52280.woff

from urllib import request
from fake_useragent import UserAgent
import re
import numpy as np
import requests
import os
from fontTools.ttLib import TTFont

# URL of the film page that is fetched once at module level further down.
ur = 'https://maoyan.com/films/1212'
# Request headers sent together with `ur` when the page is fetched via d_html.
headers = {
    'Connection': 'keep-alive',
    # Hard-coded cookie string.
    # NOTE(review): presumably copied from a browser session and will expire -- confirm.
    'Cookie': '__mta=107330365.1581993476301.1581995900678.1581995940325.4; uuid_n_v=v1; uuid=49E548F0517811EA93B3D74AE6BFD56770454DC8800844C697BB6401A76C066B; _csrf=e24b3af274802a2ed30dba2785f8b4ab91ca3faa823f5c790be4d0de09a4434c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1581993418; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1581995940; _lxsdk_cuid=1705627dd149-08a7621bd0fda88-4c302879-100200-1705627dd168f; _lxsdk=49E548F0517811EA93B3D74AE6BFD56770454DC8800844C697BB6401A76C066B; mojo-uuid=60b57ad9f85813b79216caa9040af2b8; _lxsdk_s=170564d03d9-916-68-dd%7C%7C11; mojo-trace-id=5; mojo-session-id={"id":"0227b873553f93f75b3f57c74319e8a5","time":1581995856552}; __mta=47029656.1581995936050.1581995936050.1581995936050.1',
    # Evaluated once, when this dict literal is built.
    'User-Agent': UserAgent().random
}


# Fetch the raw HTML of a page
def d_html(url, headers):
    """Return the undecoded response body of ``url``.

    Args:
        url: Address to request.
        headers: Mapping of HTTP header names to values sent with the request.

    Returns:
        The response body as ``bytes``; the caller is responsible for decoding.
    """
    req = request.Request(url, headers=headers)
    # The response object owns an open connection; the ``with`` block makes
    # sure it is closed even if reading fails.
    with request.urlopen(req) as resp:
        return resp.read()


# Fetch the film page once and decode it as UTF-8.
html = d_html(ur, headers).decode('utf-8')
# Locate the protocol-relative link to the page's .woff font. The pattern is
# unchanged; it is only written as a raw string so that \w and \s reach the
# regex engine without triggering Python's invalid-escape-sequence warning.
_woff_links = re.findall(r'//[\w]+.[^\s]+.woff', html)
if not _woff_links:
    # Fail with a message that names the problem instead of a bare IndexError.
    raise RuntimeError('no .woff font link found in the page fetched from ' + ur)
url = 'https:' + _woff_links[0]


# Download the woff font file
def getwoff(url, woff_dir=None, timeout=30):
    """Download the font at ``url`` and save it under its own file name.

    Args:
        url: Address of the font file; the text after the last ``/`` is used
            as the local file name.
        woff_dir: Directory to save into. Defaults to the directory the
            original script hard-coded.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file as a string.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if woff_dir is None:
        woff_dir = r"D:\PycharmProjects\untitled\爬虫/"
    file_name = url.split("/")[-1]
    save_woff = os.path.join(woff_dir, file_name)

    resp = requests.get(url=url, timeout=timeout)
    # Do not store an HTTP error page under a .woff name.
    resp.raise_for_status()

    if woff_dir:
        os.makedirs(woff_dir, exist_ok=True)
    with open(save_woff, "wb") as f:
        f.write(resp.content)
    return save_woff

# Work out which glyph of the downloaded font belongs to each slot of `ocr`
def getdick(path, ocr, base_font_path=None):
    """Match glyphs of a newly downloaded font against a reference font.

    The reference font's glyph order (minus its first two entries) is paired
    position by position with ``ocr``; the outline points of those glyphs are
    then compared with every glyph of the font at ``path``.

    Args:
        path: Path of the font file downloaded for the current page.
        ocr: Sequence with one item per reference glyph. Only its length is
            used here, to limit how many reference glyphs are read.
        base_font_path: Path of the reference font. Defaults to the location
            the original script hard-coded.

    Returns:
        A list of ten entries. Entry ``k`` is the glyph name from the new font
        matched to reference slot ``k``; an entry stays ``0`` if nothing
        matched and no unused glyph was available to fill it.
    """
    if base_font_path is None:
        base_font_path = r'D:\PycharmProjects\untitled\爬虫\3e983e35f4c5ff7f92fe911dd2273ca52280.woff'

    # Reference font: collect the outline points of its first len(ocr) glyphs.
    font1 = TTFont(base_font_path)
    try:
        uni_list1 = font1.getGlyphOrder()[2:]  # drop the first two entries
        print(uni_list1)
        ref_coords = [
            font1['glyf'][name].coordinates
            for name, _ in zip(uni_list1, ocr)
        ]
    finally:
        font1.close()

    # Slots matched by shared outline points. Slot 4 is matched by its flags
    # instead, and slot 2 is never matched directly (it is filled in below).
    outline_slots = (0, 1, 3, 5, 6, 7, 8, 9)
    matched = [0] * 10

    font2 = TTFont(path)
    try:
        uni_list2 = font2.getGlyphOrder()[2:]
        for uni2 in uni_list2:
            obj2 = font2['glyf'][uni2]
            flags = obj2.flags
            for slot in outline_slots:
                shared = sum(
                    1 for point in ref_coords[slot]
                    if point in obj2.coordinates
                )
                # More than three identical points counts as the same glyph;
                # a later glyph that also qualifies overrides an earlier one.
                if shared > 3:
                    matched[slot] = uni2
            # Slot 4 is recognised by every flag value being 1.
            # NOTE(review): presumably this means all points are on-curve -- confirm.
            if np.mean(flags) == 1:
                matched[4] = uni2
    finally:
        font2.close()

    # Any slot still unset receives the first glyph that was not matched.
    # NOTE(review): if more than one slot is unset they all receive that same
    # glyph, exactly as in the original implementation.
    for name in uni_list2:
        if name not in matched:
            matched = [name if entry == 0 else entry for entry in matched]
    print(matched)
    return matched


# Turn a string into a list of its characters
def zhuanhuan(ocr):
    """Return a new list holding the items of ``ocr`` in their original order."""
    return list(ocr)

# Pull the still-encoded figures out of the page and decode them
def gettext(html, ocr, dic):
    """Extract three encoded values from ``html`` and decode each with getz.

    Args:
        html: Decoded page source.
        ocr: Digit sequence, passed through to getz.
        dic: Glyph-name list, passed through to getz.

    Returns:
        A 3-tuple with the getz results for, in order: the first match of
        ``\\s+(.*?)\\s+``, the text before ``万人评分`` and the text before
        ``亿``.

    Raises:
        IndexError: If one of the three patterns does not match ``html``.
    """
    # NOTE(review): these patterns look as if surrounding HTML tags were lost
    # when the snippet was published -- confirm against the real page markup.
    stary = re.findall(r'\s+(.*?)\s+', html)[0]
    starz = re.findall(r'(.*?)万人评分', html)[0]
    piao = re.findall(r'(.*?)亿', html)[0]

    def _split_entities(raw):
        # First item: the ';'-separated pieces before the first '.'.
        # Second item: all ';'-separated pieces once every '.' is removed.
        return raw.split('.')[0].split(';'), raw.replace('.', '').split(';')

    pingfen, pingfen1 = _split_entities(stary)
    zpingfen, zpingfen1 = _split_entities(starz)
    zpiao, zpiao1 = _split_entities(piao)

    # The piece counts tell getz where to insert the decimal point.
    dwz1 = len(pingfen)
    dwz2 = len(zpingfen)
    dwz3 = len(zpiao)
    print(dwz2)
    print(zpingfen)

    store = getz(pingfen1, ocr, dwz1, dic)
    zstore = getz(zpingfen1, ocr, dwz2, dic)
    pfang = getz(zpiao1, ocr, dwz3, dic)
    return store, zstore, pfang

# Decode the obfuscated entities into readable digits
def getz(text, ocr, dwz, dic):
    """Translate a list of ``&#x....`` entities into a digit string.

    Args:
        text: Entity strings such as ``'&#xe137'``; empty strings are skipped.
        ocr: Digits, index-aligned with ``dic``.
        dwz: A ``'.'`` is inserted right after the item at index ``dwz - 2``.
        dic: Glyph names (``'uniXXXX'`` form) in the same order as ``ocr``.

    Returns:
        The decoded string. It is also printed, as the original did.

    Raises:
        ValueError: If an entity's glyph name is not present in ``dic``.
    """
    digits = list(ocr)
    decoded = []
    for position, entity in enumerate(text):
        if entity != '':
            # '&#xe137' -> '&#XE137' -> 'uniE137', the form stored in dic.
            glyph_name = entity.upper().replace('&#X', 'uni')
            decoded.append(digits[dic.index(glyph_name)])
        # NOTE(review): the '.' is added even when no further items follow,
        # so a value without a fractional part ends in '.' -- confirm intent.
        if position == dwz - 2:
            decoded.append('.')
            print(position)
    result = ''.join(decoded)
    print(result)
    # Return the value; the original only printed it, so callers got None.
    return result


# Digits paired position by position with the reference font's glyph order
# (getdick) and index-aligned with the glyph-name list it returns (getz).
ocr = '5810427369'
# Download the font referenced by the page; `path` is the saved file.
path = getwoff(url)
# Glyph-name list for the downloaded font, one entry per position of `ocr`.
di = getdick(path, ocr)
# Decode the three values extracted from the page.
pf, zpf, piaof = gettext(html, ocr, di)
print(pf, zpf, piaof)

你可能感兴趣的:(爬虫)