# Reference font file (raw data): 3e983e35f4c5ff7f92fe911dd2273ca52280.woff
# Source URL: http://vfile.meituan.net/colorstone/3e983e35f4c5ff7f92fe911dd2273ca52280.woff
from urllib import request
from fake_useragent import UserAgent
import re
import numpy as np
import requests
import os
from fontTools.ttLib import TTFont
# Film detail page that the script downloads and decodes.
ur = 'https://maoyan.com/films/1212'
# Request headers sent with the page fetch.
headers = {
'Connection': 'keep-alive',
# Hard-coded cookie string.
# NOTE(review): looks like a captured browser session, so it is probably expired — confirm.
'Cookie': '__mta=107330365.1581993476301.1581995900678.1581995940325.4; uuid_n_v=v1; uuid=49E548F0517811EA93B3D74AE6BFD56770454DC8800844C697BB6401A76C066B; _csrf=e24b3af274802a2ed30dba2785f8b4ab91ca3faa823f5c790be4d0de09a4434c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1581993418; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1581995940; _lxsdk_cuid=1705627dd149-08a7621bd0fda88-4c302879-100200-1705627dd168f; _lxsdk=49E548F0517811EA93B3D74AE6BFD56770454DC8800844C697BB6401A76C066B; mojo-uuid=60b57ad9f85813b79216caa9040af2b8; _lxsdk_s=170564d03d9-916-68-dd%7C%7C11; mojo-trace-id=5; mojo-session-id={"id":"0227b873553f93f75b3f57c74319e8a5","time":1581995856552}; __mta=47029656.1581995936050.1581995936050.1581995936050.1',
# A random User-Agent, picked once when this dict is built.
'User-Agent': UserAgent().random
}
# Fetch the raw HTML
def d_html(url, headers):
    """Request *url* with the given headers and return the raw response body.

    Args:
        url: Address to request.
        headers: Mapping of HTTP header names to values sent with the request.

    Returns:
        The undecoded response body as ``bytes``; the caller decodes it.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    req = request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises, so the
    # underlying connection is not leaked.
    with request.urlopen(req) as resp:
        return resp.read()
# Download the film page and decode it as UTF-8 text.
html = d_html(ur, headers).decode('utf-8')
# Protocol-relative address of the page's .woff font. The pattern is a raw
# string so \w and \s reach the regex engine as written, and the dot before
# "woff" is escaped so it only matches a literal ".".
_woff_matches = re.findall(r'//[\w]+.[^\s]+\.woff', html)
if not _woff_matches:
    # Fail with a readable message instead of a bare IndexError.
    raise RuntimeError('no .woff font URL found in the downloaded page')
url = 'https:' + _woff_matches[0]
# Download the .woff file
def getwoff(url, woff_dir=r"D:\PycharmProjects\untitled\爬虫/"):
    """Download the font at *url* into *woff_dir* and return the saved path.

    Args:
        url: Address of the font file; the text after the last ``/`` is used
            as the local file name.
        woff_dir: Directory the file is written to. Defaults to the location
            this script has always used. The earlier code joined this
            drive-qualified path onto the script's grandparent directory,
            which has no effect on Windows because ``os.path.join`` discards
            everything before an absolute component.

    Returns:
        Path of the written file (``woff_dir`` joined with the file name).

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were a font.
        requests.Timeout: If the server does not respond within 30 seconds.
        OSError: If the directory or file cannot be created.
    """
    file_name = url.split("/")[-1]
    save_woff = os.path.join(woff_dir, file_name)
    # Without a timeout, requests can block forever on a stalled connection.
    resp = requests.get(url=url, timeout=30)
    resp.raise_for_status()
    if woff_dir:
        os.makedirs(woff_dir, exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open(save_woff, "wb") as f:
        f.write(resp.content)
    return save_woff
# Find, for each character of `ocr`, the matching glyph name in the new font
def getdick(path, ocr,
            base_font=r'D:\PycharmProjects\untitled\爬虫\3e983e35f4c5ff7f92fe911dd2273ca52280.woff'):
    """Match glyphs of the font at *path* against a reference font.

    The reference font's glyph names (after dropping the first two) are paired
    positionally with the characters of *ocr*. Every glyph of the new font is
    then compared with the reference outlines, producing a 10-entry list whose
    position ``k`` holds the new font's glyph name for ``ocr[k]``.

    Matching rules, per slot:
      * slots 0, 1, 3, 5, 6, 7, 8, 9 -- a new glyph claims the slot when more
        than three points of the reference outline also occur in its own
        outline; a later glyph overrides an earlier one;
      * slot 4 -- claimed by a glyph whose point flags average exactly 1
        (presumably the straight-line-only digit; TODO confirm);
      * slot 2 -- has no detector of its own and is filled by elimination.

    Args:
        path: File path of the newly downloaded font.
        ocr: Characters in the order of the reference font's glyphs; needs at
            least 10 entries (and the reference font at least 10 glyphs after
            the first two), otherwise an ``IndexError`` is raised.
        base_font: Path of the reference font. Defaults to the file this
            script has always used.

    Returns:
        A list of 10 entries: glyph names from the new font, or ``0`` for a
        slot that could not be resolved.
    """
    font1 = TTFont(base_font)  # reference font
    uni_list1 = font1.getGlyphOrder()[2:]  # all glyph names minus the first two
    print(uni_list1)
    font2 = TTFont(path)  # font fetched for the current page
    uni_list2 = font2.getGlyphOrder()[2:]

    # Reference outlines, one per character of `ocr` (zip stops at the shorter).
    glyf1 = font1['glyf']
    ref_points = [glyf1[name].coordinates for name, _ in zip(uni_list1, ocr)]

    outline_slots = (0, 1, 3, 5, 6, 7, 8, 9)
    flag_slot = 4
    matched = [0] * 10  # 0 marks a slot nothing has claimed yet

    for uni2 in uni_list2:
        obj2 = font2['glyf'][uni2]
        # A set makes each point lookup O(1) instead of scanning the outline.
        new_points = set(obj2.coordinates)
        for slot in outline_slots:
            shared = sum(1 for point in ref_points[slot] if point in new_points)
            if shared > 3:
                matched[slot] = uni2
        if np.mean(obj2.flags) == 1:
            matched[flag_slot] = uni2

    # Elimination step: the first glyph that no slot claimed fills every slot
    # that is still empty (with the caller's ocr string, slot 2 is '1').
    # NOTE(review): if several slots are empty they all get the same glyph.
    leftover = next((name for name in uni_list2 if name not in matched), None)
    if leftover is not None:
        matched = [leftover if entry == 0 else entry for entry in matched]

    print(matched)
    return matched
# Convert a string into a list of its characters
def zhuanhuan(ocr):
    """Return the items of *ocr* as a new list.

    For a string this yields one single-character entry per character.

    Args:
        ocr: Any iterable, typically the digit string.

    Returns:
        A new ``list`` with the items of *ocr* in order.
    """
    return list(ocr)
# Extract the still-encoded figures from the page and decode them
def gettext(html, ocr, dic):
    """Pull three encoded figures out of *html* and decode each with ``getz``.

    The three figures are the first match of each pattern below: a
    whitespace-delimited value, the value before ``万人评分`` and the value
    before ``亿``.

    NOTE(review): the prefix is empty and the first pattern is delimited only
    by whitespace, so these patterns look as if surrounding markup was lost
    from them. They are kept exactly as found -- confirm against the page HTML.

    Args:
        html: Decoded page text.
        ocr: Characters aligned positionally with *dic*.
        dic: Glyph names aligned positionally with *ocr*.

    Returns:
        A 3-tuple with whatever ``getz`` returns for each figure, in the order
        listed above.

    Raises:
        IndexError: If any of the three patterns finds no match.
    """
    prefix = ""  # renamed from `str`, which shadowed the builtin
    stary = re.findall(r'\s+(.*?)\s+', html)[0]
    starz = re.findall(prefix + '(.*?)万人评分', html)[0]
    piao = re.findall(r'(.*?)亿', html)[0]

    # Number of ';'-separated pieces before the first '.'; getz uses it to
    # decide where the decimal point is re-inserted.
    zpingfen = starz.split('.')[0].split(';')
    dwz1 = len(stary.split('.')[0].split(';'))
    dwz2 = len(zpingfen)
    dwz3 = len(piao.split('.')[0].split(';'))
    print(dwz2)
    print(zpingfen)

    # Drop every '.' and split into individual encoded pieces. A plain
    # str.replace avoids the invalid '\.' escape of the former non-raw pattern.
    pingfen1 = stary.replace('.', '').split(';')
    zpingfen1 = starz.replace('.', '').split(';')
    zpiao1 = piao.replace('.', '').split(';')

    store = getz(pingfen1, ocr, dwz1, dic)
    zstore = getz(zpingfen1, ocr, dwz2, dic)
    pfang = getz(zpiao1, ocr, dwz3, dic)
    return store, zstore, pfang
# Decode a list of encoded pieces into the readable number
def getz(text, ocr, dwz, dic):
    """Translate encoded pieces into characters of *ocr* and return the string.

    Args:
        text: Encoded pieces; empty strings are skipped.
        ocr: Characters aligned positionally with *dic*.
        dwz: Number of pieces that preceded the decimal point when the source
            text was split on ';'. A '.' is inserted after the piece at index
            ``dwz - 2``.
        dic: Glyph names; the index of a piece's glyph name in this list
            selects the character taken from *ocr*.

    Returns:
        The decoded string. It is also printed, as before; previously it was
        only printed and callers received ``None``.

    Raises:
        ValueError: If a piece does not correspond to any entry of *dic*.
    """
    digits = list(ocr)
    decoded = []
    for j, i in enumerate(text):
        if i != '':
            # NOTE(review): the code used to replace the empty string, which
            # inserts 'uni' between every character and can never produce a
            # glyph name. '&#X' (the upper-cased prefix of a numeric character
            # reference) is the presumed intent -- confirm against the page.
            glyph_name = i.upper().replace('&#X', 'uni')
            decoded.append(digits[dic.index(glyph_name)])
        if j == dwz - 2:
            decoded.append('.')
        print(j)
    result = ''.join(decoded)
    print(result)
    return result
# Characters paired positionally with the reference font's glyphs inside getdick.
ocr = '5810427369'
# Save the page's font locally and get the path of the saved file.
path = getwoff(url)
# Glyph names of the downloaded font, aligned with the characters of `ocr`.
di = getdick(path, ocr)
# Decode the three figures found in the page and print them.
pf, zpf, piaof = gettext(html, ocr, di)
print(pf, zpf, piaof)