After working through 58同城's font anti-scraping, my confidence had grown a little, so I went looking for more sites that obfuscate text with font files. Maoyan (猫眼) was a site a client had asked for, so that is where I started.
Maoyan classic movies:
https://maoyan.com/films?sourceId=2&yearId=15&showType=3&offset=0
This is clearly the same kind of font encryption as the Autohome (汽车之家) forums, so I followed the same steps as before:
Step 1: Find the font file and download it.
Step 2: Open the downloaded font file in Font Creator.
Step 3: Read off each character's Unicode code point in glyph order and write down the corresponding character list.
Step 4: Turn those ordered code points into the form they take in the page source (entities like &#xe893;).
Step 5: Replace each such entity in the page with its real character (see the sketch just below).
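To make the five steps concrete, here is a minimal sketch of this static-mapping approach, the one that worked for Autohome. The file name maoyan.woff, the number of placeholder glyphs skipped, and the character list are all assumptions for illustration:

from fontTools.ttLib import TTFont

font = TTFont('maoyan.woff')             # assumed local font file
glyph_order = font.getGlyphOrder()[2:]   # skip placeholder glyphs (the count varies per font)
chars = ['3', '1', '0', '5', '2', '6', '7', '9', '8', '4']   # read off manually in Font Creator

def decode(html):
    # rewrite every &#xe...; entity into the character its glyph draws
    for name, ch in zip(glyph_order, chars):
        html = html.replace('&#x' + name[3:].lower() + ';', ch)   # 'uniE893' -> '&#xe893;'
    return html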
I hacked away following those simple steps, and the result turned out to be just plain wrong; I know this pain firsthand. I puzzled over why for quite a while and got nowhere, so I turned to the internet. Fortunately there are plenty of Maoyan scrapers around, and I borrowed ideas from them:
I scraped the movie listings for 2019, 2018 and 2017; the fields include title, release date, release region, market, rating, box office, total rating count (not in this version of the source), and actors. The remaining fields are left for readers to add (my time and ability are limited).
So why is Maoyan different from Autohome? The crucial point: Maoyan uses multiple font files and returns a different one on each request, which is why the static mapping never lines up. So I simply visited the page twice, downloaded both fonts, and compared them:
Although both downloads contain the same characters, each character's code has changed and no longer matches the first font at all. How do we solve this? The character codes change, but does the structure of the glyph itself change too? (That intuition pays off shortly; it is a direction worth keeping in mind.) We can find out by inspecting the internal data structures of the two font files, as follows:
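As a quick sanity check, here is a sketch that compares glyph outlines across two downloads using fontTools directly, without going through the XML dump; the file names f1.woff and f2.woff and the two skipped placeholder glyphs are assumptions:

from fontTools.ttLib import TTFont

f1, f2 = TTFont('f1.woff'), TTFont('f2.woff')   # two downloads of the "same" font
for n1 in f1.getGlyphOrder()[2:]:
    coords1 = list(f1['glyf'][n1].getCoordinates(f1['glyf'])[0])
    for n2 in f2.getGlyphOrder()[2:]:
        coords2 = list(f2['glyf'][n2].getCoordinates(f2['glyf'])[0])
        if coords1 == coords2:   # identical outline points => same character
            print(n1, 'renders the same character as', n2)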
First, view the page source by hand, find the @font-face declaration, and download the font file directly. The woff address I downloaded this time is:
vfile.meituan.net/colorstone/c4e1abb1a992f1f2bf87821129b3a40d2280.woff
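You can also fetch it programmatically; a minimal sketch (the URL is the one above, which is protocol-relative in the page source, and test.woff matches the file name used in the next snippet):

import requests

woff_url = 'http://vfile.meituan.net/colorstone/c4e1abb1a992f1f2bf87821129b3a40d2280.woff'
resp = requests.get(woff_url)
with open('test.woff', 'wb') as f:   # saved under the name the next snippet expects
    f.write(resp.content)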
from fontTools.ttLib import TTFont
font = TTFont('test.woff')   # open the local font file test.woff
font.saveXML('test.xml')     # dump the font to XML so we can inspect its internal structure
Open test.xml and you will see an HTML-like tag structure. The two tables we need are cmap, whose cmap_format_4 entries map each character code to a glyph name, and glyf, whose TTGlyph entries hold each glyph's outline as contour points.
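If you just want to eyeball the mapping, here is a short sketch that prints the cmap without opening the XML by hand (assuming the test.woff saved above):

from fontTools.ttLib import TTFont

font = TTFont('test.woff')
for code, name in font.getBestCmap().items():
    print(hex(code), '->', name)   # e.g. 0xe893 -> uniE893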
I am out of time, so here is the source code as-is, warts and all (to be polished later):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : caiji.py
# @Author: itlaolang
# @Date : 2020-01-07 16:51
# @Contact : [email protected]
# @Software : PyCharm
# @Desc : scrape Maoyan movie data
import os
import json
from xml.dom.minidom import parse
import xmltodict
from fontTools.ttLib import TTFont
def printtree(p):
    # recursively walk a DOM tree (leftover debug helper, not used below)
    if not p.hasChildNodes():
        return
    for x in p.childNodes:
        printtree(x)
def th(xmldoc1):
    # Parse the XML dump of the freshly downloaded font and return
    # {character code: contour point list} for every real glyph.
    xmldoc = parse(xmldoc1)
    text = json.dumps(xmltodict.parse(xmldoc.toxml()), indent=4)
    text_json = json.loads(text)
    cmap_list = []
    glyf_list = []
    for k, v in text_json.get('ttFont').items():
        if k == 'cmap':
            cmap_list = v.get('cmap_format_4')   # character code -> glyph name tables
        if k == 'glyf':
            glyf_list = v.get('TTGlyph')         # glyph outlines
    ysbm = {}   # glyph name -> character code, e.g. 'uniE893' -> 'xe893'
    for i in cmap_list:
        if i['@platEncID'] == '3':
            num1 = 0
            for ii in i['map']:
                if num1 != 0:   # skip the first map entry (the placeholder glyph)
                    ysbm[ii['@name']] = ii['@code'][1:]   # '0xe893' -> 'xe893'
                num1 = num1 + 1
    print(ysbm)
    temp1 = {}   # glyph name -> list of contour points
    num2 = 0
    for i in glyf_list:
        if num2 != 0:   # skip the placeholder glyph
            for k, v in i.items():
                data1 = []
                if k == 'contour':
                    # a single contour parses as a dict, several contours as a list
                    if type(v) == dict:
                        for pt in v.get('pt'):
                            data1.append(pt)
                    else:
                        for contour in v:
                            for pt in contour.get('pt'):
                                data1.append(pt)
                    temp1[i['@name']] = data1
        num2 = num2 + 1
    data2 = {}   # character code -> contour points
    for k, v in ysbm.items():
        data2[v] = temp1[k]
    return data2
def test():
    # Same extraction as th(), but for the reference font test.xml whose glyphs
    # were identified by hand in Font Creator; returns {digit: contour point list}.
    xmldoc1 = parse("test.xml")
    text = json.dumps(xmltodict.parse(xmldoc1.toxml()), indent=4)
    text_json = json.loads(text)
    cmap_list = []
    glyf_list = []
    for k, v in text_json.get('ttFont').items():
        if k == 'cmap':
            cmap_list = v.get('cmap_format_4')
        if k == 'glyf':
            glyf_list = v.get('TTGlyph')
    ysbm = {}
    for i in cmap_list:
        if i['@platEncID'] == '3':
            num1 = 0
            for ii in i['map']:
                if num1 != 0:
                    ysbm[ii['@name']] = ii['@code'][1:]
                num1 = num1 + 1
    temp1 = {}
    num2 = 0
    for i in glyf_list:
        if num2 != 0:
            for k, v in i.items():
                data1 = []
                if k == 'contour':
                    if type(v) == dict:
                        for pt in v.get('pt'):
                            data1.append(pt)
                    else:
                        for contour in v:
                            for pt in contour.get('pt'):
                                data1.append(pt)
                    temp1[i['@name']] = data1
        num2 = num2 + 1
    # the digits of the reference font, in glyph order, read off in Font Creator;
    # relies on dicts preserving insertion order (Python 3.7+) so that ysbm
    # iterates in the same glyph order
    j = ["3", "1", "0", "5", "2", "6", "7", "9", "8", "4"]
    numb = 0
    data2 = {}
    for k, v in ysbm.items():
        data2[j[numb]] = temp1[k]   # digit -> contour points
        numb = numb + 1
    return data2
def tihuan(s, ss):
    # Match the unknown font (s: code -> points) against the reference font
    # (ss: digit -> points): whenever a glyph shares a contour point with a
    # reference digit, map its character code to that digit.
    data = {}
    for k, v in s.items():
        for i in v:
            for k1, v1 in ss.items():
                for ii in v1:
                    if i == ii:
                        data[k] = k1
                        break
    print('______________')
    print(data)
    return data
import requests
from bs4 import BeautifulSoup
import re
import time
import csv
def xie(aa):
    # append one movie record to the CSV file
    b = [aa[0], aa[1], aa[2], aa[3], aa[4], aa[5], aa[6]]
    c = []
    c.append(b)
    with open("rul18.csv", "a+", newline="") as csvfile:
        writer = csv.writer(csvfile)
        # writer.writerow(["index", "a_name", "b_name"])  # header row, written once if needed
        writer.writerows(c)
heard = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Cookie': '__mta=252426550.1578386784263.1578880752527.1578880757029.106; uuid_n_v=v1; uuid=305C6350312A11EAAFF74B96B91393FE697BD1F626FD454B9C9BE17C4573B069; _csrf=912fb7d87b6b2be10fa6c006c7f8ff3dca2e8046d150df287ae7b43373a62fe9; _lxsdk_cuid=16f7f2efd26c8-06db984d1a3a07-1d376b5b-13c680-16f7f2efd26c8; mojo-uuid=3628b7cca1fbe628e02e7217d561824a; lt=uWHePaJz08QdlIPJYXQc4EwhOp8AAAAAyQkAAHvmgHwm3_XWExp-YRStdbxmhB4H3lUFAlMqeZT2cVPczJr6q_RBKT-rbRuf6-2kdg; lt.sig=DWBPDQrL5UqBOS-B49b5VJfzr7c; theme=moviepro; _lxsdk=305C6350312A11EAAFF74B96B91393FE697BD1F626FD454B9C9BE17C4573B069; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1578663920,1578668029,1578827974,1578855616; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; mojo-session-id={"id":"1cb15fec3630c1c5d56b4c9512f97fa1","time":1578880176906}; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578880909; __mta=252426550.1578386784263.1578880757029.1578880909045.107; mojo-trace-id=77; _lxsdk_s=16f9c978a6c-233-4ff-215%7C2796253285%7C102'
}
def xq(xqurl):
    # scrape a single movie detail page, e.g. https://maoyan.com/films/1279731
    html = requests.get(xqurl, headers=heard).text
    # slice the woff URL out of the inline @font-face CSS (fragile but works)
    woff = html[html.find("embedded-opentype") + 39:html.find("'woff'") - 10]
    wofflink = 'http://' + woff
    r = requests.get(wofflink, headers=heard)
    with open('tt.woff', "wb") as f:
        f.write(r.content)
    font1 = TTFont('tt.woff')   # read the downloaded woff
    font1.saveXML('tt.xml')     # convert it to XML
    data = tihuan(th('tt.xml'), test())
    # swap every obfuscated entity, e.g. &#xe893;, for its real digit
    for k, v in data.items():
        print(k, v)
        html = html.replace("&#" + k + ";", v)
    htmltxet = BeautifulSoup(html, "html.parser")
    pianming = htmltxet.find("h3", attrs={"class": "name"}).text.strip()   # title
    # the three "ellipsis" rows: genre, region/runtime, release date
    leixing = htmltxet.find_all("li", attrs={"class": "ellipsis"})[0].text.strip()
    shany = htmltxet.find_all("li", attrs={"class": "ellipsis"})[1].text.strip()
    dalu = htmltxet.find_all("li", attrs={"class": "ellipsis"})[2].text.strip()
    print(pianming)
    print(leixing, shany, dalu)
    if htmltxet.find("span", attrs={"class": "stonefont"}) != None:
        pingfen1 = htmltxet.find("span", attrs={"class": "stonefont"}).text   # rating
    else:
        pingfen1 = 'N/A'
    if htmltxet.find("div", attrs={"class": "movie-index-content box"}) != None:
        pioafang = htmltxet.find("div", attrs={"class": "movie-index-content box"}).text   # box office
    else:
        pioafang = 'N/A'
    print(pioafang)
    print(pingfen1)
    yy = ''
    for i in htmltxet.find_all("li", attrs={"class": "celebrity actor"}):
        print(i.find('a', attrs={"class": "name"}).text.strip())
        yy = yy + "," + i.find('a', attrs={"class": "name"}).text.strip()   # actor list
    data = []
    data.append(pianming)
    data.append(leixing)
    data.append(shany)
    data.append(dalu)
    data.append(pingfen1)
    data.append(pioafang)
    data.append(yy)
    xie(data)
d = 0
for ii in range(6, 68):   # listing pages, 30 movies per page
    print(ii)
    url = "https://maoyan.com/films?sourceId=2&yearId=13&showType=3&sortId=3&offset=" + str(ii * 30)
    html = requests.get(url, headers=heard).text
    htmltxet = BeautifulSoup(html, "html.parser")
    for i in htmltxet.find_all("div", attrs={"class": "movie-item"}):
        xqurl = "https://maoyan.com" + i.find('a').get('href')
        d = d + 1
        print("movie link #{0}: {1}".format(d, xqurl))
        xq(xqurl)
        # time.sleep(4)
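To see what the substitution step actually does, here is a hypothetical example; the codes and digits are invented, and real values come from tihuan():

mapping = {'xe893': '9', 'xf1a2': '1'}   # hypothetical output of tihuan()
html = '<span class="stonefont">&#xe893;.&#xf1a2;</span>'
for k, v in mapping.items():
    html = html.replace('&#' + k + ';', v)
print(html)   # -> <span class="stonefont">9.1</span>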
Source file download link:
https://download.csdn.net/download/qq_31151511/12099755