采集网站:Scrape | Movie
采集十页的数据
通过控制台分析:
_0x5da681 '1640505616'
_0x31a891 ['/api/movie', '1640505616']
首先将_0x31a891转为字符串用,拼接,再进行sha1加密
将加密好的值与时间戳用,拼接成字符串
最后进行base64编码,得到token值
获取token值
# Demo of the site's token obfuscation (same scheme as Scrape.get_token):
# sha1("path,timestamp") joined with the timestamp, then base64-encoded.
t = str(int(time.time()))
L = ['/api/movie']
L.append(t)
md = hashlib.sha1()  # sha1 over "path,timestamp"
md.update(','.join(L).encode('utf-8'))
has = md.hexdigest()
parts = [has, t]  # renamed from 'list' (shadowed the builtin); t is already a str
p = ','.join(parts)
token = base64.b64encode(p.encode('utf-8')).decode('utf-8')
# ob混淆
import base64
import time,hashlib
import requests
from openpyxl.workbook import Workbook
from concurrent.futures.thread import ThreadPoolExecutor
# Module-level workbook that collects the scraped movie ranking.
wb = Workbook()
ws = wb.create_sheet('电影排行榜', index=0)
header = ['片名', '类型', '上映时间', '时长', '评分', '上映地区']
ws.append(header)
class Scrape():
    """Scrape the movie ranking from https://spa6.scrape.center.

    The site protects /api/movie with an obfuscated token:
    ``base64(sha1('/api/movie,<ts>') + ',' + <ts>)`` where ``<ts>`` is a
    Unix-second timestamp. Rows are appended to the module-level
    worksheet ``ws`` and persisted through ``wb``.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
        }
        # Timestamp captured once so every page request shares one token.
        self.t = str(int(time.time()))

    def get_token(self):
        """Reproduce the site's token scheme and return the token string.

        :return: base64 text of "sha1('/api/movie,<ts>'),<ts>".
        """
        parts = ['/api/movie', self.t]
        sha = hashlib.sha1(','.join(parts).encode('utf-8')).hexdigest()
        payload = '{},{}'.format(sha, self.t)  # self.t is already a str
        return base64.b64encode(payload.encode('utf-8')).decode('utf-8')

    def get_data(self, url):
        """Fetch one API page and append each movie to the worksheet.

        :param url: fully-built /api/movie URL including the token.
        """
        res = requests.get(url, headers=self.headers)
        for item in res.json()['results']:
            name = item['name'] + '-' + item['alias']
            # renamed from 'type'/'list' which shadowed builtins
            categories = ' '.join(item['categories'])
            regions = ' '.join(item['regions'])
            ws.append([name, categories, item['published_at'],
                       item['minute'], item['score'], regions])
        wb.save('电影排行榜.xlsx')

    def run(self):
        """Crawl the first ten pages (offset 0, 10, ..., 90)."""
        token = self.get_token()
        for offset in range(0, 91, 10):
            # BUG FIX: the original hard-coded offset=10, so the same page
            # was fetched ten times; interpolate the loop offset instead.
            url = ('https://spa6.scrape.center/api/movie/'
                   '?limit=10&offset={}&token={}'.format(offset, token))
            self.get_data(url)
if __name__ == '__main__':
    obj = Scrape()
    with ThreadPoolExecutor(10) as pool:
        # .result() re-raises any exception from the worker thread; the
        # original fire-and-forget submit silently swallowed failures.
        pool.submit(obj.run).result()