有了《爬取网易云音乐个人动态中的视频(Ⅰ)》和《爬取网易云音乐个人动态中的视频(Ⅱ)》的铺垫, 编写爬虫代码就不再那么困难了.
直接show代码!
先给出加密部分的代码 (保存为 encrypt_api.py, 供后面的主脚本通过 from encrypt_api import encrypt_data 导入)
import base64
from Cryptodome.Cipher import AES
import os
import json
import binascii
# Source: https://blog.csdn.net/tzs_1041218129/article/details/52789153
# Source: https://github.com/darknessomi/musicbox/blob/master/NEMbox/encrypt.py
# Based on the two sources above, with small modifications so that it works with this code
# Names exported by a star-import of this module.
__all__ = ['encrypt_data']
# RSA modulus as a hexadecimal string; rsa() parses it with int(modulus, 16).
MODULUS = ('00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7'
'b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280'
'104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932'
'575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b'
'3ece0462db0a22b8e7')
# RSA public exponent as a hexadecimal string (0x010001 == 65537).
PUBKEY = '010001'
# Fixed 16-byte AES key used by encrypt_data() for the first of its two AES passes.
NONCE = b'0CoJUm6Qyw8W8jud'
def aes(text, key):
    """Encrypt ``text`` with AES in CBC mode and return it Base64-encoded.

    Args:
        text: Plaintext as bytes (or bytearray).
        key: AES key as bytes; both call sites in encrypt_data() pass
            16 bytes.

    Returns:
        The Base64 encoding (bytes) of the ciphertext.
    """
    # Pad up to a whole number of blocks. Each padding byte holds the padding
    # length, and a full extra block is added when the input is already
    # block-aligned (pad is never 0).
    pad = AES.block_size - len(text) % AES.block_size
    padded = text + bytes([pad]) * pad
    # AES.MODE_CBC replaces the magic number 2; the IV is the fixed 16-byte
    # value this scheme uses for every call.
    encryptor = AES.new(key, AES.MODE_CBC, b'0102030405060708')
    return base64.b64encode(encryptor.encrypt(padded))
def rsa(text, pubkey, modulus):
    """Apply textbook RSA (no padding) to the byte-reversed ``text``.

    Args:
        text: Message as bytes; it is reversed before being read as a
            big-endian integer.
        pubkey: Public exponent as a hexadecimal string.
        modulus: Modulus as a hexadecimal string.

    Returns:
        The result as lowercase hexadecimal, left-padded with zeros to
        256 characters.
    """
    reversed_hex = binascii.hexlify(text[::-1])
    message = int(reversed_hex, 16)
    exponent = int(pubkey, 16)
    n = int(modulus, 16)
    cipher = pow(message, exponent, n)
    return '{:x}'.format(cipher).zfill(256)
def encrypt_data(dict_data):
    """Encrypt a request payload into the ``params`` / ``encSecKey`` form fields.

    The dict is serialized to JSON and AES-encrypted twice: first with the
    fixed NONCE, then with a freshly generated 16-byte key. That fresh key is
    RSA-encrypted with PUBKEY / MODULUS so it can be sent along.

    Example input::

        {
            'ids': '["12A059550A712E4DDB3013DCDE3C92B4", "5B0AF067CBB42F7789F7B97E13827565"]',
            'resolution': '1080',
            'csrf_token': '',
        }

    Args:
        dict_data: JSON-serializable dict with the request parameters.

    Returns:
        dict with ``params`` (bytes from aes()) and ``encSecKey`` (hex str
        from rsa()).
    """
    payload = json.dumps(dict_data).encode('utf-8')
    # 16 random hex characters (as bytes) serve as the one-off AES key
    session_key = binascii.hexlify(os.urandom(16))[:16]
    first_pass = aes(payload, NONCE)
    return {
        "params": aes(first_pass, session_key),
        "encSecKey": rsa(session_key, PUBKEY, MODULUS),
    }
接下来是主要的脚本
代码比较粗糙: 其中使用 time.sleep 固定等待几秒, 以期 PhantomJS 把页面加载完成
import csv
import json
import os
import time

import requests
from selenium import webdriver

from encrypt_api import encrypt_data
#############################################
# BEGIN URL and HTTP request header settings
# HTTP headers sent with the POST request to the play-URL API.
header = {
'Accept': '*/*',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'music.163.com',
'Referer': 'http://music.163.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36',
}
# Page of the user's activity feed that is opened in the browser.
event_url = 'http://music.163.com/#/user/event?id=343142613'
# API endpoint the encrypted form data is POSTed to.
# NOTE(review): "enent" is presumably a typo for "event"; the name is kept
# because the request code in this script refers to it.
enent_mv_api_url = 'http://music.163.com/weapi/cloudvideo/playurl'
# END URL and HTTP request header settings
#############################################
# Scrape the video entries from the user's activity page, request their play
# URLs from the API, and store the results under tmp/.

# Every output file lives under tmp/; create it up front so the screenshot and
# the later open() calls do not fail when the directory is missing.
os.makedirs('tmp', exist_ok=True)

driver = webdriver.PhantomJS(executable_path=r'E:\Study\phantomjs-2.1.1-windows\bin\phantomjs.exe')
try:
    driver.get(event_url)
    # Wait 3s and hope the page has loaded by then
    time.sleep(3)
    # Scroll the iframe's window to the bottom of the page
    js = "document.getElementById('g_iframe').contentWindow.scrollTo(0,9999999)"
    driver.execute_script(js)
    time.sleep(3)
    driver.get_screenshot_as_file('tmp/test_screenshot.png')
    driver.switch_to.frame('g_iframe')
    # Read text and attributes while the browser is still alive; the resulting
    # plain dicts remain usable after driver.quit().
    event_mv_list = [
        {'details': i.text, 'id': i.get_attribute('data-vid')}
        for i in driver.find_elements_by_css_selector('div.info.f-pa')
    ]
finally:
    # Always shut the PhantomJS process down, even if scraping failed
    driver.quit()
# Example of event_mv_list[0]:
# {'details': '【耳机体验】3DC音效《BINGBIAN病变》秋仁 - by 自由者音效\n47149\n04:10', 'id': '5B0AF067CBB42F7789F7B97E13827565'}
print('共有%d个视频' % len(event_mv_list))
if event_mv_list:
    event_mv_list_ids = [i['id'] for i in event_mv_list]
    data = {
        # json.dumps yields a double-quoted JSON array, matching the payload
        # shown in the encrypt_data example; str(list) would emit a Python
        # repr with single quotes.
        'ids': json.dumps(event_mv_list_ids),
        "resolution": "1080",
        "csrf_token": ""
    }
    data = encrypt_data(data)
    with requests.session() as sess:
        # A timeout keeps the script from hanging forever on a stalled connection
        resp = sess.post(enent_mv_api_url, data=data, headers=header, timeout=30)
    if resp.status_code == 200:
        # Parse the body once and reuse it for every output file
        urls_info = resp.json()['urls']
        # Explicit encodings: the platform default (a legacy code page on
        # Windows) may be unable to represent every character in the data.
        # Save the video urls, skipping entries that carry no url
        with open('tmp/mv_urls.txt', 'w', encoding='utf-8') as f:
            f.writelines(each['url'] + '\n' for each in urls_info if each.get('url'))
        # Save the raw json
        with open('tmp/resp_json.txt', 'w', encoding='utf-8') as f:
            f.write(resp.text)
        # Save the video info together with the json fields.
        # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
        with open('tmp/mv_info.csv', 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(
                f,
                ['name', 'like', 'time', 'id', 'url', 'size', 'validityTime', 'r'],
                extrasaction='ignore',  # tolerate response keys that are not CSV columns
            )
            writer.writeheader()
            # NOTE(review): assumes the API returns the entries in the same
            # order as the requested ids - confirm. zip() also stops at the
            # shorter sequence instead of raising IndexError.
            for each, mv in zip(urls_info, event_mv_list):
                # Split from the right so a title containing a newline stays
                # intact; the last two lines are the like count and duration.
                parts = mv['details'].rsplit('\n', 2)
                if len(parts) == 3:
                    each['name'], each['like'], each['time'] = parts
                else:
                    # Unexpected layout: keep the raw text rather than crash
                    each['name'] = mv['details']
                writer.writerow(each)
    else:
        print('请求失败, 状态码: %d' % resp.status_code)
给出mv_urls.txt的截图, 如下
有了这些url之后, 可以采用下载工具去进行下载, 诸如IDM等等多线程的下载工具都很不错! 但是需要注意的是, 这样下载下来的文件可能文件名并不是你在网页上面看到的MV名, 所以可能需要配合使用mv_info.csv来对其进行重命名操作. 这是不难的.
以上代码存放在 GetCloudMusicVideoOnEvent