# Python爬虫 爬取音频文件 #只用于学习

from lxml import etree
import requests
import os
from urllib import request,parse
# Album page whose track list is scraped.
url = 'https://www.ximalaya.com/lishi/4164479/'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

# JSON endpoint queried with a track id; the audio URL and title are read
# from its response.
TRACKS_API = 'https://www.ximalaya.com/revision/play/tracks?trackIds='
# Directory (relative to the working directory) that receives the .m4a files.
DOWNLOAD_DIR = 'Down'
# Seconds to wait for each HTTP request made through `requests`.
REQUEST_TIMEOUT = 10
# Characters that are rejected in file names on common file systems.
INVALID_FILENAME_CHARS = '\\/:*?"<>|'


def track_id_from_href(base_url, href):
    """Return the last path segment of *href* resolved against *base_url*.

    A trailing slash, query string or fragment on the link is ignored so the
    result is never empty for a link such as ``/lishi/4164479/12345/``.
    """
    absolute = parse.urljoin(base_url, href)
    path = parse.urlparse(absolute).path
    return path.rstrip('/').rsplit('/', 1)[-1]


def safe_filename(name, replacement='_'):
    """Return *name* with path separators and reserved characters replaced.

    Control characters and the characters in ``INVALID_FILENAME_CHARS`` are
    replaced by *replacement*. An empty result falls back to ``'untitled'``
    so the caller always gets a usable file name.
    """
    cleaned = ''.join(
        replacement if ch in INVALID_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in str(name)
    ).strip()
    return cleaned or 'untitled'


def fetch_track_links(page_url):
    """Download the album page and return the track hrefs found in it."""
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    html_ele = etree.HTML(response.text)
    return html_ele.xpath('//ul[@class="dOi2"]/li/div[2]/a/@href')


def fetch_track_info(track_id):
    """Return ``(src, trackName)`` for *track_id*, or ``None`` if unavailable.

    ``None`` is returned when the response has no ``tracksForAudioPlay``
    entry; ``src`` itself may still be ``None`` when the key is absent or null.
    """
    response = requests.get(
        TRACKS_API + str(track_id), headers=headers, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    payload = response.json()
    tracks = (payload.get('data') or {}).get('tracksForAudioPlay') or []
    if not tracks:
        return None
    first = tracks[0]
    return first.get('src'), first.get('trackName')


def main():
    """Download every track linked from ``url`` into ``DOWNLOAD_DIR``."""
    # Create the target directory once, before the loop; exist_ok avoids the
    # check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    for href in fetch_track_links(url):
        track_id = track_id_from_href(url, href)
        info = fetch_track_info(track_id)
        if not info or not info[0]:
            # No downloadable source in the response: skip this track rather
            # than abort the whole run.
            print('skip track ' + track_id + ': no audio source available')
            continue

        m4a_url, m4a_name = info
        m4a_name = m4a_name or track_id
        filename = os.path.join(DOWNLOAD_DIR, safe_filename(m4a_name) + '.m4a')

        # Announce before the (blocking) download so the message is accurate.
        print(m4a_url + m4a_name + '正在下载ding...。')
        request.urlretrieve(m4a_url, filename)
        print('---' * 50)


if __name__ == '__main__':
    main()

# 你可能感兴趣的:(Python爬虫 爬取音频文件 #只用于学习)