爬虫 day5：易中天《品三国》音频爬取

# 易中天品三国音频爬取


import requests
from lxml import etree
from urllib import parse,request
# Landing page of the album; its pagination bar links to every listing page.
base_url = 'https://www.ximalaya.com/lishi/13396678/'

# Browser-like User-Agent sent with every HTTP request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

# Seconds to wait on a single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# XPath of the first pagination link and of the remaining pagination links.
FIRST_PAGE_XPATH = '//*[@id="root"]/main/section/div/div[2]/div[1]/div[2]/div[2]/div/nav/ul/li[1]/a/@href'
OTHER_PAGES_XPATH = '//*[@id="root"]/main/section/div/div[2]/div[1]/div[2]/div[2]/div/nav/ul/li[@class="Yetd page-item"]/a/@href'

# XPath of the per-track links on one listing page.
TRACK_LINK_XPATH = '//ul[@class="dOi2"]/li/div[2]/a/@href'

# JSON endpoint queried with a track id to obtain the audio source URL.
TRACK_API_URL = 'https://www.ximalaya.com/revision/play/tracks?trackIds='

# Characters that are path separators or are rejected in file names on
# common platforms; each one is replaced by an underscore.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})


def unique_in_order(items):
    """Return *items* as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def track_id_from_url(url):
    """Return the last non-empty path segment of *url* (the track id)."""
    # rstrip guards against a trailing slash, which would otherwise yield ''.
    return url.rstrip('/').split('/')[-1]


def safe_filename(name):
    """Return *name* with unsafe file-name characters replaced by '_'.

    Falls back to ``'untitled'`` when nothing usable is left.
    """
    cleaned = name.translate(_UNSAFE_FILENAME_CHARS).strip()
    return cleaned or 'untitled'


def fetch_html(url):
    """Download *url* and return the parsed lxml element tree.

    Raises ``requests.RequestException`` on network errors, timeouts, or a
    non-success HTTP status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return etree.HTML(response.text)


def list_page_urls():
    """Return the absolute URLs of all listing pages of the album.

    The links are read from the pagination bar of ``base_url``. When no
    pagination link is found, ``base_url`` itself is returned as the only page.
    """
    root = fetch_html(base_url)
    hrefs = root.xpath(FIRST_PAGE_XPATH) + root.xpath(OTHER_PAGES_XPATH)
    # Always resolve against the fixed base_url so one page's URL never
    # becomes the base for the next one.
    page_urls = unique_in_order(parse.urljoin(base_url, href) for href in hrefs)
    return page_urls or [base_url]


def list_track_urls(page_url):
    """Return the absolute URLs of every track linked from *page_url*."""
    root = fetch_html(page_url)
    return [parse.urljoin(page_url, href) for href in root.xpath(TRACK_LINK_XPATH)]


def download_track(track_url):
    """Download the audio of *track_url* into ``<track name>.m4a``.

    The track id is taken from the URL and looked up through the JSON
    endpoint. Tracks for which the endpoint returns no entry or no source URL
    are skipped with a message. Returns the written file name, or ``None``
    when the track was skipped.
    """
    detail_url = TRACK_API_URL + track_id_from_url(track_url)
    response = requests.get(detail_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    payload = response.json()

    tracks = (payload.get('data') or {}).get('tracksForAudioPlay') or []
    if not tracks or not tracks[0].get('src'):
        print('no audio source for ' + track_url)
        return None

    track = tracks[0]
    filename = safe_filename(track.get('trackName') or track_id_from_url(track_url)) + '.m4a'

    # Stream to disk so a whole audio file is never held in memory.
    with requests.get(track['src'], headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as audio:
        audio.raise_for_status()
        with open(filename, 'wb') as out:
            for chunk in audio.iter_content(chunk_size=64 * 1024):
                out.write(chunk)
    return filename


def main():
    """Walk every listing page of the album and download each track."""
    for page_url in list_page_urls():
        for track_url in list_track_urls(page_url):
            print(track_url)
            try:
                download_track(track_url)
            except (requests.RequestException, ValueError) as err:
                # One broken track should not abort the remaining downloads.
                print('failed to download %s: %s' % (track_url, err))


if __name__ == '__main__':
    main()

 

你可能感兴趣的:(爬虫day5 易中天品三国音频爬取)