# 爬虫作业2(酷狗音乐) / Crawler assignment 2 (Kugou Music)

import csv
import json
import re

import requests
from lxml import etree

# HTTP headers used for the requests made by this script: a desktop
# browser User-Agent plus a Kugou cookie.
# NOTE(review): the cookie value is masked ('***') here; presumably a real
# kg_mid value is needed for the play-data API to answer -- confirm.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3719.400 QQBrowser/10.5.3715.400'
    ),
    'cookie': 'kg_mid=***',
}

def get_info(url, writer):
    """Scrape one Kugou ranking page and write one row per listed song.

    For every ``<li>`` entry of the ranking list the function reads the
    rank, the "singer - song" label, the duration and the link to the
    song's detail page.  It then downloads the detail page, pulls the
    ``hash`` and ``album_id`` values out of the line that contains
    ``dataFromSmarty``, queries the ``play/getdata`` endpoint with them and
    takes ``data.play_url`` from the JSONP answer.

    Args:
        url: URL of a ranking page.
        writer: Object with a ``writerow(list)`` method (e.g. a
            ``csv.writer``).  Each song produces one row:
            ``[rank, singer, song, time, play_url]``.

    Raises:
        requests.exceptions.RequestException: On network failure or when a
            request exceeds the timeout.
        IndexError: If a list entry lacks one of the expected nodes.
        KeyError / TypeError: If the play-data JSON has no
            ``data.play_url`` entry.
    """
    res = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(res.text)
    # The class attribute really contains two consecutive spaces.
    infos = html.xpath(
        '//div[@class="pc_temp_songlist  pc_rank_songlist_short"]/ul/li')

    for info in infos:
        rank = info.xpath('span[3]')[0].xpath('string(.)').strip()

        # The label has the form "singer - song".  Split on the first '-'
        # only, so a hyphen inside the song title does not truncate it, and
        # strip both parts like the other columns.
        name = info.xpath('a/text()')[0]
        singer, _, song = name.partition('-')
        singer = singer.strip()
        song = song.strip()

        duration = info.xpath('span[5]/span/text()')[0].strip()
        url_link = info.xpath('a/@href')[0]
        res1 = requests.get(url_link, headers=headers, timeout=10)

        # Reset for every song: without this, a detail page that lacks the
        # expected data would reuse the previous song's URL (or raise
        # NameError for the very first song).
        play_url = ''
        for line in res1.text.split('\r'):
            # Debug output kept from the original implementation.
            if 'jQuery' in line:
                print(line)
            if 'dataFromSmarty' not in line:
                continue

            hashes = re.findall(r'"hash":"(.*?)",', line, re.S)
            album_ids = re.findall(r'"album_id":(.*?)}', line, re.S)
            if not hashes or not album_ids:
                continue

            url_index = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery19106328788476737324_1563785427610&hash={}&album_id={}'.format(
                hashes[0], album_ids[0])
            res2 = requests.get(url_index, headers=headers, timeout=10)

            # The answer is JSONP ("callback({...});"); keep only the JSON
            # object between the outermost braces.
            payload = re.match(r".*?({.*}).*", res2.text)
            if payload is None:
                continue
            json_data = json.loads(payload.group(1))
            play_url = json_data['data']['play_url']

        print(rank, singer, song, duration, play_url)
        writer.writerow([rank, singer, song, duration, play_url])

if __name__ == '__main__':
    # Crawl ranking pages 1-5 of chart 6666 and store every song as one row
    # of song.csv (header row first).
    urls = [
        'https://www.kugou.com/yy/rank/home/{}-6666.html?from=rank'.format(i)
        for i in range(1, 6)
    ]

    # The context manager closes the file even when a request or a parse
    # step raises halfway through the crawl.  newline='' is what the csv
    # module expects for files it writes to.
    with open('song.csv', 'w+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['rank', 'singer', 'song', 'time', 'play_url'])
        for url in urls:
            get_info(url, writer)

# 你可能感兴趣的:(爬虫作业2(酷狗音乐)) / You may also be interested in: (Crawler assignment 2 (Kugou Music))