# Crawl the Kugou Top 500 chart (爬取酷狗top500)

import requests
from bs4 import BeautifulSoup
# Request headers sent with every page fetch.
# BUG FIX: the original key 'UserAgent' is not a valid HTTP header name,
# so requests still sent its default User-Agent and the site could detect
# the bot.  The correct header name is 'User-Agent'.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}
def get_songs(url):
    """Scrape one Kugou ranking page (22 songs per page).

    Parameters
    ----------
    url : str
        URL of a single ranking page, e.g.
        http://www.kugou.com/yy/rank/home/1-8888.html?from=rank

    Returns
    -------
    list[dict]
        One dict per song with keys 'rank', 'title' and 'time'
        (play duration), each value stripped of whitespace.
    """
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Three parallel column lists scraped from the song-list markup:
    # rank numbers, "artist - title" links, and play durations.
    ranks = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_num')
    titles = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
    times = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > span.pc_temp_tips_r > span')
    # Zip the parallel lists into one dict per song.  Uses a comprehension
    # instead of the original append loop; the original's line-continuation
    # backslashes were unnecessary inside the dict literal.
    return [
        {'rank': rank_tag.get_text().strip(),
         'title': title_tag.get_text().strip(),
         'time': time_tag.get_text().strip()}
        for rank_tag, title_tag, time_tag in zip(ranks, titles, times)
    ]

# --- Script body: crawl all pages of the chart and save to Excel -------
# The Top 500 chart spans 23 pages of 22 songs each (pages 1..23).
totals = []
for page in range(1, 24):
    page_url = 'http://www.kugou.com/yy/rank/home/{}-8888.html?from=rank'.format(page)
    totals.append(get_songs(page_url))  # one list of song dicts per page
    # Progress message only — the original printed the whole accumulated
    # `totals` list on every iteration, re-dumping all previous pages.
    print('fetched page {}/23'.format(page))

import pandas

# Build one DataFrame per page, then concatenate with a fresh continuous
# index (replaces the original manual deal2.index reassignment).
frames = list(map(pandas.DataFrame, totals))
result = pandas.concat(frames, ignore_index=True)
# BUG FIX: writing '.xls' requires the xlwt engine, which pandas removed
# (pandas >= 2.0 only writes .xlsx via openpyxl).
result.to_excel('kougouTOP500.xlsx')  # save the full chart to Excel

# You may also be interested in: Python web scraping (python爬虫)