以某云为例,其实不太想写出来,用多了可能不灵了。
构造header
因为是通过网页版抓取,所以请求头要构造得和浏览器发出的一致。
# Shared HTTP session used for page requests. The headers below imitate a
# desktop Chrome browser visiting music.163.com.
r = requests.Session()
# update() keeps the session's case-insensitive header mapping instead of
# replacing it with a plain dict.
r.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'br' is deliberately not advertised: requests only decodes Brotli
    # responses when an optional Brotli library is installed, so offering it
    # can make rc.text come back undecoded.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'https://music.163.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
})
获取歌曲id
以下代码用于获取特定歌手的热门歌曲id。歌手的id可以从歌手主页的网址中看到,例如 https://music.163.com/artist?id=6470 中的6470。
def get_songs(singer_id=6470):
    """Fetch the song ids and titles listed on an artist page.

    Requests ``https://music.163.com/artist?id=<singer_id>`` through the
    module-level session ``r`` and reads the anchors matched by the CSS
    selector ``ul.f-hide li a``. The collected names and ids are printed.

    Args:
        singer_id: Artist id placed in the page URL.

    Returns:
        An iterator of ``(song_id, song_name)`` pairs, where ``song_id`` is a
        string of digits. The iterator is empty when the page does not answer
        with HTTP 200.
    """
    url = 'https://music.163.com/artist?id={}'.format(singer_id)
    rc = r.get(url)
    if rc.status_code != 200:
        # Hand back an empty iterator instead of None so callers can loop
        # over the result unconditionally.
        print('获取歌手{}的歌曲列表失败,状态码:{}'.format(singer_id, rc.status_code))
        return zip((), ())

    anchors = bs4.BeautifulSoup(rc.text, 'lxml').select('ul.f-hide li a')
    names = []
    ids = []
    for anchor in anchors:
        found = re.search('id=([0-9]+)', str(anchor))
        if found is None:
            # Skip anchors that carry no id; appending only the name would
            # shift every later (id, name) pair out of alignment.
            continue
        names.append(anchor.getText())
        ids.append(found.group(1))
    print(names)
    print(ids)
    return zip(ids, names)
获取歌词
根据歌曲id获取歌词。这里没有复用之前的Session,而是用requests.get重新发起请求,估计用之前的Session也没问题。
def get_lyric(song_id=27904290):
    """Download the lyric text of one song with bracketed tags removed.

    Args:
        song_id: Song id placed in the lyric API URL.

    Returns:
        The lyric with every ``[...]`` tag removed and surrounding whitespace
        stripped. Returns ``''`` when the request does not answer with
        HTTP 200, when the response is flagged ``uncollected`` or
        ``nolyric``, or when it carries no lyric text.
    """
    url = 'https://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(song_id)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    print('== 下载{}的歌词...'.format(song_id))
    # A standalone request is issued here rather than going through the
    # shared session.
    rc = requests.get(url, headers=headers)
    if rc.status_code != 200:
        return ''

    # Parse the body once and reuse it.
    result = rc.json()
    if result.get('uncollected', False) or result.get('nolyric', False):
        print('暂无歌词')
        return ''

    # Tolerate responses where 'lrc' or 'lyric' is missing or null.
    lyric = (result.get('lrc') or {}).get('lyric') or ''
    # Match one bracketed tag at a time (presumably LRC timestamps such as
    # [00:12.34] -- confirm against real responses). A greedy r'\[.*\]' would
    # also delete any lyric text sitting between two tags on the same line.
    return re.sub(r'\[[^\]\n]*\]', '', lyric).strip()
循环抓取
每次请求之间随机间隔5到15秒,效果还可以。给出歌手id列表后,就循环把每位歌手的所有热门歌曲歌词抓取并保存。注意代码中的singer_dict需要事先定义,用来把歌手id(字符串)映射到保存歌词的目录名。
if __name__ == '__main__':
    # Artist ids whose songs are downloaded, one output directory per artist.
    singer_list = [4950, 4406]
    for singer in singer_list:
        # NOTE(review): singer_dict is not defined in this part of the file;
        # presumably it maps artist-id strings to directory names -- confirm.
        dir_name = singer_dict[str(singer)]  # per-artist output directory
        os.makedirs(dir_name, exist_ok=True)

        # `or ()` keeps the loop safe if get_songs returns None.
        for sid, sname in get_songs(singer) or ():
            # '/' in a title would be interpreted as a path separator.
            sname = sname.replace('/', '-')
            path = '{}/{}.txt'.format(dir_name, sname)
            if os.path.exists(path):
                print('歌曲已有,跳过')
                continue

            lyric = get_lyric(sid)
            if lyric:
                # The with-statement closes the file; no explicit close().
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(lyric)
                print('歌曲:{} 已经保存完毕'.format(sname))
            # Random pause between lyric requests.
            time.sleep(random.randint(5, 15))
参考
主要参考了这篇文章:《Python 爬虫获取网易云音乐歌手歌词》。