Python入门之爬取百度音乐
先说一下为什么会有这篇文章:首先肯定是有这个需求。本人出差在外地,这里的网速卡到爆,根本无法满足在线听歌的要求,所以就想把歌曲下载到本地慢慢听。这正是Python的绝活。最近我对Python爬虫很有兴趣,于是想到以前写过Qt版本的播放器,拿来一试,接口竟然还可以用。具体音乐信息的获取流程见上一篇《简单几步实现网络音乐播放器(Qt版百度FM)》中的分析。
这里不得不感叹一句:Life is short, you need Python! 它简单优雅到你试一次后就会爱上它,对于爬取个什么美女图片之类的事更是信手拈来。不多废话了,直接上代码,总共才100来行,代码中只做了简单的注释。
import json
import threading
#from bs4 import BeautifulSoup
import re
import os
from urllib.request import urlopen,Request
import socket
socket.setdefaulttimeout(10)
#http://fm.baidu.com/dev/api/?tn=channellist
def get_channel_list(page_url):
    """Fetch the channel list, print every channel name and return the list.

    The raw response text is also saved to ``./channle.json`` (relative to
    the current working directory) as a debugging aid.

    Args:
        page_url: URL of the channel-list endpoint; it must return a JSON
            object with a ``channel_list`` array whose items carry a
            ``channel_name`` key.

    Returns:
        The value stored under ``channel_list`` in the response, or an empty
        dict when the request fails, the body is not valid JSON, or the
        ``channel_list`` key is missing.
    """
    try:
        # The context manager closes the connection even if read/decode fails.
        with urlopen(page_url) as response:
            html_doc = response.read().decode('utf8')
    except Exception:
        # Network errors, timeouts, bad URLs and undecodable bodies all mean
        # "no channels"; unlike a bare ``except`` this lets Ctrl-C through.
        return {}

    with open("./channle.json", mode='w', encoding='utf-8') as cache_file:
        cache_file.write(html_doc)

    # Parse the text already in memory rather than re-opening the cache file:
    # that avoids a leaked file handle and a platform-dependent read encoding.
    try:
        channel_list = json.loads(html_doc)['channel_list']
    except (ValueError, KeyError, TypeError):
        return {}

    for channel in channel_list:
        print(channel['channel_name'])
    return channel_list
def get_song_list(channel_url):
    """Fetch the song entries of one channel.

    The raw response text is also saved to ``./songs.json`` (relative to the
    current working directory) as a debugging aid.

    Args:
        channel_url: URL of the playlist endpoint; it must return a JSON
            object with a ``list`` array.

    Returns:
        The value stored under ``list`` in the response, or an empty dict
        when the request fails, the body is not valid JSON, or the ``list``
        key is missing.
    """
    try:
        # The context manager closes the connection even if read/decode fails.
        with urlopen(channel_url) as response:
            html_doc = response.read().decode('utf8')
    except Exception:
        # Any fetch/decode problem means "no songs"; unlike a bare ``except``
        # this still lets Ctrl-C interrupt the script.
        return {}

    with open("./songs.json", mode='w', encoding='utf-8') as cache_file:
        cache_file.write(html_doc)

    # Parse the in-memory text instead of re-opening the cache file, which
    # leaked a handle and read it back with the platform default encoding.
    try:
        return json.loads(html_doc)['list']
    except (ValueError, KeyError, TypeError):
        return {}
def get_song_real_url(song_url):
    """Resolve a song-info URL to the song's name, download link and size.

    The raw response text is also saved to ``./song.json`` (relative to the
    current working directory) as a debugging aid.

    Args:
        song_url: URL of the song-info endpoint; the response is read as
            JSON and the first entry of ``data.songList`` is used.

    Returns:
        A ``(song_name, song_link, song_size)`` tuple taken from the
        ``songName``, ``songLink`` and ``size`` fields (``size`` converted to
        ``int``).  ``(None, None, 0)`` is returned when the request fails or
        the response does not have the expected structure.
    """
    try:
        # The context manager closes the connection even if read/decode fails.
        with urlopen(song_url) as response:
            html_doc = response.read().decode('utf8')
    except Exception:
        # Any fetch/decode problem means "no song"; unlike a bare ``except``
        # this still lets Ctrl-C interrupt the script.
        return (None, None, 0)

    with open("./song.json", mode='w', encoding='utf-8') as cache_file:
        cache_file.write(html_doc)

    # Parse the in-memory text (no leaked handle, no locale-dependent
    # re-read) and treat malformed JSON like any other unusable response.
    try:
        song = json.loads(html_doc)['data']['songList'][0]
        song_link = song['songLink']
        song_name = song['songName']
        song_size = int(song['size'])
    except (ValueError, KeyError, IndexError, TypeError):
        print('get real link failed')
        return (None, None, 0)
    return song_name, song_link, song_size
def donwn_mp3_by_link(song_link, song_name, song_size):
    """Download one song to ``<song_name>.mp3`` in this script's directory.

    Nothing is downloaded when the target file already exists.  Data is read
    in 8 KiB blocks until at least ``song_size`` bytes have been written or
    the stream ends.  If fewer than ``song_size`` bytes arrive (short stream,
    timeout, or any other error), the partial file is removed and a line is
    appended to ``log.txt`` in the current working directory.

    Args:
        song_link: URL of the audio data.
        song_name: Song title; used for the file name and progress output.
        song_size: Expected size in bytes.

    Returns:
        None.
    """
    # Song names come from a remote API: neutralise path separators so a
    # title such as "AC/DC" cannot point outside the download directory or
    # into a sub-directory that does not exist.
    file_name = song_name.replace('/', '_').replace('\\', '_') + ".mp3"
    base_dir = os.path.dirname(__file__)
    file_full_path = os.path.join(base_dir, file_name)
    if os.path.exists(file_full_path):
        return

    print("begin DownLoad %s, size = %d" % (song_name, song_size))
    block_size = 8192
    down_loaded_size = 0
    try:
        # Both handles are closed by the ``with`` before any clean-up runs,
        # so the partial file can be removed on every platform.
        with urlopen(song_link) as mp3, open(file_full_path, "wb") as mp3_file:
            while down_loaded_size < song_size:
                buffer = mp3.read(block_size)
                if not buffer:
                    break  # stream ended early; handled as incomplete below
                mp3_file.write(buffer)
                down_loaded_size += len(buffer)
                print('%s %d of %d' % (song_name, down_loaded_size, song_size))
    except Exception as err:
        # Best effort: a failed download must not abort the whole run.  The
        # byte count below decides whether the file is kept.
        print('download of %s interrupted: %s' % (song_name, err))

    if down_loaded_size >= song_size:
        print('%s download finshed' % file_full_path)
        return

    if os.path.exists(file_full_path):
        os.remove(file_full_path)
    print('download time out, file deleted')
    with open('log.txt', 'a') as log_file:
        log_file.write("time out rm %s\n" % file_name)
def downViaMutiThread(song_info_list):
    """Download several songs concurrently, one thread per song.

    Args:
        song_info_list: Iterable of ``(song_name, song_link, song_size)``
            tuples; each one is handed to ``donwn_mp3_by_link``.

    Returns:
        None, once every download thread has finished.
    """
    # One worker thread per song; nothing runs until all have been created.
    workers = [
        threading.Thread(target=donwn_mp3_by_link, args=(link, name, size))
        for name, link, size in song_info_list
    ]
    for worker in workers:
        worker.start()
    # Block until every download has completed.
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    import time

    # Pause between attempts when a playlist request yields nothing, so a
    # failing network or API is not hit in a tight loop.
    RETRY_DELAY_SECONDS = 5

    # Step 1: fetch the channel list (this also prints every channel name).
    page_url = 'http://fm.baidu.com/dev/api/?tn=channellist'
    channel_list = get_channel_list(page_url)

    # The channel is fixed, so build its playlist URL once outside the loop.
    channel_url = 'http://fm.baidu.com/dev/api/?tn=playlist&format=json&id=%s' % 'public_yuzhong_yueyu'

    # Keep requesting playlists until the user interrupts the script.
    while True:
        # Step 2: get the ids of all songs in the current playlist.
        song_id_list = get_song_list(channel_url)
        if not song_id_list:
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        for song_id in song_id_list:
            # Step 3: resolve the song id to its name, real link and size.
            song_url = "http://music.baidu.com/data/music/fmlink?type=mp3&rate=320&songIds=%s" % song_id['id']
            song_name, song_link, song_size = get_song_real_url(song_url)
            if song_size != 0:
                # Final step: download the song (single-threaded).  For
                # concurrent downloads, collect (song_name, song_link,
                # song_size) tuples and pass them to downViaMutiThread.
                donwn_mp3_by_link(song_link, song_name, song_size)