import json
import logging
import os
import re
import threading
from queue import Queue

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
class Music:
    """Crawl music.163.com playlists and download every song that is found.

    The work is organised as a pipeline of queues:

    1. ``get_category_list`` reads the playlist landing page and fills
       ``category_url_queue`` with one URL per category link.
    2. ``sheets_by_category`` walks every page of a category and fills
       ``song_sheet_queue`` with playlist ("sheet") URLs.
    3. ``songs_by_sheet`` opens each playlist and fills ``song_url_queue``
       with ``(download_url, song_name)`` pairs.
    4. ``download`` fetches each pair and writes ``<song_name>.mp3``.

    Each browser-driven stage owns its own headless Chrome instance so the
    worker threads never share a driver.
    """

    _logger = logging.getLogger(__name__)

    # Seconds to wait for the download server before giving up on a song;
    # without a timeout a stalled connection would block ``run`` forever.
    REQUEST_TIMEOUT = 30

    # Characters that cannot (or should not) appear in a file name.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
    # The numeric ``id`` query parameter of a song link.
    _SONG_ID_PATTERN = re.compile(r'[?&]id=(\d+)')

    def __init__(self, download_dir='/home/python/workspace/spider/mp3'):
        """Start the headless browsers and create the pipeline queues.

        Args:
            download_dir: Directory the ``.mp3`` files are written to. It is
                created on demand when the first song is saved.
        """
        chrome_options = Options()
        # Run Chrome without a visible window.
        chrome_options.add_argument('--headless')
        self.driver_category = webdriver.Chrome(options=chrome_options)
        self.driver_sheets = webdriver.Chrome(options=chrome_options)
        self.driver_songs = webdriver.Chrome(options=chrome_options)
        self.start_url = "https://music.163.com/#/discover/playlist"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36"}
        self.part_url = "http://music.163.com/song/media/outer/url?id={}.mp3"
        self.download_dir = download_dir
        # (download_url, song_name) pairs. Both values travel together so
        # concurrent download threads can never pair a URL with the wrong name.
        self.song_url_queue = Queue()
        # No longer fed by the pipeline; kept so code that reads the
        # attribute keeps working.
        self.name_queue = Queue()
        # Category page URLs.
        self.category_url_queue = Queue()
        # Playlist ("sheet") URLs.
        self.song_sheet_queue = Queue()

    def close(self) -> None:
        """Quit every browser that was started; safe to call repeatedly."""
        for attr in ('driver_category', 'driver_sheets', 'driver_songs'):
            # getattr: __init__ may have failed before this driver existed.
            driver = getattr(self, attr, None)
            if driver is None:
                continue
            try:
                driver.quit()
            except Exception:
                # Best effort: the browser may already be gone.
                self._logger.debug("Could not quit %s", attr, exc_info=True)
            setattr(self, attr, None)

    def __del__(self):
        """Release the browsers when the crawler is garbage collected."""
        self.close()

    @classmethod
    def _extract_song_id(cls, href):
        """Return the numeric song id found in *href*, or ``None``."""
        if not href:
            return None
        found = cls._SONG_ID_PATTERN.search(href)
        return found.group(1) if found else None

    @classmethod
    def _safe_filename(cls, name) -> str:
        """Return *name* with path separators and reserved characters replaced."""
        cleaned = cls._UNSAFE_FILENAME_CHARS.sub('_', name or '').strip(' .')
        return cleaned or 'unnamed'

    @staticmethod
    def _open_in_content_frame(driver, url) -> None:
        """Load *url* in *driver* and switch into the page's first iframe."""
        driver.get(url)
        driver.switch_to.default_content()
        frame = driver.find_elements(By.TAG_NAME, 'iframe')[0]
        driver.switch_to.frame(frame)

    def parse_category_url(self, url) -> None:
        """Open *url* in the category browser, focused on its first iframe."""
        self._open_in_content_frame(self.driver_category, url)

    def parse_sheets_url(self, url) -> None:
        """Open *url* in the playlist browser, focused on its first iframe."""
        self._open_in_content_frame(self.driver_sheets, url)

    def parse_songs_url(self, url) -> None:
        """Open *url* in the song browser, focused on its first iframe."""
        self._open_in_content_frame(self.driver_songs, url)

    def save_content(self, content_list) -> None:
        """Write *content_list* to ``music_163.json`` as indented UTF-8 JSON."""
        with open("music_163.json", "w", encoding="utf-8") as f:
            json.dump(content_list, f, ensure_ascii=False, indent=2)

    def get_category_list(self) -> None:
        """Queue the URL of every category link on the start page."""
        self.parse_category_url(self.start_url)
        box = self.driver_category.find_element(By.ID, "cateListBox")
        panel = box.find_element(By.CLASS_NAME, "bd")
        # Each "f-cb" element is one category group; the "s-fc1" elements
        # inside it are the individual category links.
        for group in panel.find_elements(By.CLASS_NAME, "f-cb"):
            for link in group.find_elements(By.CLASS_NAME, "s-fc1"):
                href = link.get_attribute("href")
                if href:
                    self.category_url_queue.put(href)

    def _enqueue_song(self, row) -> None:
        """Queue the download URL and name of the song in table row *row*."""
        link = (row.find_element(By.CLASS_NAME, "f-cb")
                .find_element(By.CLASS_NAME, "txt")
                .find_element(By.TAG_NAME, "a"))
        song_id = self._extract_song_id(link.get_attribute("href"))
        if song_id is None:
            # Without an id no download URL can be built.
            return
        title = link.find_element(By.TAG_NAME, "b").get_attribute("title") or ""
        self.song_url_queue.put((self.part_url.format(song_id), title.strip()))

    def songs_by_sheet(self) -> None:
        """Worker loop: turn each queued playlist URL into queued songs.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            url = self.song_sheet_queue.get()
            try:
                self.parse_songs_url(url)
                rows = (self.driver_songs.find_element(By.TAG_NAME, "tbody")
                        .find_elements(By.TAG_NAME, "tr"))
                for row in rows:
                    # One unreadable row must not discard the rest of the list.
                    try:
                        self._enqueue_song(row)
                    except Exception:
                        self._logger.warning(
                            "Skipping unreadable song row in %s", url, exc_info=True)
            except Exception:
                self._logger.exception("Failed to read playlist %s", url)
            finally:
                # Always balance get() so Queue.join() can return.
                self.song_sheet_queue.task_done()

    def sheets_by_category(self) -> None:
        """Worker loop: turn each queued category URL into queued playlists.

        Follows the "next page" link until the category has no further page.
        Never returns; intended to run in a daemon thread.
        """
        while True:
            page_url = self.category_url_queue.get()
            try:
                while page_url:
                    self.parse_sheets_url(page_url)
                    entries = (self.driver_sheets.find_element(By.TAG_NAME, "ul")
                               .find_elements(By.TAG_NAME, "li"))
                    for entry in entries:
                        href = entry.find_element(By.TAG_NAME, "a").get_attribute("href")
                        if href:
                            self.song_sheet_queue.put(href)
                    # The link to the following page, if there is one.
                    next_links = self.driver_sheets.find_elements(
                        By.XPATH, "//a[@class='zbtn znxt']")
                    page_url = next_links[0].get_attribute("href") if next_links else None
            except Exception:
                self._logger.exception("Failed to read category page %s", page_url)
            finally:
                self.category_url_queue.task_done()

    def download(self) -> None:
        """Worker loop: download each queued song into ``download_dir``.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            url, name = self.song_url_queue.get()
            try:
                print('正在下载:{}。。。'.format(name))
                response = requests.get(
                    url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
                # Do not save an HTTP error page as if it were audio.
                response.raise_for_status()
                os.makedirs(self.download_dir, exist_ok=True)
                path = os.path.join(
                    self.download_dir, self._safe_filename(name) + '.mp3')
                with open(path, 'wb') as f:
                    f.write(response.content)
            except Exception:
                self._logger.exception("Failed to download %r from %s", name, url)
            finally:
                self.song_url_queue.task_done()

    def run(self, download_threads=5) -> None:
        """Run the whole pipeline and block until every song was handled.

        Args:
            download_threads: Number of concurrent ``download`` workers.
        """
        # Categories are collected up front in the calling thread.
        self.get_category_list()

        workers = [
            threading.Thread(target=self.sheets_by_category, daemon=True),
            threading.Thread(target=self.songs_by_sheet, daemon=True),
        ]
        workers.extend(
            threading.Thread(target=self.download, daemon=True)
            for _ in range(download_threads)
        )
        # Daemon threads: the workers loop forever and must not keep the
        # process alive once the queues are drained.
        for worker in workers:
            worker.start()

        # Join in pipeline order. A stage marks an item done only after it
        # has queued all resulting work downstream, so once an upstream
        # queue is drained the next queue already holds everything it will
        # ever receive. Joining downstream queues first could return while
        # they are still empty and end the program early.
        for q in (self.category_url_queue, self.song_sheet_queue, self.song_url_queue):
            q.join()
if __name__ == '__main__':
    # Script entry point: build the crawler, then run the whole pipeline.
    crawler = Music()
    crawler.run()