使用selenium爬取网易云音乐

import json
import logging
import os
import re
import threading
from queue import Queue

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


class Music:
    """Crawl NetEase Cloud Music playlists with Selenium and download the songs.

    The work is organised as a pipeline whose stages hand data to each other
    through queues::

        category pages -> playlist pages -> song rows -> mp3 downloads

    ``run`` starts one worker thread for the playlist stage, one for the song
    stage and several download threads, then waits for every queue to drain.
    """

    # Logger shared by all worker threads of the crawler.
    _log = logging.getLogger(__name__)
    # Characters that are not allowed in file names on common file systems.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
    # Captures the numeric id that follows an "=" in a song link.
    _SONG_ID_PATTERN = re.compile(r'.*=([0-9]+)')
    # Names of the attributes that hold the browser instances.
    _DRIVER_ATTRS = ("driver_category", "driver_sheets", "driver_songs")

    def __init__(self, save_dir='/home/python/workspace/spider/mp3', request_timeout=30):
        """Start the headless browsers and create the pipeline queues.

        Args:
            save_dir: Directory the downloaded ``.mp3`` files are written to.
                It is created on first download if it does not exist.
            request_timeout: Seconds to wait for the download server before
                giving up on a song.
        """
        self.save_dir = save_dir
        self.request_timeout = request_timeout

        self.start_url = "https://music.163.com/#/discover/playlist"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36"}
        self.part_url = "http://music.163.com/song/media/outer/url?id={}.mp3"

        # Song download URLs.
        self.song_url_queue = Queue()
        # Song names; entries are enqueued in the same order as song_url_queue.
        self.name_queue = Queue()
        # Category page URLs.
        self.category_url_queue = Queue()
        # Playlist ("song sheet") page URLs.
        self.song_sheet_queue = Queue()
        # Makes "take one URL + take one name" atomic across download threads,
        # otherwise two threads can interleave and mix up URL/name pairs.
        self._song_pair_lock = threading.Lock()

        chrome_options = Options()
        # Run Chrome without a visible window.
        chrome_options.add_argument('--headless')

        # One browser per scraping stage; the playlist and song stages run in
        # their own threads (see run) and must not share a browser.
        self.driver_category = webdriver.Chrome(options=chrome_options)
        self.driver_sheets = webdriver.Chrome(options=chrome_options)
        self.driver_songs = webdriver.Chrome(options=chrome_options)

    def __del__(self):
        """Quit the browsers when the crawler is garbage-collected."""
        self.close()

    def close(self):
        """Quit every browser that was started.

        Tolerates a partially initialised instance (a browser failed to start)
        and errors raised by a browser that is already gone.
        """
        for attr in self._DRIVER_ATTRS:
            driver = getattr(self, attr, None)
            if driver is None:
                continue
            try:
                driver.quit()
            except Exception:
                # Best effort: this also runs from __del__, where raising or
                # logging during interpreter shutdown would only add noise.
                pass

    @staticmethod
    def _open_in_content_frame(driver, url):
        """Load ``url`` in ``driver`` and switch into the page's first iframe.

        Raises:
            IndexError: If the loaded page contains no iframe.
        """
        driver.get(url)
        driver.switch_to.default_content()
        frames = driver.find_elements(By.TAG_NAME, 'iframe')
        driver.switch_to.frame(frames[0])

    @classmethod
    def _song_id_from_href(cls, href):
        """Return the numeric song id found in ``href``, or ``None`` if absent."""
        if not href:
            return None
        match = cls._SONG_ID_PATTERN.match(href)
        return match.group(1) if match else None

    @classmethod
    def _safe_filename(cls, name):
        """Return ``name`` with path separators and reserved characters replaced."""
        cleaned = cls._UNSAFE_FILENAME_CHARS.sub('_', name or '').strip()
        return cleaned or 'untitled'

    def parse_category_url(self, url):
        """Open ``url`` in the category browser and enter its content iframe."""
        self._open_in_content_frame(self.driver_category, url)

    def parse_sheets_url(self, url):
        """Open ``url`` in the playlist browser and enter its content iframe."""
        self._open_in_content_frame(self.driver_sheets, url)

    def parse_songs_url(self, url):
        """Open ``url`` in the song browser and enter its content iframe."""
        self._open_in_content_frame(self.driver_songs, url)

    def save_content(self, content_list):
        """Write ``content_list`` as pretty-printed JSON to ``music_163.json``.

        The file is opened in write mode, so each call replaces its contents.
        """
        with open("music_163.json", "w", encoding="utf-8") as f:
            json.dump(content_list, f, ensure_ascii=False, indent=2)

    def get_category_list(self):
        """Queue the URL of every sub-category link on the start page."""
        self.parse_category_url(self.start_url)

        box = self.driver_category.find_element(By.ID, "cateListBox").find_element(By.CLASS_NAME, "bd")
        # Each "f-cb" element is one top-level category group; the "s-fc1"
        # links inside it are that group's sub-categories.
        for group in box.find_elements(By.CLASS_NAME, "f-cb"):
            for link in group.find_elements(By.CLASS_NAME, "s-fc1"):
                href = link.get_attribute("href")
                if href:
                    self.category_url_queue.put(href)

    def _parse_song_row(self, row):
        """Extract ``(download_url, song_name)`` from one song table row.

        Returns ``None`` when the row's link carries no numeric song id.
        """
        link = row.find_element(By.CLASS_NAME, "f-cb").find_element(
            By.CLASS_NAME, "txt").find_element(By.TAG_NAME, "a")
        song_id = self._song_id_from_href(link.get_attribute("href"))
        if song_id is None:
            return None
        title = link.find_element(By.TAG_NAME, "b").get_attribute("title")
        # Fall back to the id so the song still gets a usable file name.
        name = (title or '').strip() or song_id
        return self.part_url.format(song_id), name

    # Find the songs of each playlist.
    def songs_by_sheet(self):
        """Worker loop: turn queued playlist URLs into queued songs.

        For every song row of a playlist the download URL is put on
        ``song_url_queue`` and the name on ``name_queue`` (URL first, so the
        two queues stay aligned). A row that cannot be parsed is logged and
        skipped without abandoning the rest of the playlist. Never returns.
        """
        while True:
            url = self.song_sheet_queue.get()
            try:
                self.parse_songs_url(url)
                rows = self.driver_songs.find_element(By.TAG_NAME, "tbody").find_elements(By.TAG_NAME, "tr")
                for row in rows:
                    try:
                        song = self._parse_song_row(row)
                    except Exception:
                        self._log.exception("Skipping unreadable song row in playlist %s", url)
                        continue
                    if song is None:
                        continue
                    song_url, song_name = song
                    self.song_url_queue.put(song_url)
                    self.name_queue.put(song_name)
            except Exception:
                self._log.exception("Failed to read playlist %s", url)
            finally:
                # Always acknowledge the item, otherwise run() would wait forever.
                self.song_sheet_queue.task_done()

    # Find the playlists of each category.
    def sheets_by_category(self):
        """Worker loop: turn queued category URLs into queued playlist URLs.

        Follows the "next page" link of a category until there is none (or it
        points at a page that was already visited). Never returns.
        """
        while True:
            url = self.category_url_queue.get()
            try:
                visited = set()
                next_url = url
                while next_url and next_url not in visited:
                    visited.add(next_url)
                    # Load the current result page.
                    self.parse_sheets_url(next_url)
                    entries = self.driver_sheets.find_element(By.TAG_NAME, "ul").find_elements(By.TAG_NAME, "li")
                    for entry in entries:
                        anchors = entry.find_elements(By.TAG_NAME, "a")
                        href = anchors[0].get_attribute("href") if anchors else None
                        if href:
                            self.song_sheet_queue.put(href)

                    # Locate the link to the next page, if any.
                    next_links = self.driver_sheets.find_elements(By.XPATH, "//a[@class='zbtn znxt']")
                    next_url = next_links[0].get_attribute("href") if next_links else None
            except Exception:
                self._log.exception("Failed to list playlists of category %s", url)
            finally:
                self.category_url_queue.task_done()

    def _download_one(self, url, name):
        """Download ``url`` and store it as ``<save_dir>/<name>.mp3``.

        Raises:
            requests.RequestException: On network errors, timeouts or an HTTP
                error status.
            OSError: If the target directory or file cannot be written.
        """
        print('正在下载:{}。。。'.format(name))
        response = requests.get(url, headers=self.header, timeout=self.request_timeout)
        # Do not store an error page under an .mp3 name.
        response.raise_for_status()
        os.makedirs(self.save_dir, exist_ok=True)
        path = os.path.join(self.save_dir, self._safe_filename(name) + '.mp3')
        with open(path, 'wb') as f:
            f.write(response.content)

    def download(self):
        """Worker loop: download every queued song. Never returns."""
        while True:
            # Take URL and name as one unit so concurrent download threads
            # cannot end up with the URL of one song and the name of another.
            with self._song_pair_lock:
                url = self.song_url_queue.get()
                name = self.name_queue.get()
            try:
                self._download_one(url, name)
            except Exception:
                self._log.exception("Failed to download %r from %s", name, url)
            finally:
                self.song_url_queue.task_done()
                self.name_queue.task_done()

    def run(self, download_threads=5):
        """Run the whole pipeline and block until every queue is drained.

        Args:
            download_threads: Number of concurrent download worker threads.
        """
        # Collect the category URLs synchronously; everything else is fed by them.
        self.get_category_list()

        workers = [
            # Categories -> playlists.
            threading.Thread(target=self.sheets_by_category, daemon=True),
            # Playlists -> songs.
            threading.Thread(target=self.songs_by_sheet, daemon=True),
        ]
        # Downloads dominate the run time, so they get several threads.
        workers.extend(
            threading.Thread(target=self.download, daemon=True)
            for _ in range(download_threads)
        )
        for worker in workers:
            worker.start()

        # Join from the first stage to the last: a downstream queue is only
        # guaranteed to have received all of its items once the stage feeding
        # it has finished. Joining an (initially empty) downstream queue first
        # would return immediately and let the program exit too early.
        for q in (self.category_url_queue, self.song_sheet_queue,
                  self.song_url_queue, self.name_queue):
            q.join()

if __name__ == '__main__':
    # Script entry point: create the crawler, then run the full pipeline.
    crawler = Music()
    crawler.run()
 

你可能感兴趣的:(使用selenium爬取网易云音乐)