多线程采集网易云所有歌单里歌曲url(爬虫)

简单多线程:利用地址池创建多线程,成倍提高爬虫速度;音乐文件的获取(破解)仍在进行中。

import re
import time
import json
import datetime
import threading
import requests_html

# Output file for the collected song records (JSON text).
path = 'json.txt'

# Shared HTML session used by all worker threads.
session = requests_html.HTMLSession()
# Module-level result accumulators, appended to from multiple threads
# (see Wangyi.many_thread / many_thread_get_song_urls).
data_urls = []  # playlist records: {"url_id": ..., "song_name": ...}
get_all_song_url = []  # song records: {"song_sheet", "song_name", "link", "song_id"}

class Wangyi(object):
    """Multithreaded scraper for NetEase Cloud Music playlist and song URLs.

    Results are accumulated in the module-level lists ``data_urls``
    (playlist detail records) and ``get_all_song_url`` (per-song records).
    """

    # Compiled once at class level; extracts the numeric song id from a link.
    _SONG_ID_RE = re.compile(r'song\?id=(.*)')

    def __init__(self):
        # URLs of the 38 "hot" playlist index pages (35 playlists per page).
        self.list_urls = [
            f'https://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset={offset*35}'
            for offset in range(38)
        ]
        self.head = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
        }
        self.proxies = {
            'http': '171.15.66.177:9999'
        }

    # Scrape every playlist index page (threaded) and return the results.
    def get_all_page_urls(self):
        """Run the index-page threads, then return the shared data_urls list."""
        self.many_thread()
        return data_urls

    # Collect all playlist URLs from a single index page.
    def get_urls(self, url):
        """Append {"url_id", "song_name"} records for one index page to data_urls."""
        rous = session.get(url, headers=self.head, proxies=self.proxies)
        html = requests_html.etree.HTML(rous.text)
        for li in html.xpath('//*[@id="m-pl-container"]/li'):
            hrefs = li.xpath('./div[1]/a/@href')
            titles = li.xpath('./div[1]/a/@title')
            if not hrefs or not titles:
                # Skip malformed entries instead of raising IndexError
                # inside a worker thread.
                continue
            data_urls.append({
                "url_id": 'https://music.163.com' + hrefs[0],  # playlist detail URL
                "song_name": titles[0],  # playlist title
            })

    # Fetch all playlist index pages concurrently (one thread per page).
    def many_thread(self):
        """Start and join one thread per playlist index page."""
        # Pass the bound method as the target; no need to hand self through args.
        threads = [
            threading.Thread(target=self.get_urls, args=(url,))
            for url in self.list_urls
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

    # Collect the URLs of all songs inside one playlist.
    def get_song_url(self, url, song_sheet):
        """Append a record per song on the playlist page to get_all_song_url.

        ``song_sheet`` is the playlist title carried into each record.
        """
        # Reuse the shared header instead of re-declaring an identical dict here.
        rous = session.get(url, headers=self.head)
        html = requests_html.etree.HTML(rous.text)
        for link in html.xpath('//*[@class="f-hide"]/li'):
            names = link.xpath('./a/text()')
            hrefs = link.xpath('./a/@href')
            if not names or not hrefs:
                continue
            full_link = 'https://music.163.com' + hrefs[0]
            match = self._SONG_ID_RE.search(full_link)
            if match is None:
                # No recognizable song id in the href; skip rather than crash.
                continue
            get_all_song_url.append({
                "song_sheet": song_sheet,  # playlist title
                "song_name": names[0],  # song title
                "link": full_link,  # full song page URL
                "song_id": match.group(1),  # numeric song id
            })

    # Fetch every playlist's song list concurrently (one thread per playlist).
    def many_thread_get_song_urls(self):
        """Start and join one thread per playlist detail page."""
        threads_songs = [
            threading.Thread(
                target=self.get_song_url,
                args=(record['url_id'], record['song_name']),
            )
            for record in self.get_all_page_urls()
        ]
        for thread in threads_songs:
            thread.start()
        for thread in threads_songs:
            thread.join()

if __name__ == "__main__":
    Wangyi().many_thread_get_song_urls()
    string = json.dumps(get_all_song_url)
    with open(path, 'w') as f:
        f.write(string)
    print(string)

有12980首歌
(图 1:多线程采集网易云所有歌单里歌曲 url 的运行结果截图)

你可能感兴趣的:(爬虫)