爬取网易云音乐

效果图


spider_music.py主页面

# coding=gbk
from download import Download
from url_manager import Url_Manager
from html_parser import Html_Parser
from save import Save
from set_text_color import Set_Color


class Spider_Music():
    """Crawl a NetEase Cloud Music playlist page and download its songs.

    Coordinates the collaborators imported at the top of this module: the
    downloader, the URL queue, the HTML parser, the file saver and the
    console colour helper.
    """

    def __init__(self):
        self.download = Download()
        self.url_manager = Url_Manager()
        self.html_parser = Html_Parser()
        self.save = Save()
        self.set_color = Set_Color()

    def craw(self, url, max_retries=3):
        """Process the URL queue, starting from ``url``, until it is empty.

        Args:
            url: Start URL; it is queued under the placeholder name 'temp'.
            max_retries: How many times a URL whose download returned no
                content is put back on the queue before it is given up on.
        """
        self.url_manager.addurl({'url': url, 'name': 'temp'})
        failures = {}  # url -> number of failed download attempts so far

        while self.url_manager.checknewurllength > 0:
            newurl = self.url_manager.geturl()
            name, link = newurl['name'], newurl['url']

            if self.save.checkfile(name):
                self.set_color.printDarkRed("{} 已下载!\n".format(name))
                continue

            print("开始下载 {} {}".format(name, link))
            htmlcontent = self.download.download(link)

            if htmlcontent['htmlcontents'] is None:
                # The download failed. Never hand the missing content to the
                # parser; re-queue the URL a bounded number of times instead
                # of crashing or retrying forever.
                failures[link] = failures.get(link, 0) + 1
                if failures[link] <= max_retries:
                    self.url_manager.delUrl(newurl)
                    self.url_manager.addurl(newurl)
                else:
                    print("下载失败 {} {}".format(name, link))
                continue

            parsed = self.html_parser.parser(htmlcontent)
            if not parsed:
                # Parser produced nothing usable for this page; skip it.
                continue
            newurls, result = parsed

            self.url_manager.addurls(newurls)
            self.save.save(result, name)
            print("下载完成 {} ".format(name))
        print("共下载{}首歌曲".format(self.save.count))

    def main(self):
        """Crawl the playlist hard-coded below."""
        self.craw('https://music.163.com/#/playlist?id=2492536378')

if __name__ == "__main__":
    # Start crawling only when executed as a script, so that importing this
    # module does not kick off a download run as a side effect.
    spider = Spider_Music()
    spider.main()

download.py负责下载

# coding=gbk
import re
import requests
from selenium import webdriver
import random
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options


class Download():
    """Fetch a playlist page (headless Chrome) or a song's bytes (requests).

    ``download(url)`` inspects the URL and returns a dict with two keys:
    ``identify`` (1 for a song URL, 2 for a playlist URL, False otherwise)
    and ``htmlcontents`` (song bytes, page source, '' when nothing matched,
    or None when the fetch failed).
    """

    # User-Agent pool; one entry is chosen at random for each song request.
    __uas = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
    "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    ]

    # Proxy pool as "ip:port" strings. It stays empty unless get_ip() is
    # wired in; no reliable free proxy source was found, so it is not.
    __ips = []

    # Headers used when resolving the redirect of a song URL.
    headers = {
            'Referer':'http://music.163.com/',
            'Host':'music.163.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
            }

    # Path of the chromedriver executable passed to webdriver.Chrome.
    chromedriver_path = "D:\\tools\\chromedriver_win32\\chromedriver.exe"

    # Seconds after which a requests call is abandoned instead of hanging.
    timeout = 30

    # Compiled once; raw strings avoid invalid-escape warnings.
    _playlist_re = re.compile(r"playlist\?id=\d+")            # playlist page
    _song_re = re.compile(r"song/media/outer/url\?id=\d+")    # song download link

    def __init__(self):
        self.url = ''

    def download(self, url):
        """Remember ``url`` and return the result dict built by ``patterns``."""
        self.url = url
        return self.patterns

    @property
    def patterns(self):
        """Classify ``self.url`` and fetch its content accordingly."""
        res = {
            'identify':False,
            'htmlcontents':'',
        }

        if self._song_re.search(self.url):
            res['identify'] = 1
            res['htmlcontents'] = self.getmusic()   # mp3 bytes
        elif self._playlist_re.search(self.url):
            res['identify'] = 2
            res['htmlcontents'] = self.geturl()     # rendered page source

        return res

    # No highly available proxy source was found, so nothing calls this yet.
    def get_ip(self):
        """Scrape a free proxy list and return it as "ip:port" strings.

        Network and parsing errors propagate to the caller.
        """
        url = "https://www.kuaidaili.com/free/inha/1/"
        res = requests.get(url, timeout=self.timeout)
        soup = BeautifulSoup(res.text,'html.parser')
        data = soup.find(id="list").find('tbody').find_all('tr')
        # Pair every IP with the first number that follows it. Matching the
        # ports with a bare r'(\d+)' would also match the IP octets and pair
        # each address with a wrong port.
        # NOTE(review): assumes the port is the next number after the IP in
        # the row markup - confirm against the live page.
        pair_compile = re.compile(r'(\d+\.\d+\.\d+\.\d+)\D+?(\d+)')
        return [":".join(pair) for pair in pair_compile.findall(str(data))]

    @staticmethod
    def _host_of(url):
        """Return the network location (host[:port]) of ``url``."""
        # str.strip('http://') removes a *set of characters* from both ends,
        # which mangles hosts such as 'p1.music...' and any https URL.
        from urllib.parse import urlsplit
        return urlsplit(url).netloc

    def getmusic(self):
        """Download the song behind ``self.url``; return bytes, or None on failure."""
        try:
            url = self.getrealurl()
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Host": self._host_of(url),
                "User-Agent": random.choice(self.__uas)   # vary the simulated browser
                }
            # Use a proxy only when the pool has entries; choosing from the
            # empty pool would raise and make every download fail.
            proxies = None
            if self.__ips:
                ip = random.choice(self.__ips)
                proxies = {
                    'http':'http://'+ip,
                    'https':'http://'+ip
                        }
            res = requests.get(url, headers=headers, proxies=proxies,
                               timeout=self.timeout)
        except Exception as e:
            # Best-effort boundary: the caller treats None as "failed".
            print(e)
            return None
        return res.content

    def getrealurl(self):
        """Follow redirects from ``self.url`` and return the final URL."""
        res = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        return res.url

    def geturl(self):
        """Render ``self.url`` in headless Chrome; return the source of the
        "contentFrame" frame, or None on failure."""
        brower = None
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # NOTE(review): passing the driver path positionally matches the
            # Selenium API this file was written against - confirm for the
            # installed Selenium version.
            brower = webdriver.Chrome(self.chromedriver_path, options=chrome_options)
            brower.get(self.url)
            # switch_to.frame accepts the frame name directly, so no separate
            # element lookup is needed.
            brower.switch_to.frame("contentFrame")
            return brower.page_source
        except Exception as e:
            # Best-effort boundary: the caller treats None as "failed".
            print(e)
            return None
        finally:
            # Always shut the browser down; otherwise each page leaks a
            # Chrome/chromedriver process.
            if brower is not None:
                brower.quit()
        
#d=Download()
#print(d.get_ip())
#res = d.download('http://music.163.com/song/media/outer/url?id=28160459.mp3')
#print(res['identify'])
#print(res['htmlcontents'])

html_parser.py负责网页内容解析

from bs4 import BeautifulSoup

class Html_Parser():
    """Turn a download result into ``(new_urls, payload)`` for the crawler."""

    # Template of a song download link; '{}' receives the 'id=...' query
    # string taken from the playlist row's link.
    baseurl = "http://music.163.com/song/media/outer/url?{}.mp3"

    def parser(self, res):
        """Dispatch on ``res['identify']``.

        Returns:
            ``(None, content)`` when ``identify`` is 1 (the content is passed
            through unchanged), otherwise the result of ``geturls`` on
            ``res['htmlcontents']``.
        """
        if res.get('identify') == 1:
            return None, res.get('htmlcontents', False)
        return self.geturls(res['htmlcontents'])

    def geturls(self, htmlcontent):
        """Extract song entries from the first table in ``htmlcontent``.

        Always returns ``(entries, False)`` so the caller can unpack the
        result even when parsing fails; ``entries`` is a list of
        ``{'url': ..., 'name': ...}`` dicts and may be empty.
        """
        newsurl = []
        try:
            soup = BeautifulSoup(htmlcontent, 'html.parser')
            table = soup.find('table')
            # The first row is skipped (the original treats it as a header).
            songlist = table.find_all('tr')[1:] if table is not None else []
        except Exception as e:
            # Best-effort: unparsable input yields an empty result, not a crash.
            print(e)
            return newsurl, False

        for link in songlist:
            try:
                anchor = link.find_all('td')[1].find('a')
                query = anchor['href'].split('?')[-1]
                name = anchor.find('b')['title']
            except (AttributeError, IndexError, KeyError, TypeError) as e:
                # Skip a malformed row instead of discarding the whole list.
                print(e)
                continue
            newsurl.append({'url': self.baseurl.format(query), 'name': name})

        return newsurl, False

url_manager.py:URL管理器

# coding=gbk
class Url_Manager():
    """Queue of URL entries still to fetch plus a record of those handed out.

    Entries are compared with ``==``/``in``, so any comparable value works
    (the crawler stores ``{'url': ..., 'name': ...}`` dicts).
    """

    def __init__(self):
        # Per-instance lists: as class attributes they would be shared by
        # every Url_Manager instance.
        self.__newurl = []   # entries not fetched yet
        self.__oldurl = []   # entries already handed out by geturl()

    def addurl(self, url):
        """Queue ``url`` unless it is None or already known."""
        if url is None:
            return
        if self.checkurl(url):
            self.__newurl.append(url)

    def addurls(self, urls):
        """Queue every entry of ``urls``; None is ignored."""
        if urls is None:
            return

        for url in urls:
            self.addurl(url)

    def geturl(self):
        """Pop the most recently queued entry, mark it as handed out, return it.

        Raises:
            IndexError: if the queue is empty.
        """
        newurl = self.__newurl.pop()
        self.__oldurl.append(newurl)
        return newurl

    def delUrl(self, url):
        """Forget that ``url`` was handed out so it can be queued again."""
        if url in self.__oldurl:
            self.__oldurl.remove(url)

    @property
    def checknewurllength(self):
        """Number of entries still waiting in the queue."""
        return len(self.__newurl)

    def checkurl(self, url):
        """Return True when ``url`` is neither queued nor already handed out."""
        return url not in self.__newurl and url not in self.__oldurl

save.py保存下载内容

# coding=gbk
import os

class Save():
    """Write downloaded songs to ``path`` as .mp3 files and count successes."""

    path="./download/"
    count = 0

    # Translation table deleting the characters Windows forbids in file
    # names: \ / : * ? " < > |
    _forbidden = str.maketrans('', '', '\\/:*?"<>|')

    def __init__(self):
        self.count = 0
        self.mkdir(self.path)

    def save(self, contents, name):
        """Write ``contents`` (bytes) to the file derived from ``name``.

        Nothing happens when either argument is empty. ``count`` is
        incremented only after a successful write; on failure the error is
        printed and the partial file is removed.
        """
        if not (contents and name):
            return

        target = self.remove_special_characters(name)
        try:
            with open(target,'wb') as f:
                f.write(contents)
        except (OSError, TypeError) as e:
            print(e)
            # A truncated file would make checkfile() report the song as
            # already downloaded on the next run.
            self._discard(target)
        else:
            self.count+=1

    def _discard(self, target):
        """Remove ``target`` if it exists, ignoring filesystem errors."""
        try:
            if os.path.isfile(target):
                os.remove(target)
        except OSError as e:
            print(e)

    # Create the directory the files are stored in.
    def mkdir(self, path):
        """Create ``path`` (and missing parents); a no-op when it exists."""
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(path, exist_ok=True)

    # Prevent downloading the same song twice.
    def checkfile(self, name):
        """Return True when the file for ``name`` already exists.

        The placeholder name 'temp' (used for the start URL) is never
        considered downloaded.
        """
        if name == 'temp':
            return False
        return os.path.exists(self.remove_special_characters(name))

    # Make sure the file can be created on Windows.
    def remove_special_characters(self, string):
        """Return the target file path for the song title ``string``.

        Characters not allowed in Windows file names are removed, surrounding
        whitespace is trimmed, and the result is joined to ``path`` with a
        '.mp3' suffix.
        """
        cleaned = string.translate(self._forbidden).strip()
        # rstrip, not strip: only the trailing separator must go, otherwise
        # the leading '/' of an absolute path is lost.
        return '/'.join([self.path.rstrip('/'), cleaned])+".mp3"

set_text_color.py设置cmd窗口显示颜色

# coding=gbk
#参考地址https://blog.csdn.net/wy_97/article/details/79663014

import ctypes,sys
class Set_Color():
    """Print coloured text in a Windows console (cmd) window.

    Each ``printXxx(mess)`` method switches the console text attribute,
    writes ``mess`` to stdout and then calls ``resetColor()``. Where
    ``ctypes.windll`` is unavailable the text is written without colour.

    Usage::

        c = Set_Color()
        c.printDarkRed('dark red text\n')
    """

    STD_INPUT_HANDLE = -10
    STD_OUTPUT_HANDLE = -11
    STD_ERROR_HANDLE = -12

    # A colour attribute is two hex digits: the high digit is the background
    # colour and the low digit the text colour, each in 0x0-0xf. Foreground
    # and background values can be combined with bitwise OR.

    # Windows CMD text (foreground) colours
    FOREGROUND_BLACK = 0x00 # black.
    FOREGROUND_DARKBLUE = 0x01 # dark blue.
    FOREGROUND_DARKGREEN = 0x02 # dark green.
    FOREGROUND_DARKSKYBLUE = 0x03 # dark skyblue.
    FOREGROUND_DARKRED = 0x04 # dark red.
    FOREGROUND_DARKPINK = 0x05 # dark pink.
    FOREGROUND_DARKYELLOW = 0x06 # dark yellow.
    FOREGROUND_DARKWHITE = 0x07 # dark white.
    FOREGROUND_DARKGRAY = 0x08 # dark gray.
    FOREGROUND_BLUE = 0x09 # blue.
    FOREGROUND_GREEN = 0x0a # green.
    FOREGROUND_SKYBLUE = 0x0b # skyblue.
    FOREGROUND_RED = 0x0c # red.
    FOREGROUND_PINK = 0x0d # pink.
    FOREGROUND_YELLOW = 0x0e # yellow.
    FOREGROUND_WHITE = 0x0f # white.

    # Windows CMD background colours. The two dark entries were previously
    # named BACKGROUND_BLUE / BACKGROUND_GREEN and silently overwritten by
    # the bright definitions below; the bright values are unchanged.
    BACKGROUND_DARKBLUE = 0x10 # dark blue.
    BACKGROUND_DARKGREEN = 0x20 # dark green.
    BACKGROUND_DARKSKYBLUE = 0x30 # dark skyblue.
    BACKGROUND_DARKRED = 0x40 # dark red.
    BACKGROUND_DARKPINK = 0x50 # dark pink.
    BACKGROUND_DARKYELLOW = 0x60 # dark yellow.
    BACKGROUND_DARKWHITE = 0x70 # dark white.
    BACKGROUND_DARKGRAY = 0x80 # dark gray.
    BACKGROUND_BLUE = 0x90 # blue.
    BACKGROUND_GREEN = 0xa0 # green.
    BACKGROUND_SKYBLUE = 0xb0 # skyblue.
    BACKGROUND_RED = 0xc0 # red.
    BACKGROUND_PINK = 0xd0 # pink.
    BACKGROUND_YELLOW = 0xe0 # yellow.
    BACKGROUND_WHITE = 0xf0 # white.

    # ctypes.windll only exists on Windows; fall back to None elsewhere so
    # the module can still be imported and used without colour.
    _kernel32 = ctypes.windll.kernel32 if hasattr(ctypes, 'windll') else None

    # Handle of the console's standard output.
    std_out_handle = _kernel32.GetStdHandle(STD_OUTPUT_HANDLE) if _kernel32 else None

    def set_cmd_text_color(self, color, handle=False):
        """Set the console text attribute to ``color``.

        Args:
            color: Attribute value (foreground | background).
            handle: Console handle to use; the stdout handle when falsy.

        Returns:
            The result of ``SetConsoleTextAttribute``, or False when the
            Windows console API is unavailable.
        """
        if self._kernel32 is None:
            return False
        target = handle if handle else self.std_out_handle
        return self._kernel32.SetConsoleTextAttribute(target, color)

    # Colour restored after every printXxx call.
    # NOTE(review): this sets bright green, although the original comment
    # said "reset white" - confirm which one is intended.
    def resetColor(self):
        self.set_cmd_text_color(self.FOREGROUND_GREEN)

    # Reset to white text.
    def resetDefault(self):
        self.set_cmd_text_color(self.FOREGROUND_RED | self.FOREGROUND_GREEN | self.FOREGROUND_BLUE)

    def _print_colored(self, color, mess):
        """Write ``mess`` to stdout using ``color``, then restore via resetColor()."""
        # Flush around the attribute change so buffered text is emitted
        # while the intended colour is active.
        sys.stdout.flush()
        self.set_cmd_text_color(color)
        try:
            sys.stdout.write(mess)
            sys.stdout.flush()
        finally:
            self.resetColor()

    ###############################################################

    # dark blue
    def printDarkBlue(self, mess):
        self._print_colored(self.FOREGROUND_DARKBLUE, mess)

    # dark green
    def printDarkGreen(self, mess):
        self._print_colored(self.FOREGROUND_DARKGREEN, mess)

    # dark sky blue
    def printDarkSkyBlue(self, mess):
        self._print_colored(self.FOREGROUND_DARKSKYBLUE, mess)

    # dark red
    def printDarkRed(self, mess):
        self._print_colored(self.FOREGROUND_DARKRED, mess)

    # dark pink
    def printDarkPink(self, mess):
        self._print_colored(self.FOREGROUND_DARKPINK, mess)

    # dark yellow
    def printDarkYellow(self, mess):
        self._print_colored(self.FOREGROUND_DARKYELLOW, mess)

    # dark white
    def printDarkWhite(self, mess):
        self._print_colored(self.FOREGROUND_DARKWHITE, mess)

    # dark gray
    def printDarkGray(self, mess):
        self._print_colored(self.FOREGROUND_DARKGRAY, mess)

    # blue
    def printBlue(self, mess):
        self._print_colored(self.FOREGROUND_BLUE, mess)

    # green
    def printGreen(self, mess):
        self._print_colored(self.FOREGROUND_GREEN, mess)

    # sky blue
    def printSkyBlue(self, mess):
        self._print_colored(self.FOREGROUND_SKYBLUE, mess)

    # red
    def printRed(self, mess):
        self._print_colored(self.FOREGROUND_RED, mess)

    # pink
    def printPink(self, mess):
        self._print_colored(self.FOREGROUND_PINK, mess)

    # yellow
    def printYellow(self, mess):
        self._print_colored(self.FOREGROUND_YELLOW, mess)

    # white
    def printWhite(self, mess):
        self._print_colored(self.FOREGROUND_WHITE, mess)

    ##################################################

    # black text on a white background
    def printWhiteBlack(self, mess):
        self._print_colored(self.FOREGROUND_BLACK | self.BACKGROUND_WHITE, mess)

    # black text on a white background (attribute given as a literal)
    def printWhiteBlack_2(self, mess):
        self._print_colored(0xf0, mess)

    # red text on a yellow background
    def printYellowRed(self, mess):
        self._print_colored(self.BACKGROUND_YELLOW | self.FOREGROUND_RED, mess)
#c = Set_Color()
#c.printDarkRed(u'printDarkRed:暗红色文字\n')

你可能感兴趣的:(爬取网易云音乐)