使用Python爬取并下载喜马拉雅专辑

概述

使用Python爬取并下载喜马拉雅专辑

详细

准备工作

requests安装

pip install requests

lxml安装

pip install lxml

Selenium安装

pip install selenium

分析请求页面

  • Chrome浏览器打开专辑页面

  • F12打开调试窗口

  • 切换到 Network选项卡
  • F5刷新页面

获取专辑信息

  • 在请求结果中找到包含albumId=9742789 关键字的请求
  • 查看这些请求返回的结果中是否包含我们想要的数据

URL

    https://www.ximalaya.com/revision/album?albumId=9742789

HEAD

    Host: www.ximalaya.com
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
    Accept-Language: zh-CN,en-US;q=0.5
    Accept-Encoding: gzip, deflate, br
    DNT: 1
    Connection: keep-alive
    Cookie: _xmLog=xm_k3y398c7i05jvx; device_id=xm_1575874949738_k3y399bezjz1wg; 1&remember_me=y; 1&_token=33137912&FF000ABB23B848DBA185E16B631D2DF5NdVE12943B8C0EBB477803F032FA8A489EDC16D751007A6314F14EFABE75DD5DFD1; s&e=e7046fac04d436894cbdd17a6a8743e8; s&a=%1FP_WT%09%1A%04J_%0BWYYCZN%0C%09%07%0AR@%02@%0AWXZT%1E[VPCTBVLOKRYSBW@X; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A
    Upgrade-Insecure-Requests: 1

PAYLOAD

    {
        "ret": 200,
        "msg": "成功",
        "data": {
            "isSelfAlbum": false,
            "currentUid": 33137912,
            "albumId": 9742789,
            "mainInfo": {
                "albumStatus": 1,
                "showApplyFinishBtn": false,
                "showEditBtn": false,
                "showTrackManagerBtn": false,
                "showInformBtn": true,
                "cover": "//imagev2.xmcdn.com/group69/M01/51/D8/wKgMeV3L1dizfiHjAAC_6wnqGFY23.jpeg",
                "albumTitle": "郭德纲高清相声集【精选】",
                "crumbs": {
                    "categoryId": 12,
                    "categoryPinyin": "xiangsheng",
                    "categoryTitle": "相声评书",
                    "subcategoryId": 20,
                    "subcategoryName": "相声",
                    "subcategoryDisplayName": "相声",
                    "subcategoryCode": "xiangsheng"
                },
                "updateDate": "2020-04-01",
                "createDate": "2017-08-02",
                "playCount": 368527505,
                "isPaid": false,
                "isFinished": 1,
                "metas": [
                    {
                        "metaValueId": 1684,
                        "metaDataId": 108,
                        "categoryId": 12,
                        "isSubCategory": false,
                        "categoryName": "xiangsheng",
                        "categoryPinyin": "xiangsheng",
                        "metaValueCode": "guodegang",
                        "metaDisplayName": "郭德纲",
                        "link": "/xiangsheng/xiangsheng/mr108t1684/"
                    },
                    {
                        "metaValueId": 18,
                        "metaDataId": 47,
                        "categoryId": 12,
                        "isSubCategory": false,
                        "categoryName": "xiangsheng",
                        "categoryPinyin": "xiangsheng",
                        "metaValueCode": "duikou",
                        "metaDisplayName": "对口",
                        "link": "/xiangsheng/xiangsheng/mr47t18/"
                    }
                ],
                "isSubscribe": true,
                "richIntro": "",
                "shortIntro": "",
                "detailRichIntro": "",
                "isPublic": true,
                "hasBuy": false,
                "vipType": 0,
                "canCopyText": true,
                "subscribeCount": 1200469,
                "sellingPoint": {},
                "personalDescription": "",
                "bigshotRecommend": "",
                "outline": "郭德纲高清相声集,喜马独家音频放送",
                "customTitle": "",
                "produceTeam": "",
                "recommendReason": "郭德纲献给老司机们的福利,不可错过"
            },
            "anchorInfo": {
                "anchorId": 1000202,
                "anchorCover": "//imagev2.xmcdn.com/group1/M00/0B/3D/wKgDrlESHqyTqakZAADewk1yMt8360.jpg",
                "showFollowBtn": false,
                "anchorName": "德云社郭德纲相声VIP",
                "anchorGrade": 16,
                "anchorGradeType": 2,
                "anchorAlbumsCount": 95,
                "anchorTracksCount": 4772,
                "anchorFollowsCount": 9,
                "anchorFansCount": 13625314,
                "personalIntroduction": "郭德纲领衔德云社唯一授权音频平台。郭德纲相声,包括其著名的君臣斗、马寿出世、宋金刚押宝、解学士等。",
                "showAnchorAlbumModel": true,
                "anchorAlbumList": [
                    {
                        "albumId": 35825497,
                        "albumTitle": "德云社20周年闭幕庆典 2017",
                        "cover": "//imagev2.xmcdn.com/group78/M06/13/37/wKgO4F57GOHh4sHOAAuseqOKTow064.jpg",
                        "playCount": 283972,
                        "tracksCount": 6,
                        "anchorId": 1000202,
                        "anchorName": "德云社郭德纲相声VIP",
                        "url": "/xiangsheng/35825497/"
                    },
                    {
                        "albumId": 35200768,
                        "albumTitle": "郭德纲相声专场 熊本站2017",
                        "cover": "//imagev2.xmcdn.com/group74/M09/30/EF/wKgO0l5rKwuwXxIzAAeRgVnaH9E885.jpg",
                        "playCount": 1108877,
                        "tracksCount": 6,
                        "anchorId": 1000202,
                        "anchorName": "德云社郭德纲相声VIP",
                        "url": "/xiangsheng/35200768/"
                    }
                ],
                "hasMoreBtn": true,
                "logoType": 4
            },
            "tracksInfo": {
                "trackTotalCount": 129,
                "sort": 0,
                "tracks": [
                    {
                        "index": 1,
                        "trackId": 46430558,
                        "isPaid": false,
                        "tag": 0,
                        "title": "郭德纲《学徒艰辛》老郭打磕巴了,罕见",
                        "playCount": 18845946,
                        "showLikeBtn": true,
                        "isLike": false,
                        "showShareBtn": true,
                        "showCommentBtn": true,
                        "showForwardBtn": true,
                        "createDateFormat": "2年前",
                        "url": "/xiangsheng/9742789/46430558",
                        "duration": 1993,
                        "isVideo": false,
                        "videoCover": null,
                        "isVipFirst": false,
                        "breakSecond": 0,
                        "length": 1993
                    },
                    {
                        "index": 30,
                        "trackId": 46219386,
                        "isPaid": false,
                        "tag": 0,
                        "title": "郭德纲《于谦相亲被强吻》皮条胡同拉家",
                        "playCount": 2859168,
                        "showLikeBtn": true,
                        "isLike": false,
                        "showShareBtn": true,
                        "showCommentBtn": true,
                        "showForwardBtn": true,
                        "createDateFormat": "2年前",
                        "url": "/xiangsheng/9742789/46219386",
                        "duration": 1480,
                        "isVideo": false,
                        "videoCover": null,
                        "isVipFirst": false,
                        "breakSecond": 0,
                        "length": 1480
                    }
                ],
                "pageNum": 1,
                "pageSize": 30,
                "lastPlayTrackId": 46219230
            },
            "subSiteAlbumUrl": "",
            "recommendKw": {
                "sourceKw": "郭德纲高清相声集【精选】",
                "recommendText": [
                    "听云鹏的相声",
                    "单田芳水浒",
                    "郭德纲相声超清",
                    "郭德纲于谦高清相声"
                ]
            },
            "draft": null,
            "isTemporaryVIP": false
        }
    }

获取专辑列表

  • 在请求结果中找到包含 getTracksList关键字的请求
  • 查看这些请求返回的结果中是否包含我们想要的数据

 URL

    https://www.ximalaya.com/revision/album/v1/getTracksList?albumId=9742789&pageNum=1&pageSize=1000

HEAD

    Host: www.ximalaya.com
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
    Accept-Language: zh-CN,en-US;q=0.5
    Accept-Encoding: gzip, deflate, br
    DNT: 1
    Connection: keep-alive
    Cookie: _xmLog=xm_k3y398c7i05jvx; device_id=xm_1575874949738_k3y399bezjz1wg; 1&remember_me=y; 1&_token=33137912&FF000ABB23B848DBA185E16B631D2DF5NdVE12943B8C0EBB477803F032FA8A489EDC16D751007A6314F14EFABE75DD5DFD1; s&e=e7046fac04d436894cbdd17a6a8743e8; s&a=%1FP_WT%09%1A%04J_%0BWYYCZN%0C%09%07%0AR@%02@%0AWXZT%1E[VZ_OXOB[VZVU^OBWN; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A
    Upgrade-Insecure-Requests: 1
    Cache-Control: max-age=0, no-cache
    TE: Trailers
    Pragma: no-cache

PAYLOAD

{
    "ret": 200,
    "data": {
        "currentUid": 33137912,
        "albumId": 9742789,
        "trackTotalCount": 129,
        "sort": 0,
        "tracks": [
            {
                "index": 1,
                "trackId": 46430558,
                "isPaid": false,
                "tag": 0,
                "title": "郭德纲《学徒艰辛》老郭打磕巴了,罕见",
                "playCount": 18826557,
                "showLikeBtn": true,
                "isLike": false,
                "showShareBtn": true,
                "showCommentBtn": true,
                "showForwardBtn": true,
                "createDateFormat": "2年前",
                "url": "/xiangsheng/9742789/46430558",
                "duration": 1993,
                "isVideo": false,
                "videoCover": null,
                "isVipFirst": false,
                "breakSecond": 742,
                "length": 1993
            },
            {
                "index": 2,
                "trackId": 46430560,
                "isPaid": false,
                "tag": 0,
                "title": "郭德纲《三米高的马桶》上厕所靠重力",
                "playCount": 18212042,
                "showLikeBtn": true,
                "isLike": false,
                "showShareBtn": true,
                "showCommentBtn": true,
                "showForwardBtn": true,
                "createDateFormat": "2年前",
                "url": "/xiangsheng/9742789/46430560",
                "duration": 1898,
                "isVideo": false,
                "videoCover": null,
                "isVipFirst": false,
                "breakSecond": 0,
                "length": 1898
            },
            {
                "index": 3,
                "trackId": 46430562,
                "isPaid": false,
                "tag": 0,
                "title": "郭德纲 《京中名妓》你先应付着",
                "playCount": 13770315,
                "showLikeBtn": true,
                "isLike": false,
                "showShareBtn": true,
                "showCommentBtn": true,
                "showForwardBtn": true,
                "createDateFormat": "2年前",
                "url": "/xiangsheng/9742789/46430562",
                "duration": 1683,
                "isVideo": false,
                "videoCover": null,
                "isVipFirst": false,
                "breakSecond": 0,
                "length": 1683
            }
        ],
        "pageNum": 1,
        "pageSize": 3,
        "superior": [],
        "lastPlayTrackId": 46430558
    }
}

获取播放文件

  • 清空请求列表
  • 点击任意一个声音开始播放
  • 在新的请求结果中找到包含 id=46430558关键字(播放的声音ID)的请求
  • 查看这些请求返回的结果中是否包含我们想要的数据

URL

https://www.ximalaya.com/revision/play/v1/audio?id=46430558&ptype=1

HEAD

    Host: www.ximalaya.com
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
    Accept-Language: zh-CN,en-US;q=0.5
    Accept-Encoding: gzip, deflate, br
    DNT: 1
    Connection: keep-alive
    Cookie: _xmLog=xm_k3y398c7i05jvx; device_id=xm_1575874949738_k3y399bezjz1wg; 1&remember_me=y; 1&_token=33137912&FF000ABB23B848DBA185E16B631D2DF5NdVE12943B8C0EBB477803F032FA8A489EDC16D751007A6314F14EFABE75DD5DFD1; s&e=e7046fac04d436894cbdd17a6a8743e8; s&a=%1FP_WT%09%1A%04J_%0BWYYCZN%0C%09%07%0AR@%02@%0AWXZT%1E[VZCSBRMOKRYSBZOW; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A
    Upgrade-Insecure-Requests: 1
    Cache-Control: max-age=0
    TE: Trailers

PAYLOAD

    {
        "ret": 200,
        "data": {
            "trackId": 46430558,
            "canPlay": true,
            "isPaid": false,
            "hasBuy": true,
            "src": "https://fdfs.xmcdn.com/group31/M0B/6C/F8/wKgJX1mG0xDCFUfeAPYI4CGwxyI618.m4a",
            "albumIsSample": false,
            "sampleDuration": 0,
            "isBaiduMusic": false,
            "firstPlayStatus": true
        }
    }

代码编写

从上面的分析可以看出,喜马拉雅返回的数据全都是 JSON 格式的,而且都是静态的,这就给我们爬取带来了非常大的便利,只要找到了请求的URL,然后自己组装一下请求URL的参数,再用 selenium 或 requests 发起请求,从返回的 JSON 数据中提取出我们想要的信息,然后保存即可。

python爬取页面有两种常用的方式,一种是用 requests 直接发送get请求,requests 可以自定义请求头信息,来伪装成一个浏览器。这种方式有一个弊端:如果页面是动态的,页面上的信息都是通过 js脚本动态加载生成的,那我们用这种方式get到的页面就跟在浏览器上看到的不一样。

另一种是用 selenium 去操作浏览器,在后台自动发送请求,得到页面返回的数据。因为这种方式是用真正的浏览器去发送的请求,比如用 selenium 操控 Chrome 浏览器去请求目标页面,Chrome 浏览器会自动执行页面中的js脚本,动态加载页面,最终返回给我们的页面就跟我们在浏览器上看到的一样。

通过 selenium 请求页面有另外一个好处就是,因为我们操作一个真实的浏览器去请求的页面,所以就更不容易被网站的 反爬虫机制 发现。所以我们这次的爬虫里面采用的就是这种方式。

封装WebSpider爬虫库

selenium的初始化

    def __WebDriverInit(self, timeout=20):
        """
        Lazily create the selenium driver selected by ``self.__WebBrowser``.
        Does nothing when a driver already exists. Chrome and Firefox are
        started headless; Edge is started with its default options. Any other
        browser name is reported through ``self.__Debug`` and leaves the
        driver unset.
        :Args:
         - timeout: implicit wait, in seconds, applied to the new driver
        """
        if self.__WebDriver is not None:
            return
        browser = self.__WebBrowser
        # With the driver executable on PATH no explicit path is needed;
        # otherwise its absolute path would have to be supplied.
        if browser == "Chrome":
            coptions = webdriver.ChromeOptions()
            # Pass the command-line switch instead of assigning
            # ``options.headless``: that setter was deprecated and later
            # removed in Selenium 4.x, where the assignment would just create
            # an unused attribute and the browser would open a visible window.
            coptions.add_argument('--headless')
            driver = webdriver.Chrome(options=coptions)
        elif browser == "Firefox":
            foptions = webdriver.FirefoxOptions()
            foptions.add_argument('-headless')
            driver = webdriver.Firefox(options=foptions)
        elif browser == "Edge":
            driver = webdriver.Edge()
        else:
            self.__Debug("Invalid Browser Type [{0}]".format(browser))
            return
        # Store the driver before configuring it so it stays reachable (and
        # can be shut down) even if setting the wait fails.
        self.__WebDriver = driver
        self.__WebDriver.implicitly_wait(timeout)
    def __WebDriverExit(self):
        """
        Shut down the selenium driver, if one was created.
        The driver attribute is reset to None afterwards so that a later
        initialisation can start a fresh browser.
        """
        if self.__WebDriver is None:
            return
        try:
            # quit() ends the whole WebDriver session (all windows plus the
            # driver process); close() would only close the current window
            # and leave the driver process running.
            self.__WebDriver.quit()
        finally:
            self.__WebDriver = None

使用selenium下载页面

    def __WebDriverGet(self, url):
        """
        Load a page in the selenium driver and return its source.
        :Args:
         - url: html page url
        :Returns:
            The driver's ``page_source`` after loading ``url``, or None when
            ``url`` is empty or loading the page raises an error.
        """
        if not url:
            self.__Debug('Please use a valid url!')
            return None
        try:
            self.__WebDriver.get(url)
            return self.__WebDriver.page_source
        except Exception:
            # Best-effort fetch: any driver failure is logged and reported as
            # None. Catching Exception (rather than a bare except combined
            # with a return inside finally) lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently swallowed.
            self.__Debug("get webpage {0} error".format(url))
            return None

使用requests下载页面

    def __RequestsGet(self, url):
        """
        Download a page with requests and return its text.
        A User-Agent picked at random from ``WebSpider.WebUAList`` is written
        into ``self.WebHeader`` before the request is sent.
        :Args:
         - url: html page url
        :Returns:
            The decoded response body, or None when ``url`` is empty or the
            request fails with a requests exception.
        """
        if not url:
            self.__Debug('Please use a valid url!')
            return None
        phtml = None
        page = None
        try:
            # Pick a random User-Agent for this request.
            self.WebHeader["User-Agent"] = random.choice(
                WebSpider.WebUAList)
            page = requests.get(url=url, headers=self.WebHeader,
                                timeout=self.__TimeOut)
            self.__Debug("page.encoding = [{0}]".format(page.encoding))
            # requests falls back to ISO-8859-1 for text responses that do
            # not declare a charset; decode those as UTF-8 instead so that
            # Chinese text is not garbled.
            if page.encoding == "ISO-8859-1":
                page.encoding = "utf-8"
            phtml = page.text
            self.__Debug("requests success")
        except requests.exceptions.RequestException as e:
            self.__Debug("requests error:[{0}]".format(e))
            phtml = None
        finally:
            # Compare against None explicitly: a Response is falsy for
            # 4xx/5xx statuses but still has to be closed.
            if page is not None:
                page.close()
        # Returning after (not inside) the finally block lets unexpected
        # exceptions propagate instead of being silently discarded.
        return phtml

请求目标页面

    def Gethtml(self, url):
        """
        Fetch an html page through selenium or requests.
        The selenium backend is used when the instance's browser flag equals
        True; otherwise the page is fetched with requests.
        :Args:
         - url: html page url
        :Returns:
            Whatever the selected backend returns for ``url``.
        :Usage:
            html = ws.Gethtml("https://www.ithome.com/")
        """
        # Choose the backend first, then make a single call through it.
        # The flag is compared with ``== True`` on purpose, so that truthy
        # values which are not equal to True still select requests.
        fetch = (self.__WebDriverGet if self.__UseBrowser == True
                 else self.__RequestsGet)
        return fetch(url)

使用requests下载文件

    def GetFile(self, furl, fpath='auto', fname='auto'):
        """
        Download a file using requests and save it to disk.
        :Args:
         - furl: file url
         - fpath: directory to save into. The value is passed through
            ``self.__AutoPath`` (called without an argument when it is
            'auto'), and the resulting directory is created if missing.
         - fname: file save name. If it is 'auto', the last segment of the
            url path is used (query string and fragment are ignored).
        :Returns:
            None. Nothing is downloaded when the url is empty, no file name
            can be derived from it, or the target file already exists.
            Request failures, including HTTP error statuses, are logged
            through ``self.__Debug`` and no file is written.
        :Usage:
            ws.GetFile('https://www.xxx.com/aaa.jpg', 'D:/Jpgs')
        :Note:
            The original documentation said a setting has to be configured
            before downloading, but its name is missing from the text
            -- TODO confirm which one.
        """
        if not furl:
            self.__Debug('Please use a valid file url!')
            return
        if fpath == 'auto':
            fpath = self.__AutoPath()
        else:
            fpath = self.__AutoPath(fpath)
        if fname == 'auto':
            # Use only the path component of the url, so "a.jpg?x=1#y" is
            # saved as "a.jpg" rather than under a name containing the query.
            from urllib.parse import urlsplit
            fname = os.path.basename(urlsplit(furl).path)
        if not fname:
            # e.g. a url ending in "/": there is nothing to name the file by.
            self.__Debug('Please use a valid file url!')
            return
        fresp = None
        try:
            # Create the target folder (recursively) when it does not exist.
            os.makedirs(fpath, exist_ok=True)
            ffull_path = os.path.join(fpath, fname)
            if os.path.isfile(ffull_path):
                self.__Debug('file <{0}> is exists'.format(ffull_path))
                return
            # Pick a random User-Agent for this request.
            self.WebHeader["User-Agent"] = random.choice(WebSpider.WebUAList)
            fresp = requests.get(
                furl, headers=self.WebHeader, timeout=self.__TimeOut)
            # Treat 4xx/5xx responses as failures instead of saving the
            # server's error page under the requested file name.
            fresp.raise_for_status()
            fdata = fresp.content
            # 'wb': the payload is written as raw bytes.
            with open(ffull_path, 'wb') as ff:
                ff.write(fdata)
            self.__Debug("save file to :{0}".format(ffull_path))
        except requests.exceptions.RequestException as e:
            self.__Debug(
                "download file {0} error, exceptions[{1}]".format(fname, e))
        finally:
            # Compare against None explicitly: a Response is falsy for
            # error statuses but still has to be closed.
            if fresp is not None:
                fresp.close()

你可能感兴趣的:(大数据,python,开发语言)