使用Python爬取并下载喜马拉雅专辑
pip install requests
pip install lxml
pip install selenium
用 Chrome 浏览器打开专辑页面,按 F12 打开调试窗口,切换到 Network 选项卡,找到包含 albumId=9742789 关键字的请求:
https://www.ximalaya.com/revision/album?albumId=9742789
Host: www.ximalaya.com
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-CN,en-US;q=0.5
Accept-Encoding: gzip, deflate, br
DNT: 1
Connection: keep-alive
Cookie: _xmLog=xm_k3y398c7i05jvx; device_id=xm_1575874949738_k3y399bezjz1wg; 1&remember_me=y; 1&_token=33137912&FF000ABB23B848DBA185E16B631D2DF5NdVE12943B8C0EBB477803F032FA8A489EDC16D751007A6314F14EFABE75DD5DFD1; s&e=e7046fac04d436894cbdd17a6a8743e8; s&a=%1FP_WT%09%1A%04J_%0BWYYCZN%0C%09%07%0AR@%02@%0AWXZT%1E[VPCTBVLOKRYSBW@X; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A
Upgrade-Insecure-Requests: 1
{
"ret": 200,
"msg": "成功",
"data": {
"isSelfAlbum": false,
"currentUid": 33137912,
"albumId": 9742789,
"mainInfo": {
"albumStatus": 1,
"showApplyFinishBtn": false,
"showEditBtn": false,
"showTrackManagerBtn": false,
"showInformBtn": true,
"cover": "//imagev2.xmcdn.com/group69/M01/51/D8/wKgMeV3L1dizfiHjAAC_6wnqGFY23.jpeg",
"albumTitle": "郭德纲高清相声集【精选】",
"crumbs": {
"categoryId": 12,
"categoryPinyin": "xiangsheng",
"categoryTitle": "相声评书",
"subcategoryId": 20,
"subcategoryName": "相声",
"subcategoryDisplayName": "相声",
"subcategoryCode": "xiangsheng"
},
"updateDate": "2020-04-01",
"createDate": "2017-08-02",
"playCount": 368527505,
"isPaid": false,
"isFinished": 1,
"metas": [
{
"metaValueId": 1684,
"metaDataId": 108,
"categoryId": 12,
"isSubCategory": false,
"categoryName": "xiangsheng",
"categoryPinyin": "xiangsheng",
"metaValueCode": "guodegang",
"metaDisplayName": "郭德纲",
"link": "/xiangsheng/xiangsheng/mr108t1684/"
},
{
"metaValueId": 18,
"metaDataId": 47,
"categoryId": 12,
"isSubCategory": false,
"categoryName": "xiangsheng",
"categoryPinyin": "xiangsheng",
"metaValueCode": "duikou",
"metaDisplayName": "对口",
"link": "/xiangsheng/xiangsheng/mr47t18/"
}
],
"isSubscribe": true,
"richIntro": "",
"shortIntro": "",
"detailRichIntro": "",
"isPublic": true,
"hasBuy": false,
"vipType": 0,
"canCopyText": true,
"subscribeCount": 1200469,
"sellingPoint": {},
"personalDescription": "",
"bigshotRecommend": "",
"outline": "郭德纲高清相声集,喜马独家音频放送\n",
"customTitle": "",
"produceTeam": "",
"recommendReason": "郭德纲献给老司机们的福利,不可错过"
},
"anchorInfo": {
"anchorId": 1000202,
"anchorCover": "//imagev2.xmcdn.com/group1/M00/0B/3D/wKgDrlESHqyTqakZAADewk1yMt8360.jpg",
"showFollowBtn": false,
"anchorName": "德云社郭德纲相声VIP",
"anchorGrade": 16,
"anchorGradeType": 2,
"anchorAlbumsCount": 95,
"anchorTracksCount": 4772,
"anchorFollowsCount": 9,
"anchorFansCount": 13625314,
"personalIntroduction": "郭德纲领衔德云社唯一授权音频平台。郭德纲相声,包括其著名的君臣斗、马寿出世、宋金刚押宝、解学士等。",
"showAnchorAlbumModel": true,
"anchorAlbumList": [
{
"albumId": 35825497,
"albumTitle": "德云社20周年闭幕庆典 2017",
"cover": "//imagev2.xmcdn.com/group78/M06/13/37/wKgO4F57GOHh4sHOAAuseqOKTow064.jpg",
"playCount": 283972,
"tracksCount": 6,
"anchorId": 1000202,
"anchorName": "德云社郭德纲相声VIP",
"url": "/xiangsheng/35825497/"
},
{
"albumId": 35200768,
"albumTitle": "郭德纲相声专场 熊本站2017",
"cover": "//imagev2.xmcdn.com/group74/M09/30/EF/wKgO0l5rKwuwXxIzAAeRgVnaH9E885.jpg",
"playCount": 1108877,
"tracksCount": 6,
"anchorId": 1000202,
"anchorName": "德云社郭德纲相声VIP",
"url": "/xiangsheng/35200768/"
}
],
"hasMoreBtn": true,
"logoType": 4
},
"tracksInfo": {
"trackTotalCount": 129,
"sort": 0,
"tracks": [
{
"index": 1,
"trackId": 46430558,
"isPaid": false,
"tag": 0,
"title": "郭德纲《学徒艰辛》老郭打磕巴了,罕见",
"playCount": 18845946,
"showLikeBtn": true,
"isLike": false,
"showShareBtn": true,
"showCommentBtn": true,
"showForwardBtn": true,
"createDateFormat": "2年前",
"url": "/xiangsheng/9742789/46430558",
"duration": 1993,
"isVideo": false,
"videoCover": null,
"isVipFirst": false,
"breakSecond": 0,
"length": 1993
},
{
"index": 30,
"trackId": 46219386,
"isPaid": false,
"tag": 0,
"title": "郭德纲《于谦相亲被强吻》皮条胡同拉家",
"playCount": 2859168,
"showLikeBtn": true,
"isLike": false,
"showShareBtn": true,
"showCommentBtn": true,
"showForwardBtn": true,
"createDateFormat": "2年前",
"url": "/xiangsheng/9742789/46219386",
"duration": 1480,
"isVideo": false,
"videoCover": null,
"isVipFirst": false,
"breakSecond": 0,
"length": 1480
}
],
"pageNum": 1,
"pageSize": 30,
"lastPlayTrackId": 46219230
},
"subSiteAlbumUrl": "",
"recommendKw": {
"sourceKw": "郭德纲高清相声集【精选】",
"recommendText": [
"听云鹏的相声",
"单田芳水浒",
"郭德纲相声超清",
"郭德纲于谦高清相声"
]
},
"draft": null,
"isTemporaryVIP": false
}
}
接着在 Network 选项卡中找到包含 getTracksList 关键字的请求 URL:
https://www.ximalaya.com/revision/album/v1/getTracksList?albumId=9742789&pageNum=1&pageSize=1000
Host: www.ximalaya.com
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-CN,en-US;q=0.5
Accept-Encoding: gzip, deflate, br
DNT: 1
Connection: keep-alive
Cookie: _xmLog=xm_k3y398c7i05jvx; device_id=xm_1575874949738_k3y399bezjz1wg; 1&remember_me=y; 1&_token=33137912&FF000ABB23B848DBA185E16B631D2DF5NdVE12943B8C0EBB477803F032FA8A489EDC16D751007A6314F14EFABE75DD5DFD1; s&e=e7046fac04d436894cbdd17a6a8743e8; s&a=%1FP_WT%09%1A%04J_%0BWYYCZN%0C%09%07%0AR@%02@%0AWXZT%1E[VZ_OXOB[VZVU^OBWN; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A
Upgrade-Insecure-Requests: 1
Cache-Control: max-age=0, no-cache
TE: Trailers
Pragma: no-cache
{
"ret": 200,
"data": {
"currentUid": 33137912,
"albumId": 9742789,
"trackTotalCount": 129,
"sort": 0,
"tracks": [
{
"index": 1,
"trackId": 46430558,
"isPaid": false,
"tag": 0,
"title": "郭德纲《学徒艰辛》老郭打磕巴了,罕见",
"playCount": 18826557,
"showLikeBtn": true,
"isLike": false,
"showShareBtn": true,
"showCommentBtn": true,
"showForwardBtn": true,
"createDateFormat": "2年前",
"url": "/xiangsheng/9742789/46430558",
"duration": 1993,
"isVideo": false,
"videoCover": null,
"isVipFirst": false,
"breakSecond": 742,
"length": 1993
},
{
"index": 2,
"trackId": 46430560,
"isPaid": false,
"tag": 0,
"title": "郭德纲《三米高的马桶》上厕所靠重力",
"playCount": 18212042,
"showLikeBtn": true,
"isLike": false,
"showShareBtn": true,
"showCommentBtn": true,
"showForwardBtn": true,
"createDateFormat": "2年前",
"url": "/xiangsheng/9742789/46430560",
"duration": 1898,
"isVideo": false,
"videoCover": null,
"isVipFirst": false,
"breakSecond": 0,
"length": 1898
},
{
"index": 3,
"trackId": 46430562,
"isPaid": false,
"tag": 0,
"title": "郭德纲 《京中名妓》你先应付着",
"playCount": 13770315,
"showLikeBtn": true,
"isLike": false,
"showShareBtn": true,
"showCommentBtn": true,
"showForwardBtn": true,
"createDateFormat": "2年前",
"url": "/xiangsheng/9742789/46430562",
"duration": 1683,
"isVideo": false,
"videoCover": null,
"isVipFirst": false,
"breakSecond": 0,
"length": 1683
}
],
"pageNum": 1,
"pageSize": 3,
"superior": [],
"lastPlayTrackId": 46430558
}
}
最后找到包含 id=46430558 关键字(即播放的声音 ID,对应上面 tracks 中的 trackId)的请求:
https://www.ximalaya.com/revision/play/v1/audio?id=46430558&ptype=1
Host: www.ximalaya.com
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: zh-CN,en-US;q=0.5
Accept-Encoding: gzip, deflate, br
DNT: 1
Connection: keep-alive
Cookie: _xmLog=xm_k3y398c7i05jvx; device_id=xm_1575874949738_k3y399bezjz1wg; 1&remember_me=y; 1&_token=33137912&FF000ABB23B848DBA185E16B631D2DF5NdVE12943B8C0EBB477803F032FA8A489EDC16D751007A6314F14EFABE75DD5DFD1; s&e=e7046fac04d436894cbdd17a6a8743e8; s&a=%1FP_WT%09%1A%04J_%0BWYYCZN%0C%09%07%0AR@%02@%0AWXZT%1E[VZCSBRMOKRYSBZOW; x_xmly_traffic=utm_source%253A%2526utm_medium%253A%2526utm_campaign%253A%2526utm_content%253A%2526utm_term%253A%2526utm_from%253A
Upgrade-Insecure-Requests: 1
Cache-Control: max-age=0
TE: Trailers
{
"ret": 200,
"data": {
"trackId": 46430558,
"canPlay": true,
"isPaid": false,
"hasBuy": true,
"src": "https://fdfs.xmcdn.com/group31/M0B/6C/F8/wKgJX1mG0xDCFUfeAPYI4CGwxyI618.m4a",
"albumIsSample": false,
"sampleDuration": 0,
"isBaiduMusic": false,
"firstPlayStatus": true
}
}
从上面的分析可以看出,喜马拉雅返回的数据全都是 JSON 格式的,而且都是静态的,这给我们的爬取带来了非常大的便利:只要找到请求的 URL,自己组装好 URL 的参数,然后用 selenium 或 requests 发起请求,再从返回的 JSON 数据中提取出我们想要的信息并保存即可。
Python 爬取页面有两种常用的方式。一种是用 requests 直接发送 GET 请求,requests 可以自定义请求头信息,把自己伪装成一个浏览器。这种方式有一个弊端:如果页面是动态的,页面上的信息都是通过 js 脚本动态加载生成的,那么我们用这种方式 get 到的页面就跟在浏览器上看到的不一样。
另一种是用 selenium 去操作浏览器,在后台自动发送请求,得到页面返回的数据。因为这种方式是用真正的浏览器去发送的请求,比如用 selenium 操控 Chrome 浏览器去请求目标页面,Chrome 浏览器会自动执行页面中的js脚本,动态加载页面,最终返回给我们的页面就跟我们在浏览器上看到的一样。
通过 selenium 请求页面有另外一个好处就是,因为我们操作一个真实的浏览器去请求的页面,所以就更不容易被网站的 反爬虫机制 发现。所以我们这次的爬虫里面采用的就是这种方式。
def __WebDriverInit(self, timeout=20):
    """
    Lazily create the selenium WebDriver selected by ``self.__WebBrowser``.

    Does nothing when ``self.__WebDriver`` is already set. "Chrome" and
    "Firefox" are started with their headless command-line switch; "Edge"
    is started with default options. Any other browser name is reported
    through ``self.__Debug`` and no driver is created.

    :Args:
     - timeout: implicit wait (passed to ``implicitly_wait``) applied to
       the newly created driver.
    """
    if self.__WebDriver is not None:
        return

    browser = self.__WebBrowser
    if browser == "Chrome":
        coptions = webdriver.ChromeOptions()
        # Use the browser's own switch instead of the ``headless`` property:
        # the property was deprecated and later removed from selenium's
        # Options classes, while the argument works on old and new releases.
        coptions.add_argument('--headless')
        # With the driver executable on PATH no explicit driver path is
        # needed; otherwise its absolute path has to be supplied.
        self.__WebDriver = webdriver.Chrome(options=coptions)
    elif browser == "Firefox":
        foptions = webdriver.FirefoxOptions()
        # Firefox spells the switch with a single dash.
        foptions.add_argument('-headless')
        self.__WebDriver = webdriver.Firefox(options=foptions)
    elif browser == "Edge":
        self.__WebDriver = webdriver.Edge()
    else:
        self.__Debug("Invalid Browser Type [{0}]".format(browser))
        return

    # Shared by every supported browser, so it is applied once here.
    self.__WebDriver.implicitly_wait(timeout)
def __WebDriverExit(self):
    """
    Shut down the selenium WebDriver, if one was created.

    ``quit()`` is used rather than ``close()``: ``close()`` only closes the
    current browser window, while ``quit()`` ends the whole driver session.
    The stored reference is cleared afterwards so that the
    ``self.__WebDriver`` "is it created yet" check starts from a clean
    state the next time a driver is needed.
    """
    driver = self.__WebDriver
    if driver is None:
        return
    try:
        driver.quit()
    finally:
        # Drop the reference even if quit() fails; the session is unusable
        # either way.
        self.__WebDriver = None
def __WebDriverGet(self, url):
    """
    Navigate the selenium driver to *url* and return its page source.

    :Args:
     - url: address of the page to load.

    :Returns:
     ``self.__WebDriver.page_source`` after ``get(url)``, or ``None`` when
     *url* is ``None``/empty or an error occurs while loading.
    """
    if url is None or len(url) == 0:
        self.__Debug('Please use a valid url!')
        return None
    try:
        self.__WebDriver.get(url)
        return self.__WebDriver.page_source
    except Exception as e:
        # Only ordinary errors are turned into a ``None`` result;
        # KeyboardInterrupt / SystemExit are allowed to propagate so the
        # crawler can still be stopped while a page is loading.
        self.__Debug("get webpage {0} error: [{1}]".format(url, e))
        return None
def __RequestsGet(self, url):
    """
    Fetch *url* with ``requests`` and return the response body as text.

    A User-Agent picked at random from ``WebSpider.WebUAList`` is stored in
    ``self.WebHeader`` before the request is sent.

    :Args:
     - url: address of the page to fetch.

    :Returns:
     The response text, or ``None`` when *url* is ``None``/empty or the
     request raises a ``requests`` exception.
    """
    if url is None or len(url) == 0:
        self.__Debug('Please use a valid url!')
        return None

    response = None
    text = None
    try:
        # Rotate the User-Agent on every request.
        user_agent = random.choice(WebSpider.WebUAList)
        self.WebHeader["User-Agent"] = user_agent
        response = requests.get(
            url=url,
            headers=self.WebHeader,
            timeout=self.__TimeOut,
        )
        # Log the encoding requests settled on.
        self.__Debug("page.encoding = [{0}]".format(response.encoding))
        if response.encoding == "ISO-8859-1":
            # Decode as utf-8 instead, to avoid garbled Chinese text.
            response.encoding = "utf-8"
        text = response.text
        self.__Debug("requests success")
    except requests.exceptions.RequestException as e:
        self.__Debug("requests error:[{0}]".format(e))
        text = None
    finally:
        # Always release the connection, on success and on failure.
        if response is not None:
            response.close()
    return text
def Gethtml(self, url):
    """
    Fetch an html page through the configured backend.

    The page is loaded with the selenium-driven browser when
    ``self.__UseBrowser`` equals ``True``; otherwise it is fetched with
    requests.

    :Args:
     - url: html page url

    :Returns:
     Whatever the selected backend returns for *url*.

    :Usage:
        html = ws.Gethtml("https://www.ithome.com/")
    """
    # The explicit comparison is kept on purpose: only a flag equal to True
    # selects the browser backend.
    via_browser = self.__UseBrowser == True
    fetch = self.__WebDriverGet if via_browser else self.__RequestsGet
    return fetch(url)
def GetFile(self, furl, fpath='auto', fname='auto'):
    """
    Download a file using requests and save it to disk.

    Nothing is downloaded when the target file already exists. Request
    failures, including HTTP error statuses, are reported through
    ``self.__Debug`` and no file is written.

    :Args:
     - furl: file url
     - fpath: directory to save into. With 'auto' (the default) the
       directory is chosen by ``self.__AutoPath()``; any other value is
       passed through ``self.__AutoPath(fpath)``. The original notes say
       the automatic choice is usually a folder on the desktop named after
       the current date - that is decided by ``__AutoPath`` and should be
       confirmed there.
     - fname: file name to save as. With 'auto' (the default) the name is
       taken from the last path segment of *furl*, ignoring any query
       string or fragment.

    :Usage:
        ws.GetFile('https://www.xxx.com/aaa.jpg', 'D:/Jpgs')

    NOTE(review): the original docstring said something must be set before
    downloading, but the name was missing from the text - confirm.
    """
    if furl is None or len(furl) == 0:
        self.__Debug('Please use a valid file url!')
        return
    if fpath == 'auto':
        fpath = self.__AutoPath()
    else:
        fpath = self.__AutoPath(fpath)
    if fname == 'auto':
        # Derive the name from the URL *path* only, so that
        # "a.jpg?token=1#x" is saved as "a.jpg" rather than under a name
        # containing "?" (which is not even a legal file name on Windows).
        from urllib.parse import urlsplit
        try:
            url_path = urlsplit(furl).path
        except ValueError:
            # Malformed URL: keep the plain basename of the whole string.
            url_path = furl
        fname = os.path.basename(url_path) or os.path.basename(furl)
    fresp = None
    try:
        # Create the target directory (recursively) when it is missing.
        os.makedirs(fpath, exist_ok=True)
        ffull_path = os.path.join(fpath, fname)
        if os.path.isfile(ffull_path):
            # Already downloaded - skip the request entirely.
            self.__Debug('file <{0}> is exists'.format(ffull_path))
            return
        # Rotate the User-Agent on every request.
        self.WebHeader["User-Agent"] = random.choice(WebSpider.WebUAList)
        fresp = requests.get(
            furl, headers=self.WebHeader, timeout=self.__TimeOut)
        # Do not save a 404/500 error page as if it were the file. HTTPError
        # is a RequestException, so it is reported by the handler below.
        fresp.raise_for_status()
        with open(ffull_path, 'wb') as ff:
            ff.write(fresp.content)
        self.__Debug("save file to :{0}".format(ffull_path))
    except requests.exceptions.RequestException as e:
        self.__Debug(
            "download file {0} error, exceptions[{1}]".format(fname, e))
    finally:
        if fresp is not None:
            fresp.close()