This script crawls, for each style category, all the songs from every playlist in that category. Playlist categories whose names contain special characters are not URL-encoded here and are simply skipped; R&B/Soul, for example, has to be encoded as R%26B%2FSoul before the URL can be requested.
So a try/except around the request just logs the exception and lets the script keep going.
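If you did want to include those categories, a minimal sketch of encoding the name before building the URL could look like this (urllib.parse.quote is standard library; passing safe='' so that '/' is escaped as well is my assumption about what the page expects):

from urllib.parse import quote

cat = 'R&B/Soul'
encoded_cat = quote(cat, safe='')  # -> 'R%26B%2FSoul'
url = 'https://music.163.com/discover/playlist/?order=new&cat=' + encoded_cat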
I haven't been at this long and the functionality changed several times before settling on this version, so the code is a bit messy; I'll tidy it up when I have time.
Note one detail: the URL shown in the browser contains a # (https://music.163.com/#/discover/playlist/).
When requesting the URL from code, the # has to be removed, otherwise you won't get the complete page data.
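As a quick illustration (a sketch reusing the same requests/BeautifulSoup approach and the 'msk' playlist-link class from the script below; the page may also want the full headers the script sends):

import requests
from bs4 import BeautifulSoup

# request the URL without the '#' fragment; the '#' version only returns a page shell
resp = requests.get('https://music.163.com/discover/playlist/',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'lxml')
print(len(soup.find_all('a', {'class': 'msk'})))  # playlist links appear in the full page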
NetEase Cloud Music also exposes a few APIs, for example one that returns a playlist's data as JSON given its id, which is much easier to parse directly:
http://music.163.com/api/playlist/detail?id=2720965607
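A minimal sketch of calling that endpoint and reading the track list (the 'result', 'tracks', 'tags' and 'artists' fields are the ones the script below relies on; the endpoint may also expect the Referer header the script uses):

import requests

url = 'http://music.163.com/api/playlist/detail?id=2720965607'
data = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).json()
tracks = data['result']['tracks']
tags = data['result']['tags']
for track in tracks:
    print(track['id'], track['name'], track['artists'][0]['name'], tags)

The complete script: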
#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import random
import csv
from requests.adapters import HTTPAdapter
import traceback
import time
import datetime
# request headers
headers = {
    'Host': 'music.163.com',
    'Referer': 'http://music.163.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}
# proxy IP pool: too many requests from your own IP trips the anti-crawler and gets the IP banned
proxies = [{"http": "http://117.191.11.123:8080"},
           {"http": "http://118.144.149.123:8080"},
           {"http": "http://27.191.234.123:8080"},
           {"http": "http://58.249.55.123:8080"}]
proxy = random.choice(proxies)  # pick a random IP from the proxy pool
print("proxy:" + str(proxy))    # log the IP used for the request
count = 1    # song counter
listnum = 1  # playlist counter
# create the csv file used to store the data
nowTime = datetime.datetime.now().strftime('%Y%m%d%H%M%S')  # current time, used in the file name
out = open('D:\\music_csv_orderByTag' + str(nowTime) + '.csv', 'a', newline='', encoding='utf-8-sig')
csv_write = csv.writer(out, dialect='excel')
csv_write.writerow(["id", "song_id", "song_name", "singer_name", "tags", "playlist_id"])  # write the header row
# crawled data is written into the csv
urltag = "https://music.163.com/discover/playlist/?order=new"  # this page lists the playlist style categories (tags)
session = requests.session()
rtag = session.get(urltag, headers=headers, proxies=proxy)  # fetch the page
rtag.encoding = 'utf-8'  # make sure Chinese text is decoded correctly instead of being escaped
soup = BeautifulSoup(rtag.text, "lxml")
# get the list of category tags
resultTag = soup.find_all('a', {'class': 's-fc1'})  # every <a class="s-fc1"> carries one category name
for mu in resultTag:
    cat = mu['data-cat']
    if cat == '全部':
        print("cat is 全部 (all), skip")  # the script crawls per category, so the aggregate "all" category is skipped
    else:
        offset = 0       # offset used when building the paging URL
        offsetMax = 35   # each page holds 35 playlists; updated below once the last page's offset is known
        urloffset = 'https://music.163.com/discover/playlist/?order=new&cat=' + str(cat)
        print("-------urloffset----urloffset----------------")
        print(urloffset)
        roffset = session.get(urloffset, headers=headers, proxies=proxy)
        soupoffset = BeautifulSoup(roffset.content, "lxml")
        # page links (class "zpgi"); the last one carries the largest offset
        pages = soupoffset.find_all('a', {'class': 'zpgi'})
        print("-------urloffset----pages----------------")
        print(pages)
        if pages:
            page = pages[-1]
            if 'offset=' in page['href']:
                str1 = str(page['href']).split('offset=')
                print("-------urloffset----str1----------------")
                print(str1)
                offsetMax = int(str1[1])
                print(str1[1])
        while offset <= offsetMax:
            try:
                url = 'https://music.163.com/discover/playlist/?cat=' + str(cat) + '&order=new&limit=35&offset=' + str(offset)
                r = session.get(url, headers=headers, proxies=proxy)
                soup_page = BeautifulSoup(r.content, "lxml")
                result = soup_page.find_all('a', {'class': 'msk'})  # playlist links on this page
                offset += 35
                print(url)
                for res in result:
                    playlist_id = res['href'].split('id=')[-1]  # href looks like /playlist?id=xxxxxxxxxx
                    url2 = "http://music.163.com/api/playlist/detail?id=" + playlist_id
                    print(str(listnum) + ":" + url2)
                    listnum += 1
                    r5 = requests.session()
                    # retry once on connection failure
                    r5.mount('http://', HTTPAdapter(max_retries=1))
                    r5.mount('https://', HTTPAdapter(max_retries=1))
                    proxy1 = random.choice(proxies)
                    print("proxy1:" + str(proxy1))
                    try:
                        r2 = r5.get(url2, headers=headers, proxies=proxy1, timeout=5)  # 5 s timeout
                        print("------------result json------------")
                        text = json.loads(r2.content.decode('utf-8'))
                        tracks = text['result']['tracks']
                        tags = text['result']['tags']
                        for track in tracks:
                            singer_name = track['artists'][0]['name']
                            csv_write.writerow([count, str(track['id']), str(track['name']), str(singer_name), str(tags), playlist_id])
                            count += 1
                    except Exception as err:
                        print("exception, skipping this playlist-----" + url2)
                        print(err)
                    sleeptime = random.uniform(0.05, 0.3)  # sleep a random interval before the next request
                    print("sleeping, sleeptime:" + str(sleeptime))
                    time.sleep(sleeptime)
            except Exception as err:
                print("exception, skipping this tag-----" + str(cat))
                print(err)
print("write over")
out.close()
print("finish")
This script is for learning purposes only; don't use it for anything else. You do so at your own risk.