完整版:爬取网易云音乐的排行榜单、推荐榜单和热评。
直接上代码,代码写得很清楚。
为了防止被封,我们先写一个随机生成 User-Agent 的函数:
"""随机获取请求头"""
def get_ua():
first_num = random.randint(55, 62)
third_num = random.randint(0, 3200)
fourth_num = random.randint(0, 140)
os_type = [
'(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)',
'(Macintosh; Intel Mac OS X 10_12_6)'
]
chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
ua = ' '.join(['Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36','(KHTML, like Gecko)', chrome_version, 'Safari/537.36']
)
return ua
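我这里用了代理池(代理 IP 从 ./wyy/ip.txt 中读取)。
下面直接上完整的代码(运行前需要先导入 random、csv、logging、requests、lxml 的 etree 以及 bs4 的 BeautifulSoup):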
class WangYiyun:
    """Scraper for music.163.com charts, playlists, tracks and hot comments.

    Results are appended to CSV files under ``./wyy/``; proxies are read from
    ``./wyy/ip.txt`` (one ``host:port`` per line).
    """

    def __init__(self, ua):
        """Store the request settings shared by every call.

        Args:
            ua: User-Agent string sent with every request.
        """
        self.User_Agent = ua
        self.Referer = 'http://music.163.com/'
        self.Host = 'music.163.com'
        self.Accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        self.s = requests.session()
        # NOTE(review): these look like cookies captured from a logged-in
        # browser session (MUSIC_U, __csrf, ...). They are hard-coded and will
        # presumably expire; consider loading them from configuration.
        self.cookies = {
            '_ntes_nuid': '132b56eb04cd7ae3dc141a67f1e00b92',
            '__gads': 'ID=edd475e49c259564:T=1522070635:S=ALNI_MaL6zNCOchTwNrS8aso4KJa96dHsw',
            'vjuids': '-202d8c038.162627acbc3.0.0c44fd974cd69',
            '_iuqxldmzr_': '32',
            '__utmz': '94650624.1537866836.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
            'WM_TID': 'S4BoTLNXpXBu1XuKXWJGvK2VNSl8A1R2',
            'mail_psc_fingerprint': '4a2ba8794d7a7cb253130b93cf150f04',
            'usertrack': 'ezq0o1vGzRQe23jDAzXKAg==',
            '_ga': 'GA1.2.1206578097.1539755287',
            'UM_distinctid': '1669aca0685ab-0d8c2646d156b1-5e4b2519-100200-1669aca0687451',
            'vjlast': '1522070637.1544581017.11',
            '__utma': '94650624.1971910903.1537866836.1543304403.1547020521.5',
            '_ntes_nnid': '132b56eb04cd7ae3dc141a67f1e00b92,1551923908660',
            'vinfo_n_f_l_n3': 'ed239199997e7529.1.17.1522070637531.1551150620657.1552439807970',
            '__f_': '1552527380802',
            'JSESSIONID-WYYY': 'VzxRm974Y8WbPeeFTyi%2BR9C769wQ1DF7hvjo2HOqEaYkajfA94%5CNme%2BBHQKfsmGq%2BTZYMus4Xgb%2B76sMsgYwudT%5CI1Hh6%2BAK2vx%2Fi4gsdSadfFT9%2F7ant5ST507rZVCaUpAjobt0UhvhngPgZ%2FJdxt%5CvCOXYGrdf9ixCsf%2BHgSeeTbxT%3A1553739054373',
            'WM_NI': 'JiBtIXn2eLKlhVwpn8xGT5hRgUqgTMx2%2FEfUXF0TUNwuzRjBZ8lysYoCPgJOgFFBrUJKAncSFlRqyx7Br1S6LwQT2gA0MeujdELLRIjJe42aHjtsBTQo3MV3DTHR260oR3M%3D',
            'WM_NIKE': '9ca17ae2e6ffcda170e2e6eea6c8499aaead95d673a1928ea7c54b829e8aafb86db28f8c92ec6983abadb5ca2af0fea7c3b92a8ab78e95cc54aeeeabadca68ac8ab991ed34bc999cd6eb5aa58bbf88d747bceee1a6e55f9c9a00b5d846b5abb6b3e94a9ae8a49af253ad9da088c945a59a8aa3f15f8a958f8fe25986b49e92fc74f59ebed2d23e90a99fb5b16d83f08386f862a89b84add143aa96abd5b16b8cb6bc94c16093b1fdaed539888c88b3b13ef4b19ed4d037e2a3',
            'MUSIC_U': 'd6bf02383c0a0234b8da0b7d717df069eb5f65aef8526abe35411b24a595d461fd2f81d778ffe31c3fa6925a46b62d12586a15998e43a379b6d53907a0dbb4064eb75c1ad5e4f24fde39c620ce8469a8',
            '__remember_me': 'true',
            '__csrf': '99b47f93e046944078bfca72a1619ef7',
        }
        self.params = (
            ('csrf_token', '99b47f93e046944078bfca72a1619ef7'),
        )
        # Form payload replayed verbatim on every comment request
        # (presumably a pre-encrypted weapi body captured from a browser -
        # TODO confirm).
        self.data = {
            'params': '/2b9rp81m0RdIyYib3JsCnFPg6yT77pPGhY03ke5khMgbbbTrCEJho/4YG9xXvuh7kM4/+HG/rVH9PlIZ3SwCdFReymdZ3tB8o0T+sxRHREnY6tv3WJDb7HuRMEiYC+a15k61alvkIIhyGuVDBLQNVuhfBrJ2Ee0r6eqCGmbhBtn6R12QW5gLlLC+gQjWcoC4jZdQ6ERN5w4QXpgj0eNKX7cu/ooU/mQyagF23Np6Z4=',
            'encSecKey': '5ff6ba868d77aaabdd2f4e27af30c00bc085757e830c96f5624cd4dc1212af0e48d36e437d4930c08fd9f80aece6772a1b09d4c8c1ac7a903c02d03a08c078f399b90a89dbed6a45980c970956d76df60b3234c9e90d03fd2ec17037afdcfe131e48c8f444a129e203635a51dfdc91047a584ecffb373d32ee633cb74efa42d3'
        }
        self.con_url = 'https://music.163.com/'

    @staticmethod
    def _id_from_href(href):
        """Return the text between the first and second '=' of *href*.

        Returns None when *href* is missing or contains no '=', so callers can
        skip malformed links instead of raising.
        """
        if not href:
            return None
        parts = href.split("=")
        return parts[1] if len(parts) > 1 else None

    def get_header(self):
        """Build the HTTP request headers from the instance settings.

        Returns:
            dict: ``Referer``, ``User-Agent``, ``Host`` and ``Accept`` headers.
        """
        headers = {
            'Referer': self.Referer,
            'User-Agent': self.User_Agent,
            'Host': self.Host,
            'Accept': self.Accept,
        }
        return headers

    def get_proc(self):
        """Pick one random proxy from ``./wyy/ip.txt`` and probe it.

        Returns:
            dict | None: a ``requests``-style proxies mapping when a GET to
            https://www.baidu.com through the proxy completes within 5
            seconds, otherwise None.

        Raises:
            OSError: if the proxy file cannot be opened.
            IndexError: if the proxy file contains no addresses.
        """
        with open('./wyy/ip.txt', 'r') as proxy_file:
            # Blank lines are skipped; they would otherwise become the
            # unusable proxy "http://".
            proxie_list = [
                {"https": "http://{}".format(line.strip())}
                for line in proxy_file
                if line.strip()
            ]
        proxy = random.choice(proxie_list)
        try:
            requests.get(url='https://www.baidu.com', proxies=proxy, timeout=5)
        except (requests.RequestException, ValueError):
            # Dead, slow or malformed proxy: report failure to the caller.
            return None
        return proxy

    def call_proc(self):
        """Return a proxy that passed the probe in :meth:`get_proc`.

        Keeps drawing random proxies until one works, so this blocks
        indefinitely when no proxy in the file is usable.
        """
        while True:
            proxy = self.get_proc()
            if proxy:
                return proxy

    def wybd(self):
        """Scrape the chart (toplist) index and append it to ``./wyy/歌单1.csv``.

        Returns:
            list[dict]: one ``{'r_name': ..., 'r_id': ...}`` entry per chart.
        """
        proxy = self.call_proc()
        headers = self.get_header()
        # The browser URL is https://music.163.com/#/discover/toplist; the
        # '#' has to be dropped to fetch the real page.
        play_url = self.con_url + 'discover/toplist'
        response = self.s.get(play_url, headers=headers, proxies=proxy).text
        html = etree.HTML(response)
        # Guard against the parser yielding no document for an empty body.
        result = html.xpath('//p[@class="name"]/a') if html is not None else []
        ranking_list = []
        with open('./wyy/歌单1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')
            for link in result:
                chart_id = self._id_from_href(link.get('href'))
                if chart_id is None:
                    continue
                name = link.text
                ranking_list.append({'r_name': name, 'r_id': chart_id})
                csv_write.writerow([name, chart_id])
        return ranking_list

    def wygd(self):
        """Scrape the playlist index and append it to ``./wyy/歌单1.csv``.

        Returns:
            list[dict]: one ``{'r_name': title, 'r_id': playlist_id}`` entry
            per playlist.
        """
        proxy = self.call_proc()
        headers = self.get_header()
        # The browser URL is https://music.163.com/#/discover/playlist; the
        # '#' has to be dropped to fetch the real page.
        play_url = self.con_url + 'discover/playlist'
        response = self.s.get(play_url, headers=headers, proxies=proxy).text
        soup = BeautifulSoup(response, 'html.parser')
        links = soup.select('.dec a')
        items = soup.select('#m-pl-container li')
        gd_list = []
        with open('./wyy/歌单1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')
            # zip() keeps the original "one row per list item" bound without
            # raising IndexError when the two selections differ in length.
            for link, _item in zip(links, items):
                playlist_id = self._id_from_href(link.get('href'))
                if playlist_id is None:
                    continue
                # csv.writer quotes embedded commas, so the title is stored
                # verbatim.
                title = link.get('title', '')
                gd_list.append({'r_name': title, 'r_id': playlist_id})
                csv_write.writerow([title, playlist_id])
        return gd_list

    def _fetch_playlist_songs(self, play_url, headers, proxy):
        """Return the ``{'name', 'g_id'}`` dicts for the tracks on one playlist page.

        Returns an empty list (and logs) when the request fails or the page
        has no hidden track list.
        """
        songs = []
        try:
            response = self.s.get(play_url, headers=headers, proxies=proxy).content
        except requests.RequestException:
            logging.info('请求失败')
            return songs
        soup = BeautifulSoup(response, 'lxml')
        track_list = soup.find('ul', {'class': 'f-hide'})
        if track_list is None:
            logging.info('请求失败')
            return songs
        for link in track_list.find_all('a'):
            song_id = self._id_from_href(link.get('href'))
            if song_id is None:
                continue
            songs.append({'name': link.text, 'g_id': song_id})
        return songs

    def wygid(self):
        """Scrape the tracks of every chart and playlist.

        Runs :meth:`wybd` and :meth:`wygd`, fetches each resulting playlist
        page and appends every track to ``./wyy/单曲1.csv``.

        Returns:
            list[list[dict]]: one inner list per chart/playlist, holding
            ``{'name': ..., 'g_id': ...}`` dicts (empty when the page could
            not be scraped).
        """
        proxy = self.call_proc()
        headers = self.get_header()
        r_lists = self.wybd()
        gd_lists = self.wygd()
        h_list = []
        with open('./wyy/单曲1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')
            for playlist in r_lists + gd_lists:
                play_url = self.con_url + 'playlist?id={}'.format(playlist['r_id'])
                songs = self._fetch_playlist_songs(play_url, headers, proxy)
                for song in songs:
                    csv_write.writerow([song['name'], song['g_id']])
                h_list.append(songs)
        return h_list

    def _fetch_hot_comments(self, song_id, headers, proxy):
        """Fetch the hot comments of one song.

        Returns:
            list[tuple[dict, list]]: ``(raw_comment, csv_row)`` pairs, where
            the row is ``[content, nickname, liked_count]``.

        Raises:
            requests.RequestException: on network or HTTP errors.
            ValueError: if the response body is not valid JSON.
            KeyError, TypeError, AttributeError: if the payload does not have
                the expected structure.
        """
        response = requests.post(
            'https://music.163.com/weapi/v1/resource/comments/R_SO_4_{}'.format(song_id),
            headers=headers, params=self.params, cookies=self.cookies,
            data=self.data, proxies=proxy)
        response.raise_for_status()
        hot_comments = response.json()['hotComments']
        # Rows are built here so a malformed comment fails the whole fetch
        # before anything is written, avoiding partial or duplicate CSV output.
        return [
            (comment, [comment['content'].strip(),
                       comment['user']['nickname'],
                       str(comment['likedCount'])])
            for comment in hot_comments
        ]

    def wypl(self):
        """Scrape the hot comments of every track found by :meth:`wygid`.

        Each song is tried once with the current proxy and, on failure, once
        more with a fresh proxy; a song that fails twice is skipped. Rows are
        appended to ``./wyy/云音乐热评1.csv``.

        Returns:
            list[dict]: the raw hot-comment dicts of all songs, in order.
        """
        proxy = self.call_proc()
        headers = self.get_header()
        result = self.wygid()
        data_pl = []
        with open('./wyy/云音乐热评1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')
            for songs in result:
                for song in songs:
                    print('歌曲:', song)
                    csv_write.writerow(['歌曲名', song['name']])
                    csv_write.writerow(['评论', '用户名', '点赞数'])
                    try:
                        comments = self._fetch_hot_comments(song['g_id'], headers, proxy)
                    except (requests.RequestException, ValueError, KeyError,
                            TypeError, AttributeError):
                        # The proxy may have died or been blocked: switch to a
                        # fresh one and retry once.
                        proxy = self.call_proc()
                        try:
                            comments = self._fetch_hot_comments(song['g_id'], headers, proxy)
                        except (requests.RequestException, ValueError, KeyError,
                                TypeError, AttributeError):
                            print('请求失败')
                            continue
                    for comment, row in comments:
                        data_pl.append(comment)
                        csv_write.writerow(row)
        return data_pl

    def main(self):
        """Run the full crawl (headers, proxy, then hot comments), printing progress."""
        print('开始获取headers')
        header = self.get_header()
        print("headers: <%s>" % header)
        print('开始获取proc')
        proc = self.call_proc()
        print("proc: <%s>" % proc)
        # wypl() itself runs wygid(), which in turn runs wybd() and wygd().
        print('开始获取评论')
        comments = self.wypl()
        print("wypl: <%s>" % comments)
if __name__ == '__main__':
    # Script entry point: build the scraper with a freshly randomized
    # User-Agent and run the whole crawl.
    WangYiyun(get_ua()).main()