import re
import os
import csv
import json
import time
import requests
from urllib.parse import urlencode
class Spider:
    """Crawl the kuwo.cn artist listing, saving rows to CSV and avatars to disk.

    ``run()`` fetches every page produced by ``mkurl()``, writes one CSV row
    per artist through ``save_data()`` and downloads each artist picture with
    ``img_downloader()``.  After ``run()`` returns, the ``time`` property
    reports the elapsed wall-clock time.
    """

    # Endpoint queried for each page of artists.
    API_URL = 'http://www.kuwo.cn/api/www/artist/artistInfo?'
    # Page numbers requested by mkurl().
    PAGES = range(1, 4)
    # Seconds to wait on any HTTP request; without it requests can block forever.
    REQUEST_TIMEOUT = 10
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    # Keys read from every artist entry, in the order parse_page() yields them.
    _FIELDS = ('artistFans', 'albumNum', 'mvNum', 'musicNum',
               'name', 'aartist', 'pic')
    # Characters that are path separators or otherwise illegal in file names.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self):
        self.path = '.'
        self.img_path = 'imgs'
        self.csvfilename = 'datas.csv'
        self.logfilename = 'run.log'
        # Running counter used as a prefix to keep image file names unique.
        self.flag = 0
        # Initialised here so the ``time`` property is usable before run().
        self.runtime = 0.0

    def run(self):
        """Run the whole crawl and record its duration in ``self.runtime``."""
        start = time.time()
        csv_path = self._csv_path()
        # Start from a clean file because save_data() always appends.
        if os.path.exists(csv_path):
            os.remove(csv_path)
        self.save_data(['名字', '别名', '粉丝数', '专辑数', 'mv数', '音乐发行数量', '头像链接'])
        for url in self.mkurl():
            for (artistFans, albumNum, mvNum, musicNum,
                 name, aartist, pic) in self.parse_page(self.get_page(url)):
                self.save_data([name, aartist, artistFans,
                                albumNum, mvNum, musicNum, pic])
                self.img_downloader(pic, name)
        self.runtime = time.time() - start

    def mkurl(self):
        """Yield the listing URL for every page number in ``PAGES``."""
        params = {
            'category': '0',
            'prefix': '',
            'pn': '3',  # placeholder, replaced by the page number below
            'rn': '102',
            'httpsStatus': '1',
            'reqId': '01a9c5f0-bb51-11ea-a4ae-6f62032e0cbe',
        }
        for page in self.PAGES:
            # Overriding an existing key keeps its position, so the query
            # string keeps the same parameter order for every page.
            yield self.API_URL + urlencode({**params, 'pn': page})

    def get_page(self, url):
        """Fetch ``url`` with the site's expected headers and return the response.

        Raises:
            requests.RequestException: on network failure, timeout, or an
                HTTP error status.
        """
        headers = {
            'Cookie': 'Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1593486940; _ga=GA1.2.2081093336.1593486940;'
            ' _gid=GA1.2.1984756662.1593486940; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1593505734;'
            ' kw_token=TBONPSLXNY',
            'csrf': 'TBONPSLXNY',
            'Host': 'www.kuwo.cn',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'http://www.kuwo.cn/singers',
            'User-Agent': self.USER_AGENT,
        }
        response = requests.get(url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        # Fail here with a clear HTTP error instead of later in .json().
        response.raise_for_status()
        return response

    def parse_page(self, response):
        """Yield one tuple of strings per artist in a listing response.

        Each tuple is ``(artistFans, albumNum, mvNum, musicNum, name,
        aartist, pic)``.  Values are read straight from the decoded JSON
        rather than regex-scraped from re-serialised text, so a field is
        found regardless of its position in the object and names containing
        quotes or commas are returned intact.  Missing or null fields become
        ``''``; a response without ``data``/``artistList`` yields nothing.
        """
        payload = response.json() or {}
        artists = (payload.get('data') or {}).get('artistList') or []
        for item in artists:
            yield tuple(self._field(item, key) for key in self._FIELDS)

    @staticmethod
    def _field(item, key):
        """Return ``item[key]`` as a stripped string ('' when absent or null)."""
        value = item.get(key)
        if value is None:
            return ''
        if not isinstance(value, str):
            # Numbers and other JSON values keep their JSON spelling.
            value = json.dumps(value, ensure_ascii=False)
        return value.strip()

    def _csv_path(self):
        """Return the location of the CSV output file."""
        return '{}/{}'.format(self.path, self.csvfilename)

    def save_data(self, item):
        """Append ``item`` as one row to the CSV file and print a progress tick."""
        print('-', end='')
        # utf_8_sig writes a BOM; newline='' lets the csv module control
        # line endings itself.
        with open(self._csv_path(), 'a', encoding='utf_8_sig', newline='') as csvfile:
            csv.writer(csvfile).writerow(item)

    @classmethod
    def _safe_filename(cls, name):
        """Replace characters that cannot appear in a file name with ``_``."""
        return cls._UNSAFE_FILENAME_CHARS.sub('_', name)

    def img_downloader(self, url, name):
        """Download the image at ``url`` into ``img_path`` as ``<flag>-<name>.jpg``.

        The download is best effort: an empty URL or a failed request is
        skipped (a failure is reported on stdout) so that a single bad avatar
        does not abort the crawl.  ``self.flag`` advances only after a file
        has been written.
        """
        if not url:
            return
        os.makedirs(self.img_path, exist_ok=True)
        headers = {'User-Agent': self.USER_AGENT}
        try:
            response = requests.get(url, headers=headers,
                                    timeout=self.REQUEST_TIMEOUT)
            # Avoid saving an HTTP error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as exc:
            print('\nimage download failed for {}: {}'.format(url, exc))
            return
        filename = '{}-{}.jpg'.format(self.flag, self._safe_filename(name))
        with open('{}/{}'.format(self.img_path, filename), 'wb') as f:
            f.write(response.content)
        self.flag += 1

    @property
    def time(self):
        """Human-readable elapsed time of the last ``run()`` (0.0 before any run)."""
        return '总共用时:{}秒'.format(self.runtime)
if __name__ == '__main__':
    # Script entry point: build a crawler and perform one full crawl.
    Spider().run()