scrapy startproject musicspider
# Enter the project directory "musicspider", then generate the spider:
cd musicspider
scrapy genspider musiclist music.163.com
import scrapy
class MusicspiderItem(scrapy.Item):
    """Item for one scraped song: its ranking list plus the song itself."""
    # Name and page URL of the ranking list the song belongs to
    toplistname = scrapy.Field()
    toplisturls = scrapy.Field()
    # Name and page URL of the individual song
    musicname = scrapy.Field()
    musicurls = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
import sys
# Add the musicspider project directory to the module search path so the
# "from musicspider.items import ..." line below resolves at runtime.
# NOTE(review): '.\\.\\musicspider' is a Windows-style relative path that only
# works when the spider is launched from the project root — confirm.
sys.path.append('.\\.\\musicspider')
from musicspider.items import MusicspiderItem
class MusiclistSpider(scrapy.Spider):
    """Crawl NetEase Cloud Music's ranking lists and yield one item per song.

    `parse` scrapes the index of ranking lists and requests each list page;
    `second_parse` scrapes the songs on a single ranking-list page.
    """
    name = 'musiclist'
    allowed_domains = ['music.163.com']
    start_urls = ['https://music.163.com/discover/toplist']

    def parse(self, response):
        """Extract each ranking list's name/URL and request its page.

        Yields one `scrapy.Request` per ranking list, carrying the partially
        filled item in `meta['meta_1']` for `second_parse` to complete.
        """
        names = response.xpath('//div[@class="item f-cb"]/p/a/text()').extract()
        urls = response.xpath('//div[@class="item f-cb"]/p/a/@href').extract()
        # zip pairs each list name with its (relative) href; urljoin resolves
        # the href against https://music.163.com, replacing manual concat.
        for list_name, href in zip(names, urls):
            item = MusicspiderItem()
            item['toplistname'] = list_name
            item['toplisturls'] = response.urljoin(href)
            yield scrapy.Request(url=item['toplisturls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)

    def second_parse(self, response):
        """Extract every song name/URL from one ranking-list page.

        Yields a fully populated `MusicspiderItem` per song; the ranking-list
        fields are copied from the item passed through `response.meta`.
        """
        meta_1 = response.meta['meta_1']
        names = response.xpath('//div/ul[@class="f-hide"]/li/a/text()').extract()
        urls = response.xpath('//div/ul[@class="f-hide"]/li/a/@href').extract()
        for song_name, href in zip(names, urls):
            item = MusicspiderItem()
            item['toplistname'] = meta_1['toplistname']
            item['toplisturls'] = meta_1['toplisturls']
            item['musicname'] = song_name
            item['musicurls'] = response.urljoin(href)
            yield item
import xlrd
import xlwt
import os
from xlutils.copy import copy
class MusicspiderPipeline(object):
    """Append each scraped song to RankingList.xls, one sheet per ranking list.

    The workbook is reopened and rewritten for every item; rows accumulate
    per-sheet by reading the sheet's current row count before appending.
    """

    def process_item(self, item, spider):
        """Write one (song name, song URL) row to the item's ranking-list sheet.

        Creates the workbook and/or sheet on first use; otherwise appends
        after the last written row. Returns the item so later pipelines
        (Scrapy's pipeline contract) still receive it.
        """
        filename = 'RankingList.xls'
        if os.path.exists(filename):
            # formatting_info=True is required so xlutils.copy preserves
            # the existing cell styles of already-written rows.
            book = xlrd.open_workbook(filename, formatting_info=True)
            sheetnames = [sheet.name for sheet in book.sheets()]
            writable = copy(book)
            if item['toplistname'] in sheetnames:
                # Sheet exists: append after the last populated row.
                row = book.sheet_by_name(item['toplistname']).nrows
                sheet = writable.get_sheet(item['toplistname'])
            else:
                # First song of this ranking list: create its sheet.
                row = 0
                sheet = writable.add_sheet(item['toplistname'])
            self._write_row(writable, sheet, row, item, filename)
        else:
            # No workbook yet: create it with this item's sheet.
            writable = xlwt.Workbook()
            sheet = writable.add_sheet(item['toplistname'])
            self._write_row(writable, sheet, 0, item, filename)
        return item

    def _write_row(self, book, sheet, row, item, filename):
        """Write one styled (name, url) row at `row` and save the workbook."""
        style = self.setXlsStyle(sheet)
        sheet.write(row, 0, item['musicname'], style)
        sheet.write(row, 1, item['musicurls'], style)
        book.save(filename)

    def setXlsStyle(self, sheet):
        """Return the shared cell style; also widen the sheet's two columns."""
        style = xlwt.XFStyle()
        al = xlwt.Alignment()
        al.horz = 0x01  # HORZ_LEFT
        al.vert = 0x01  # VERT_CENTRE
        # BUG FIX: xlwt's XFStyle attribute is lowercase 'alignment'; the
        # original assigned 'style.Alignment', which xlwt silently ignored,
        # so the alignment was never applied to written cells.
        style.alignment = al
        font = xlwt.Font()
        font.name = u'微软雅黑'    # font face
        font.bold = True           # bold
        font.underline = False     # no underline
        font.italic = False        # no italics
        font.height = 200          # 10pt (height is in 1/20 pt units)
        style.font = font
        pattern = xlwt.Pattern()
        pattern.pattern = xlwt.Pattern.SOLID_PATTERN
        pattern.pattern_fore_colour = 1  # colour index 1 = white
        style.pattern = pattern
        sheet.col(0).width = 6300
        sheet.col(1).width = 12300
        return style
# Scrapy project settings (settings.py)
BOT_NAME = 'musicspider'
SPIDER_MODULES = ['musicspider.spiders']
NEWSPIDER_MODULE = 'musicspider.spiders'
# NOTE(review): with ROBOTSTXT_OBEY = True Scrapy fetches and honors
# music.163.com's robots.txt, which may disallow these pages and cause every
# request to be filtered — confirm the crawl actually yields items.
ROBOTSTXT_OBEY = True
# Route scraped items through the Excel-writing pipeline (priority 300).
ITEM_PIPELINES = {
'musicspider.pipelines.MusicspiderPipeline': 300,
}
scrapy crawl musiclist