使用Scrapy框架获取网易云音乐排行榜歌单

SCRAPY框架文件

  • 1.创建项目musicspider
  • 2.创建Spider
  • 3.编写项目文件
    • items.py (定义要抓取的数据)
    • musiclist.py (编写提取item数据的spider)
    • pipelines.py (处理pipeline管道文件)
    • settings.py (配置settings文件)
  • 4.执行爬虫

1.创建项目musicspider

scrapy startproject musicspider

2.创建Spider

进入项目目录musicspider后创建spider

cd musicspider
scrapy genspider musiclist music.163.com

3.编写项目文件

items.py (定义要抓取的数据)

import scrapy

class MusicspiderItem(scrapy.Item):
    """One scraped record: a chart and, once a chart page is parsed, one song.

    The spider first fills only the chart fields, then emits a full item
    (chart fields plus song fields) for every song found on a chart page.
    """

    # Chart (ranking list) display name and the URL of its page.
    toplistname = scrapy.Field()
    toplisturls = scrapy.Field()

    # Song title and the URL of the song's page.
    musicname = scrapy.Field()
    musicurls = scrapy.Field()

musiclist.py (编写提取item数据的spider)

# -*- coding: utf-8 -*-
import scrapy
import sys
#返回musicspider目录
sys.path.append('.\\.\\musicspider')
from musicspider.items import MusicspiderItem


class MusiclistSpider(scrapy.Spider):
    """Crawl the chart index page, then the song list of every chart on it.

    ``parse`` handles the index page and schedules one request per chart;
    ``second_parse`` handles a chart page and yields one item per song.
    """

    name = 'musiclist'
    allowed_domains = ['music.163.com']
    start_urls = ['https://music.163.com/discover/toplist']

    def parse(self, response):
        """Yield one request per chart linked from the index page.

        Each request carries a partially filled ``MusicspiderItem`` (chart
        name and chart URL) in ``meta['meta_1']`` for ``second_parse``.
        """
        # Walk the anchor nodes themselves instead of two parallel
        # text/href lists: an anchor missing either part would otherwise
        # pair every later name with the wrong URL or raise IndexError.
        for link in response.xpath('//div[@class="item f-cb"]/p/a'):
            toplistname = link.xpath('text()').extract_first()
            href = link.xpath('@href').extract_first()
            if not toplistname or not href:
                continue

            item = MusicspiderItem()
            item['toplistname'] = toplistname
            # The hrefs on the page are site-relative; resolve them against
            # the URL of the page they were found on.
            item['toplisturls'] = response.urljoin(href)

            yield scrapy.Request(
                url=item['toplisturls'],
                meta={'meta_1': item},
                callback=self.second_parse,
            )

    def second_parse(self, response):
        """Yield one complete item per song listed on a chart page.

        The chart name and URL are copied from the item that ``parse``
        attached to the request under ``meta['meta_1']``.
        """
        meta_1 = response.meta['meta_1']

        # Same per-anchor extraction as in ``parse`` so a song title can
        # never be matched with another song's URL.
        for link in response.xpath('//div/ul[@class="f-hide"]/li/a'):
            musicname = link.xpath('text()').extract_first()
            href = link.xpath('@href').extract_first()
            if not musicname or not href:
                continue

            item = MusicspiderItem()
            item['toplistname'] = meta_1['toplistname']
            item['toplisturls'] = meta_1['toplisturls']
            item['musicname'] = musicname
            item['musicurls'] = response.urljoin(href)

            yield item

pipelines.py (处理pipeline管道文件)

import xlrd
import xlwt
import os
from xlutils.copy import copy

class MusicspiderPipeline(object):
    """Append every scraped song to ``RankingList.xls``, one sheet per chart."""

    def process_item(self, item, spider):
        """Write one song row to the workbook and pass the item on.

        The workbook is created on the first item. For later items it is
        re-read, copied into a writable workbook, extended by one row and
        saved again. The sheet is named after ``item['toplistname']``; the
        song name goes to column 0 and the song URL to column 1.

        Args:
            item: Item with ``toplistname``, ``musicname`` and ``musicurls``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item, so that any later pipeline stage or feed
            exporter still receives it.
        """
        filename = 'RankingList.xls'
        sheetname = item['toplistname']

        if os.path.exists(filename):
            # xlrd workbooks are read-only; xlutils' copy() turns the
            # existing file into a writable xlwt workbook.
            existing = xlrd.open_workbook(filename, formatting_info=True)
            workbook = copy(existing)
            if sheetname in existing.sheet_names():
                # Continue directly below the rows already written.
                row = existing.sheet_by_name(sheetname).nrows
                sheet = workbook.get_sheet(sheetname)
            else:
                row = 0
                sheet = workbook.add_sheet(sheetname)
        else:
            workbook = xlwt.Workbook()
            row = 0
            sheet = workbook.add_sheet(sheetname)

        style = self.setXlsStyle(sheet)
        sheet.write(row, 0, item['musicname'], style)
        sheet.write(row, 1, item['musicurls'], style)
        workbook.save(filename)

        # Scrapy hands the return value to the next pipeline stage;
        # returning nothing would pass None downstream.
        return item

    def setXlsStyle(self, sheet):
        """Build the cell style and set the widths of the two data columns.

        Args:
            sheet: The xlwt worksheet whose columns 0 and 1 are resized.

        Returns:
            An ``xlwt.XFStyle`` carrying the alignment, font and fill pattern.
        """
        style = xlwt.XFStyle()

        al = xlwt.Alignment()
        al.horz = xlwt.Alignment.HORZ_LEFT      # 0x01
        al.vert = xlwt.Alignment.VERT_CENTER    # 0x01
        # XFStyle reads the lowercase ``alignment`` attribute; assigning to
        # ``Alignment`` only created an unused attribute, so the alignment
        # was never applied.
        style.alignment = al

        font = xlwt.Font()
        font.name = u'微软雅黑'      # font face
        font.bold = True            # bold
        font.underline = False      # no underline
        font.italic = False         # no italics
        font.height = 200           # presumably 10 pt (1/20 pt units) -- confirm
        style.font = font

        pattern = xlwt.Pattern()
        pattern.pattern = xlwt.Pattern.SOLID_PATTERN
        pattern.pattern_fore_colour = 1   # palette index 1; presumably white -- confirm
        style.pattern = pattern

        # Column 0 holds song names, column 1 the longer song URLs.
        sheet.col(0).width = 6300
        sheet.col(1).width = 12300

        return style


settings.py (配置settings文件)

# Scrapy settings for the musicspider project.
BOT_NAME = 'musicspider'

# The same package both holds the existing spiders and receives new ones.
NEWSPIDER_MODULE = 'musicspider.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Respect the target site's robots.txt rules.
ROBOTSTXT_OBEY = True

# Enable the pipeline that writes the scraped songs to the Excel workbook.
ITEM_PIPELINES = {
    'musicspider.pipelines.MusicspiderPipeline': 300,
}

4.执行爬虫

scrapy crawl musiclist

你可能感兴趣的:(scrapy框架,python,scrapy框架)