scrapy+selenium获取哔哩哔哩排行榜(应援榜)(动态加载)

目标数据:

scrapy+selenium获取哔哩哔哩排行榜(应援榜)(动态加载)_第1张图片

 

 爬虫代码:

# -*- coding: utf-8 -*-
import scrapy
from bilibili_yy.items import BilibiliYyItem
import re
from selenium import webdriver
import pyperclip

class BiliSpider(scrapy.Spider):
    """Spider for the Bilibili manga fan-support ("应援") ranking page.

    The page is rendered dynamically; a Selenium Chrome driver is created
    here — presumably a downloader middleware uses ``self.driver`` to fetch
    the rendered HTML (TODO: confirm against the middleware code).
    """
    name = 'bili'
    # allowed_domains = ['manga.bilibili.com']
    start_urls = ['https://manga.bilibili.com/ranking?from=manga_homepage#/ouenn/']

    def __init__(self, *args, **kwargs):
        # Call the base initializer so Scrapy's standard spider setup runs
        # (the original skipped this).
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        """Yield one item per ranking row.

        Fields: rank number, rank movement, cover image, detail link,
        title, author, fan value, and up to three top supporters.
        """
        # Long XPath fragments reused below, hoisted for readability.
        digit_xpath = ('.//span[starts-with(@class,"digit-item bg-center '
                       'bg-contain bg-no-repeat dp-i-block digit-")]/@class')
        movement_xpath = ('.//div[starts-with(@class,"rank-movement p-absolute '
                          'bg-center bg-cover bg-no-repeat")]/@class')
        award_xpath = './/div[@class="award-user-ctnr p-absolute w-100"]/div[%d]/@title'

        for data_s in response.xpath('//div[@class="rank-item dp-i-block border-box p-relative"]'):
            # BUG FIX: build a fresh item per row. The original reused one
            # item instance for every yield, so asynchronous pipeline stages
            # could see later rows' data clobber earlier ones.
            item = BilibiliYyItem()

            # The rank is encoded in CSS class names, one <span> per digit
            # (e.g. "... digit-3"). Joining all digits and zero-padding to
            # width 2 reproduces the original 1-digit/2-digit branches and
            # also handles ranks of three or more digits.
            digit_classes = data_s.xpath(digit_xpath).extract()
            item['paiming'] = ''.join(
                re.findall(r"\d", cls)[0] for cls in digit_classes
            ).zfill(2)

            # Rank movement (hold / up / down) is also encoded in the class name.
            pmqingkuang = data_s.xpath(movement_xpath).extract()[0]
            if 'hold' in pmqingkuang:
                item['pmqingkuang'] = '保持'
            elif 'up' in pmqingkuang:
                item['pmqingkuang'] = '上升'
            else:
                item['pmqingkuang'] = '下降'

            item['pic_link'] = data_s.xpath(
                './/div[starts-with(@class,"manga-cover bg-center bg-cover bg-no-repeat")]/@data-src'
            ).extract()[0]

            # Title anchor carries both the relative detail URL and the name.
            title_a = data_s.xpath('.//a[starts-with(@class,"dp-block manga-title")]')
            item['cartoon_link'] = 'https://manga.bilibili.com' + title_a.xpath('./@href').extract()[0]
            item['name'] = title_a.xpath('./text()').extract()[0]

            item['author'] = data_s.xpath(
                './/p[@class="fans-author-text t-over-hidden t-no-wrap"]/text()'
            ).extract()[0]
            item['fensizhi'] = data_s.xpath(
                './/p[@class="fans-value"]/text()'
            ).extract()[0].replace(' 万 粉丝值', '')

            # Up to three top supporters ("主公") live in div[2]..div[4];
            # rows with fewer supporters get an empty string.
            for div_idx, field in enumerate(('zhugong1', 'zhugong2', 'zhugong3'), start=2):
                titles = data_s.xpath(award_xpath % div_idx).extract()
                item[field] = titles[0] if titles else ''

            yield item

    def closed(self, reason):
        """Quit the Chrome driver when the spider finishes.

        BUG FIX: the original defined ``close_spider``, which Scrapy only
        calls on item pipelines, never on spiders — so ``driver.quit()``
        never ran and the browser process leaked. Scrapy calls
        ``closed(reason)`` on the spider itself.
        """
        print('关闭浏览器对象')
        self.driver.quit()

    def close_spider(self, spider):
        # Kept for backward compatibility with any external caller;
        # Scrapy itself does not invoke this hook on spiders.
        self.closed(reason='close_spider')

写入 MongoDB 的结果:

scrapy+selenium获取哔哩哔哩排行榜(应援榜)(动态加载)_第2张图片

 

 全部文件下载:

https://download.csdn.net/download/Ferencz/12127196

 

你可能感兴趣的:(爬虫,scrapy,python)