Scrapy爬虫抓取ZOL手机详情

前不久需要一批手机数据做测试,所以就爬取了ZOL上关于手机的各项参数,现在把代码分享出来,希望大家能够多提改进意见。

ZOL手机信息

想要抓取ZOL关于手机的信息需要三个步骤:

手机商城列表页 → 单个手机详情页 → 当前手机更多详情页面

爬虫代码

# -*- coding: gbk -*-
from scrapy.spiders import CrawlSpider
import scrapy
from urllib.parse import urljoin


class PhoneSpider(CrawlSpider):
    """Crawl ZOL phone listings and append selected specs to a CSV file.

    The crawl follows three hops:

    1. ``parse``           - a phone list page; yields one request per phone.
    2. ``get_phone_page``  - a single phone's page; follows its "more
       parameters" link.
    3. ``get_details``     - the full parameter page; extracts the columns
       named in ``detail_columns`` and appends one CSV line to
       ``output_path``.

    The class attributes below hold what used to be hard-coded values; their
    defaults reproduce the previous behaviour and may be overridden in a
    subclass.

    NOTE(review): ``parse`` is overridden and no ``rules`` are defined, so
    the rule-following machinery of ``CrawlSpider`` is presumably unused and
    this behaves like a plain spider - confirm before adding ``rules``.
    """

    name = "phone"
    allowed_domains = ["detail.zol.com.cn"]

    # List pages are numbered 1..list_page_count.
    list_url_template = (
        'http://detail.zol.com.cn/cell_phone_index/subcate57_list_{}.html'
    )
    list_page_count = 30

    # Output file, opened in append mode for every phone.
    output_path = 'phone_details.csv'
    # None keeps open()'s platform-default text encoding.
    output_encoding = None

    # Parameter name as shown on the page -> zero-based CSV column (the
    # phone title is written in front of these columns).
    detail_columns = {
        '上市日期': 0,
        '出厂系统内核': 1,
        '主屏分辨率': 2,
        'CPU型号': 3,
        'GPU型号': 4,
        '电池容量': 5,
        '质保时间': 6,
        '手机重量': 7,
    }

    def start_requests(self):
        """Yield one request per phone list page."""
        for page in range(1, int(self.list_page_count) + 1):
            yield scrapy.Request(
                self.list_url_template.format(page),
                self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Phone list page: yield a request for every phone that has a link.

        The link's ``title`` attribute is carried along in ``meta['title']``.
        """
        phone_plane = response.css('div.pic-mode-box')
        for entry in phone_plane.css('ul li'):
            link = entry.css('h3 a[href]')
            phone_url = link.css('a::attr(href)').extract_first()
            if not phone_url:
                # urljoin() returns the base URL for an empty/None target,
                # which would re-request this list page with the wrong
                # callback, so entries without a link are skipped.
                continue
            phone_title = link.css('a::attr(title)').extract_first()
            yield scrapy.Request(
                urljoin(response.url, phone_url),
                self.get_phone_page,
                dont_filter=False,
                meta={'title': phone_title},
            )

    def get_phone_page(self, response):
        """Single phone page: follow the link to the full parameter page."""
        section_vec = response.css('div.section div.section-content')
        next_a = section_vec.css('a._j_MP_more')
        detail_url = next_a.css('a::attr(href)').extract_first()
        if not detail_url:
            # Without this guard urljoin() would hand back response.url and
            # the spider would request the page it is already on.
            self.logger.warning(
                'No parameter-page link found on %s', response.url
            )
            return
        yield scrapy.Request(
            urljoin(response.url, detail_url),
            self.get_details,
            dont_filter=False,
            meta={'title': response.meta['title']},
        )

    @staticmethod
    def _first_text(row, cell_tag):
        """Return the first text node of ``cell_tag`` in ``row``, or None.

        If the cell contains a link, the first text node of the link is
        used instead of the cell's own first text node.
        """
        link_selector = cell_tag + ' a'
        if row.css(link_selector):
            return row.css(link_selector + ' ::text').extract_first()
        return row.css(cell_tag + ' ::text').extract_first()

    def get_details(self, response):
        """Full parameter page: append ``title,<columns...>`` to the CSV.

        Columns that are not found on the page are written as empty
        strings. Commas inside values are replaced by ';' so they cannot
        be mistaken for column separators.
        """
        # The list page may not have provided a title attribute.
        title = response.meta['title'] or ''
        detail_list = [''] * len(self.detail_columns)

        for row in response.css('div.detailed-parameters').css('tr'):
            detail_name = self._first_text(row, 'th')
            if detail_name is None:
                continue
            column = self.detail_columns.get(detail_name.strip())
            if column is None:
                continue
            detail_content = self._first_text(row, 'td')
            if detail_content is None:
                # Empty cell: keep whatever is already stored instead of
                # failing on None.replace().
                continue
            detail_list[column] = detail_content.replace(',', ';')

        # The title gets the same comma treatment as the other columns so a
        # comma in a product name does not shift the columns.
        write_line = ','.join([title.replace(',', ';')] + detail_list)
        with open(self.output_path, 'a', encoding=self.output_encoding) as f:
            f.write(write_line + '\n')
        print('Write : ' + write_line)

你可能感兴趣的:(爬虫,scrapy)