Python网络爬虫--项目实战--scrapy爬取人人车

一、目标

爬取多页人人车的车辆信息

二、分析

2.1 网站分析

在网页源代码中可以搜索到页面中的数据,所以可以判断该页面为静态加载的

三、完整代码

renrenche.py

import scrapy

from car.items import RrcItem


class RenrencheSpider(scrapy.Spider):
    """Spider that crawls used-car listings from renrenche.com page by page.

    Yields one ``RrcItem`` per car card (name, total price, down payment),
    then follows the "next page" link until the pagination marker says the
    last page was reached.
    """

    name = 'renrenche'
    allowed_domains = ['www.renrenche.com']
    start_urls = ['https://www.renrenche.com/bj/ershouche/?&plog_id=618ab1bbf616cab93022afa088592885']
    # Site root, used to turn relative pagination hrefs into absolute URLs.
    base_url = 'https://www.renrenche.com'

    def parse(self, response):
        """Parse one listing page: yield car items, then request the next page."""
        # Each car card is an <li>/<a> without a rel attribute
        # (promo/ad anchors carry rel and are excluded).
        selector = response.xpath('//ul[contains(@class,"row-fluid list-row js-car-list")]/li/a[not(@rel)]')
        for car in selector:
            car_name = car.xpath('./h3/text()').extract_first()
            # The price text is padded with newlines/spaces; default to '' so a
            # missing node cannot raise AttributeError on .replace().
            raw_price = car.xpath('./div[contains(@class,"tags-box")]/div/text()').extract_first(default='')
            total_price = raw_price.replace("\n", "").replace(" ", "") + "万"
            down_pay = car.xpath('./div[contains(@class,"tags-box")]/div/div/div/text()').extract_first()

            car_item = RrcItem()
            car_item['car_name'] = car_name
            car_item['car_price'] = total_price
            car_item['down_pay'] = down_pay

            yield car_item

        # NOTE(review): the original follows the next page only when the
        # pagination <ul> has no class attribute captured here — preserved
        # as-is; confirm this marker against the live page markup.
        flag = response.xpath('//ul[contains(@class,"pagination js-pagination")][last()]/@class').extract_first()
        if not flag:
            next_href = response.xpath('//ul[contains(@class,"pagination js-pagination")]/li[last()]/a/@href').extract_first()
            # Guard: a missing href would make the string concatenation fail.
            if next_href:
                yield scrapy.Request(url=self.base_url + next_href, callback=self.parse)

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import MySQLdb
from itemadapter import ItemAdapter

from car.spiders.renrenche import RenrencheSpider


class CarPipeline:
    """Default pass-through pipeline scaffolded by Scrapy.

    Performs no processing; it simply forwards every item unchanged to the
    next pipeline stage.
    """

    def process_item(self, item, spider):
        """Return *item* untouched (no-op stage)."""
        return item


class RrcPipeline:
    """Persist items scraped by ``RenrencheSpider`` into the MySQL table ``car``.

    One connection/cursor pair is opened for the spider's lifetime and closed
    when the spider finishes.
    """

    def open_spider(self, spider):
        """Open the MySQL connection and cursor once, when the spider starts."""
        # charset='utf8' is required so Chinese text inserts correctly
        # (the pitfall noted at the end of this article).
        self.conn = MySQLdb.Connect(host='localhost', user='root', password='6666',
                                    port=3306, database='maiche', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one car row, then pass the item through for later pipelines."""
        if isinstance(spider, RenrencheSpider):
            # Parameterized query: the previous %-formatted SQL string was
            # vulnerable to SQL injection and broke on values containing quotes.
            self.cursor.execute(
                "insert into car(carname,totalprice,downpay) values(%s,%s,%s)",
                (item.get('car_name'), item.get('car_price'), item.get('down_pay')),
            )
            self.conn.commit()

        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider closes."""
        # Close the cursor too — the original leaked it.
        self.cursor.close()
        self.conn.close()

四、遇到的坑

1.创建数据库连接时没有加编码格式(charset='utf8'),导致中文数据无法正确插入数据库

你可能感兴趣的:(python)