Scraping Lianjia Second-Hand Housing Listings

The spider is as follows:

# -*- coding: utf-8 -*-
import scrapy


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    # Listing pages 1-100 for Beijing second-hand housing.
    start_urls = ['https://bj.lianjia.com/ershoufang/pg{}/'.format(num) for num in range(1, 101)]

    def parse(self, response):
        # Collect the detail-page links from each listing card.
        hrefs = response.xpath('//div[@class="info clear"]/div[@class="title"]/a/@href').extract()
        for href in hrefs:
            # A plain GET is enough here; FormRequest is only for form submissions.
            yield scrapy.Request(href, callback=self.parse_info)

    def parse_info(self, response):
        price = response.xpath('//span[@class="total"]/text()').extract_first()
        unitPrice = response.xpath('//span[@class="unitPriceValue"]/text()').extract_first()
        hu_xing = response.xpath('//div[@class="houseInfo"]/div[@class="room"]/div[1]/text()').extract_first()
        height_num = response.xpath('//div[@class="houseInfo"]/div[@class="room"]/div[2]/text()').extract_first()
        direction = response.xpath('//div[@class="houseInfo"]/div[@class="type"]/div[1]/text()').extract_first()
        zhuang_xiu = response.xpath('//div[@class="houseInfo"]/div[@class="type"]/div[2]/text()').extract_first()
        area_num = response.xpath('//div[@class="houseInfo"]/div[@class="area"]/div[1]/text()').extract_first()
        age = response.xpath('//div[@class="houseInfo"]/div[@class="area"]/div[2]/text()').extract_first()
        xiao_qu = response.xpath('//div[@class="aroundInfo"]/div[@class="communityName"]/a[1]/text()').extract_first()
        area_name = response.xpath('//div[@class="aroundInfo"]/div[@class="areaName"]/span[2]/a[1]/text()').extract_first()
        detail_address = response.xpath('//div[@class="aroundInfo"]/div[@class="areaName"]/span[2]/a[2]/text()').extract_first()
        # The ring-road label shares its span with the district links, so strip
        # those texts (and non-breaking spaces) out of the span's full text.
        # Guard with `or ''` so a missing field does not raise a TypeError.
        huan_text = response.xpath('//div[@class="areaName"]/span[@class="info"]').xpath('string(.)').extract_first() or ''
        huan_num = huan_text.replace(area_name or '', '').replace(detail_address or '', '').replace('\xa0', '')
        subway = response.xpath('//a[@class="supplement"]/text()').extract_first()
        yield {
            'price':price,
            'unitPrice':unitPrice,
            'hu_xing':hu_xing,
            'height_num':height_num,
            'direction':direction,
            'zhuang_xiu':zhuang_xiu,
            'area_num':area_num,
            'age':age,
            'xiao_qu':xiao_qu,
            'area_name':area_name,
            'detail_address':detail_address,
            'huan_num':huan_num,
            'subway':subway
        }
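
Every field above repeats the same extract_first() pattern, and a missing node comes back as None. A small helper can make the default explicit; this is a minimal sketch, and the first() name and empty-string default are my own choices, not part of the original spider:

def first(response, query, default=''):
    # Hypothetical helper: wraps extract_first() so a missing field
    # yields a default instead of None.
    return response.xpath(query).extract_first() or default

# Usage inside parse_info, for example:
#   price = first(response, '//span[@class="total"]/text()')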

The settings are as follows:

from fake_useragent import UserAgent

# The project scaffold was generated as 'job51', hence the module paths below.
BOT_NAME = 'job51'

SPIDER_MODULES = ['job51.spiders']
NEWSPIDER_MODULE = 'job51.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Note: this picks one random User-Agent when the settings module loads,
# so every request in the run shares the same header.
USER_AGENT = UserAgent(verify_ssl=False).random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'job51.middlewares.Job51SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'job51.middlewares.Job51DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'job51.pipelines.MongoPipeline': 300,
}
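
To rotate the User-Agent per request rather than per run, the usual route is a downloader middleware. A minimal sketch, assuming fake_useragent is available; the RandomUserAgentMiddleware name and the priority 400 are my own choices, not part of this project:

# middlewares.py -- a sketch of per-request User-Agent rotation.
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    def __init__(self):
        self.ua = UserAgent(verify_ssl=False)

    def process_request(self, request, spider):
        # Overwrite the header on every outgoing request.
        request.headers['User-Agent'] = self.ua.random

# Enable it in settings.py, for example:
# DOWNLOADER_MIDDLEWARES = {
#     'job51.middlewares.RandomUserAgentMiddleware': 400,
# }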

The pipeline is as follows:

import pymongo

class MongoPipeline(object):
    def open_spider(self, spider):
        # Connect to a local MongoDB instance with the default host/port.
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # Insert a copy so insert_one's added ObjectId (_id) does not
        # end up on the item that Scrapy keeps passing along.
        self.client.job.nlp.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Snapshot the crawl stats before closing the connection.
        log_stats = str(spider.crawler.stats.get_stats())
        self.client.close()
        print(log_stats)
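
The database and collection names (job.nlp) are hardcoded here. Scrapy's usual pattern is to read connection details from settings via from_crawler; below is a minimal sketch along those lines, where MONGO_URI, MONGO_DB, and the ershoufang collection name are illustrative, not existing settings in this project:

import pymongo


class ConfigurableMongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection details from settings.py instead of hardcoding them.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'lianjia'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db['ershoufang'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()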

The start script is as follows:

from scrapy.cmdline import execute

# The spider's name attribute is 'lianjia', so that is what we crawl.
execute('scrapy crawl lianjia'.split())
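
Note that execute() hands the process over to Scrapy's command-line entry point and does not return, so nothing placed after it will run. If you need control back after the crawl (for example, to post-process the data), CrawlerProcess is an alternative; a minimal sketch:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the spider in-process and block until it finishes.
process = CrawlerProcess(get_project_settings())
process.crawl('lianjia')
process.start()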
