Scraping 3,000 second-hand housing listings in Beijing with Scrapy

Target site: https://bj.lianjia.com/ershoufang/pg1/ (each listing page shows 30 homes, so pages pg1 through pg100 give the 3,000 listings in the title).

Setting up the items

import scrapy


class HomeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    hkind = scrapy.Field()      # layout (rooms/halls)
    area = scrapy.Field()       # floor area
    cxiang = scrapy.Field()     # orientation
    louceng = scrapy.Field()    # floor
    chengqu = scrapy.Field()    # district
    qumin = scrapy.Field()      # residential community name
    zprice = scrapy.Field()     # total price
    preprice = scrapy.Field()   # unit price
    zhuangsi = scrapy.Field()   # decoration status
    jtime = scrapy.Field()      # year built
    jiegou = scrapy.Field()     # building structure
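
A HomeItem behaves much like a dict restricted to the declared fields; a minimal sketch (the sample values are made up):

item = HomeItem()
item['hkind'] = '2室1厅'      # declared field: OK
item['area'] = '89.5平米'
print(dict(item))             # {'hkind': '2室1厅', 'area': '89.5平米'}
# item['foo'] = 1             # an undeclared field would raise KeyError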

Setting up the spider

import scrapy
from home.items import HomeItem
import time


class ExampleSpider(scrapy.Spider):
    name = 'example'
    # allowed_domains = ['example.com']
    # Pages pg1 .. pg100, 30 listings each -> 3,000 listings in total
    start_urls = []
    for i in range(1, 101):
        start_urls.append('https://bj.lianjia.com/ershoufang/pg{}/'.format(i))

    def parse(self, response):
        items = []
        cont = 0
        time.sleep(0.1)   # crude throttle; note this briefly blocks the whole crawler
        # houseInfo: layout | area | orientation | decoration | floor | year built | structure
        kind = response.xpath('//div[@class="houseInfo"]/text()').extract()
        # positionInfo holds two links per listing: community name, then district
        qm = response.xpath('//div[@class="positionInfo"]/a[@target="_blank"]/text()').extract()
        # total price (string, in units of 10,000 yuan)
        totalprice = response.xpath('//div[@class="totalPrice"]/span/text()').extract()
        # unit price
        unitPrice = response.xpath('//div[@class="unitPrice"]/span/text()').extract()
        for i in range(30):
            item = HomeItem()
            cc = kind[i].replace(' ', '').split('|')
            item['hkind'] = cc[0]
            item['area'] = cc[1]
            item['cxiang'] = cc[2]
            item['zhuangsi'] = cc[3]
            item['louceng'] = cc[4]
            item['jtime'] = cc[5]
            item['jiegou'] = cc[6]
            item['qumin'] = qm[cont]      # community name
            cont += 1
            item['chengqu'] = qm[cont]    # district
            cont += 1
            item['zprice'] = totalprice[i]
            item['preprice'] = unitPrice[i].replace('单价', '')
            items.append(item)
        return items
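
For reference, a minimal sketch of how the pipe-separated houseInfo text is split into the seven fields used above (the sample string is hypothetical; the field order follows the comment in the spider):

sample = '2室1厅 | 89.5平米 | 南 北 | 精装 | 中楼层(共18层) | 2010年建 | 板楼'
cc = sample.replace(' ', '').split('|')
# cc -> ['2室1厅', '89.5平米', '南北', '精装', '中楼层(共18层)', '2010年建', '板楼']
# cc[0]=layout, cc[1]=area, cc[2]=orientation, cc[3]=decoration,
# cc[4]=floor, cc[5]=year built, cc[6]=structure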




Setting up the pipeline

import time, codecs


class HomePipeline(object):
    def process_item(self, item, spider):
        today = time.strftime('%Y%m%d', time.localtime())
        filename = today + '.txt'
        # append one tab-separated line per item; the with-block closes the file
        with codecs.open(filename, 'a+', 'utf8') as f:
            f.write("%s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \r\n" % (
                item['hkind'], item['area'], item['cxiang'], item['louceng'],
                item['zhuangsi'], item['jtime'], item['jiegou'], item['qumin'],
                item['chengqu'], item['zprice'], item['preprice']))
        return item
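
Opening the output file once per crawl instead of once per item is a possible refinement; a sketch using the pipeline's open_spider/close_spider hooks (same filename logic and output format as above):

import time, codecs


class HomePipeline(object):
    def open_spider(self, spider):
        today = time.strftime('%Y%m%d', time.localtime())
        self.f = codecs.open(today + '.txt', 'a+', 'utf8')   # opened once per crawl

    def close_spider(self, spider):
        self.f.close()                                        # closed once at the end

    def process_item(self, item, spider):
        self.f.write("%s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \t %s \r\n" % (
            item['hkind'], item['area'], item['cxiang'], item['louceng'],
            item['zhuangsi'], item['jtime'], item['jiegou'], item['qumin'],
            item['chengqu'], item['zprice'], item['preprice']))
        return item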

settings

# -*- coding: utf-8 -*-

# Scrapy settings for home project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'home'

SPIDER_MODULES = ['home.spiders']
NEWSPIDER_MODULE = 'home.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'home.middlewares.HomeSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'home.middlewares.HomeDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'home.pipelines.HomePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
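
With the project configured, the crawl can be started from the project directory using the spider name defined above:

scrapy crawl example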

Crawl results

[Screenshot: crawl results]
Me: second-hand homes are only for looking at, not for dreaming about.
