Distributed Crawler in Practice: Fangtianxia (fang.com), Part 4

In the previous section we essentially finished a single-machine crawler. The complete code for each module is given below:

ftx.py

# -*- coding: utf-8 -*-
import scrapy
import re
from fang.items import NewHouseItem
from fang.items import EsfItem


class FtxSpider(scrapy.Spider):
    name = 'ftx'
    allowed_domains = ['fang.com']
    start_urls = ['http://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        # Grab every <tr> row of the province/city table
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None  # start with no province; it is filled in as the rows are parsed
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            province_text = re.sub(r'\s', "", province_text)
            if province_text:
                province = province_text  # a non-empty cell starts a new province
            # Skip the overseas ("其它") section
            if province == "其它":
                continue

            city_td = tds[1]
            city_links = city_td.xpath(".//a")
            for city_link in city_links:
                city = city_link.xpath(".//text()").get()  # city name

                city_url = city_link.xpath("./@href").get()  # city link
                url_module = city_url.split(".")
                scheme = url_module[0]
                domain = url_module[1]
                last = url_module[2]
                if "bj" in scheme:  # Beijing has fixed new-house and second-hand URLs
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    esf_url = "https://esf.fang.com/"
                else:
                    # build the city's new-house listing URL
                    newhouse_url = scheme + ".newhouse." + domain + "." + last + "house/s/"
                    # build the city's second-hand (esf) listing URL
                    esf_url = scheme + ".esf." + domain + '.' + last
                # hand the new-house listing page to parse_newhouse
                yield scrapy.Request(
                    url=response.urljoin(newhouse_url),
                    callback=self.parse_newhouse,
                    meta={"info": (province, city)}
                )

                # hand the second-hand listing page to parse_esf
                yield scrapy.Request(
                    url=response.urljoin(esf_url),
                    callback=self.parse_esf,
                    meta={"info": (province, city)}
                )

    def parse_newhouse(self, response):
        province, city = response.meta.get("info")

        lis = response.xpath("//div[contains(@class, 'nl_con')]/ul//li")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a//text()").get()
            if name is None:  # skip entries without a project name
                continue
            name = name.strip()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r'\s|广告', '', price)
            sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
            address = li.xpath(".//div[@class='address']/a/@title").get()
            house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
            rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
            area = "".join(li.xpath(".//div[contains(@class, 'house_type')]/text()").getall())
            area = re.sub(r'\s|-|/', "", area)
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = "https:" + origin_url
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())

            district = re.search(r'.*?\[(.*?)\].*?', district_text)
            if district:
                district = district.group(1)

            item = NewHouseItem(
                area=area, rooms=rooms, address=address, origin_url=origin_url,
                name=name, sale=sale, price=price, province=province,
                city=city, district=district
            )
            print(item)
            yield item
        next_page_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_page_url:
            yield scrapy.Request(url=response.urljoin(next_page_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})

    def parse_esf(self, response):
        province, city = response.meta.get("info")
        dls = response.xpath("//div[contains(@class, 'shop_list')]/dl")
        for dl in dls:
            # create a fresh item for every listing so fields from the previous
            # listing do not leak into the next one
            item = EsfItem(province=province, city=city)
            item["name"] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "㎡" in info:
                    item["area"] = info
                elif "层" in info:
                    item["floor"] = info
                elif "向" in info:
                    item["toward"] = info
                elif "年" in info:
                    item["year"] = info
            price_str = "".join(dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall())
            item["unit"] = "".join(dl.xpath(".//dd[@class='price_right']/span[2]//text()").getall())
            item["price"] = re.sub(r"\s", "", price_str)
            item["address"] = dl.xpath(".//p[@class='add_shop']/span//text()").get()
            detail_url = dl.xpath(".//dt[@class='floatl']/a/@href").get()
            item["origin_url"] = response.urljoin(detail_url)
            print(item)
            yield item

        url = response.xpath("//div[@id='list_D10_15']/p[1]/a/@href").get()
        if url:
            next_url = response.urljoin(url)
            yield scrapy.Request(url=next_url, callback=self.parse_esf,
                                 meta={"info": (province, city)})

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NewHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # price
    price = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # detail page URL
    origin_url = scrapy.Field()
    # address
    address = scrapy.Field()
    # project name
    name = scrapy.Field()
    # sale status (on sale or not)
    sale = scrapy.Field()
    # room layouts (e.g. 三居)
    rooms = scrapy.Field()
    # administrative district
    district = scrapy.Field()

class EsfItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # residential community name
    name = scrapy.Field()
    # address
    address = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # total price
    price = scrapy.Field()
    # unit price (per square metre)
    unit = scrapy.Field()
    # year built
    year = scrapy.Field()
    # orientation
    toward = scrapy.Field()
    # layout (rooms and halls)
    rooms = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # detail page URL
    origin_url = scrapy.Field()

middlewares.py is unchanged from the previous sections.
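
For readers who do not have the earlier installments at hand: the commented-out DOWNLOADER_MIDDLEWARES entry in the settings below refers to a class named UserAgentDownloadMiddleware. A minimal sketch of what such a random User-Agent downloader middleware typically looks like is shown here; the exact code and UA strings in the earlier section may differ, so treat this as an assumption rather than the original:

import random


class UserAgentDownloadMiddleware(object):
    # a small pool of desktop User-Agent strings; extend as needed
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
    ]

    def process_request(self, request, spider):
        # attach a randomly chosen User-Agent to every outgoing request
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)

Remember that it only takes effect if the DOWNLOADER_MIDDLEWARES block in settings.py is uncommented.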

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonLinesItemExporter

from fang.items import NewHouseItem, EsfItem


class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open("newhouse.json", "wb")
        self.esfhouse_fp = open("esfhouse.json", "wb")
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # route each item to its own file so new-house and second-hand data stay separate
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, EsfItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
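
JsonLinesItemExporter writes one JSON object per line, so the two output files can later be processed line by line without loading everything into memory; the exporter writes bytes, which is why both files are opened in binary mode ("wb") above.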

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for fang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'fang'

SPIDER_MODULES = ['fang.spiders']
NEWSPIDER_MODULE = 'fang.spiders'
LOG_LEVEL = "WARNING"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'fang.middlewares.FangSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'fang.middlewares.UserAgentDownloadMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'fang.pipelines.FangPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Copy and paste the code above and the crawler is ready to run. The next section explains how to turn this single-machine crawler into a distributed one.
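
To start the crawler, run scrapy crawl ftx from the project root (the directory containing scrapy.cfg); "ftx" is the name attribute defined on the spider. If you prefer launching it from an IDE, a small runner script is a common convenience (this start.py is not part of the original project, just a suggestion):

# start.py - optional helper so the spider can be started from an IDE
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "ftx"])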
