房天下全国658个城市新房、二手房爬取

房天下北京二手房分布式抓取:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider


class LianjiaSpider(RedisCrawlSpider):
    """Distributed crawl of second-hand listings on esf.fang.com.

    Built on scrapy-redis' ``RedisCrawlSpider``; seed URLs are expected to be
    supplied through the Redis key named by ``redis_key`` rather than a
    ``start_urls`` list.  Listing-index pages are followed for more links and
    every ``/chushou/`` detail page is handed to ``parse_detail``.

    NOTE: the class name is kept as-is for compatibility even though the
    target site is fang.com.
    """

    name = 'ftx'
    allowed_domains = ['esf.fang.com']
    redis_key = 'ftx'

    # Raw strings so that ``\d`` reaches the regex engine untouched, and
    # escaped dots so that they only match a literal ".".
    rules = (
        # Paginated index pages: no callback, only followed for links.
        Rule(LinkExtractor(allow=r'https://esf\.fang\.com/house/i\d+/')),
        # Listing detail pages: parsed and also followed.
        Rule(LinkExtractor(allow=r'https://esf\.fang\.com/chushou/.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        """Extract the title, price and area text from a detail page.

        Args:
            response: the downloaded detail-page response.

        Returns:
            dict with the keys ``title``, ``price`` and ``area``.  A field
            whose node is missing from the page is returned as ``''``
            instead of aborting the whole callback.
        """
        def first_text(query):
            # extract_first() yields None when nothing matches; fall back to
            # '' so the string clean-up below cannot raise AttributeError.
            text = response.xpath(query).extract_first(default='')
            return text.strip().replace('\r\n', '')

        item = {}
        item['title'] = first_text('//*[@class="title floatl"]/text()')
        item['price'] = response.xpath('string(//div[@class="trl-item_top"]/div[1])').extract_first()
        item['area'] = first_text('//div[@class="tt"]/text()')
        return item

 

ftx_spider:从全国城市主页开始遍历各个城市,提取各城市新房、二手房的 url,并解析各个城市新房、二手房的基本信息

import scrapy,re
from fangtx.items import NewHouseItem,ESFHouseItem

# https://www.cnblogs.com/derek1184405959/p/9446544.html


class FtxSpider(scrapy.Spider):
    """Crawl new-build and second-hand listings for the cities listed on fang.com.

    ``parse`` walks the nationwide city index and schedules one new-house and
    one second-hand request per city; ``parse_newhouse`` and ``parse_esf``
    yield one item per listing and follow the pagination links.
    """

    name = 'ftx'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        """Walk the city index table and schedule the per-city listing pages.

        Yields:
            scrapy.Request: for every city, one request handled by
            ``parse_newhouse`` and one handled by ``parse_esf``; both carry
            ``meta['info'] = (province, city_name)``.
        """
        province = None
        for tr in response.xpath('//div[@class="outCont"]//tr'):
            tds = tr.xpath('.//td[not(@class)]')
            if len(tds) < 2:
                # Row lacks the province cell and/or the city cell.
                continue

            # The province cell may be blank (whitespace only); in that case
            # the row belongs to the most recently seen province.
            province_text = re.sub(r'\s', '', tds[0].xpath('.//text()').get() or '')
            if province_text:
                province = province_text
            # Skip overseas cities.
            if province == '其它':
                continue

            for city in tds[1].xpath('.//a'):
                city_name = city.xpath('.//text()').extract_first()
                city_url = city.xpath('.//@href').extract_first()
                if not city_url:
                    continue
                # Make the link absolute so a scheme is always present.
                url_module = response.urljoin(city_url.strip()).split('//')
                if len(url_module) < 2:
                    continue
                scheme = url_module[0]     # e.g. "http:"
                domain = url_module[1]     # e.g. "cq.fang.com/"
                if not domain.endswith('/'):
                    # The concatenation below relies on a trailing slash.
                    domain += '/'

                # Compare the first host label rather than searching for the
                # substring 'bj' anywhere in the host.
                if domain.split('.')[0] == 'bj':
                    # Beijing uses un-prefixed hosts (no leading whitespace
                    # in the URL literals).
                    newhouse_url = 'http://newhouse.fang.com/house/s/'
                    esf_url = 'http://esf.fang.com/'
                else:
                    # New-house listing URL.
                    newhouse_url = scheme + '//' + "newhouse." + domain + "house/s/"
                    # Second-hand listing URL.
                    esf_url = scheme + '//' + "esf." + domain + "house/s/"

                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={'info': (province, city_name)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={'info': (province, city_name)})

    def parse_newhouse(self, response):
        """Parse one page of new-house listings.

        Yields:
            NewHouseItem: one per listing that has a name.
            scrapy.Request: the next result page, when a "next" link exists.
        """
        province, city = response.meta.get('info')
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            name = li.xpath(".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()").get()
            if not name:
                # Entries without a name are not treated as listings.
                continue
            name = re.sub(r"\s", "", name)
            # Room layouts: keep only the entries ending in "居".
            house_types = [
                re.sub(r"\s", "", text)
                for text in li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            ]
            rooms = [text for text in house_types if text.endswith("居")]
            # Floor area.
            area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # Address: drop the "请选择" placeholder as a whole phrase (a
            # character class would strip the three characters one by one
            # wherever they occur) and tolerate a missing @title.
            address = li.xpath(".//div[@class='address']/a/@title").get() or ''
            address = address.replace("请选择", "")
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)
            # Detail-page URL, made absolute like the one in parse_esf.
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            if origin_url:
                origin_url = response.urljoin(origin_url)

            # "provice" is the keyword used by the original item construction.
            yield NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                               sale=sale, price=price, origin_url=origin_url,
                               provice=province, city=city)

        # Next page.
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    def parse_esf(self, response):
        """Parse one page of second-hand listings.

        Yields:
            ESFHouseItem: one per listing that has a name.
            scrapy.Request: the page linked from the pager, when present.
        """
        province, city = response.meta.get('info')
        for dl in response.xpath("//div[@class='shop_list shop_list_4']/dl"):
            name = dl.xpath(".//span[@class='tit_shop']/text()").get()
            if not name:
                continue
            item = ESFHouseItem(provice=province, city=city)
            infos = [
                re.sub(r"\s", "", text)
                for text in dl.xpath(".//p[@class='tel_shop']/text()").getall()
            ]
            # Each fragment is classified by the marker character it contains.
            for info in infos:
                if "厅" in info:
                    item["rooms"] = info
                elif '层' in info:
                    item["floor"] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif '年建' in info:
                    item['year'] = info.replace("年建", "")
            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            # Total price.
            item['price'] = "".join(dl.xpath(".//span[@class='red']//text()").getall())
            # Unit price.
            item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
            item['name'] = name
            detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            # Without a link there is no detail URL; do not fall back to the
            # listing page's own URL.
            item['origin_url'] = response.urljoin(detail) if detail else None
            yield item

        # Next page.
        # NOTE(review): this takes the first link inside the pager; confirm
        # against the live markup that it is the "next page" link.
        next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})

 

 

二手房抓取:房天下、链家、安居客……并进行数据分析与挖掘

数据的抓取:

# Request headers sent with every page fetch: a desktop Chrome User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.84 Safari/537.36"
    ),
}

def detail_html(url):
    """Fetch one second-hand listing page and print a summary of each listing.

    Relies on the module-level names ``requests``, ``etree`` (presumably
    ``lxml.etree`` -- confirm the import) and ``headers``.

    Args:
        url: address of a listing-index page.

    Returns:
        list of dict: one entry per listing with the keys ``name``,
        ``style``, ``price``, ``place`` and ``house_name``.  ``name``,
        ``place`` and ``house_name`` are ``None`` when the page has no
        matching node.  Each dict is also printed, as before.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.content.decode('gbk')
    tree = etree.HTML(html)

    items = []
    for listing in tree.xpath('//*[@class="shop_list shop_list_4"]/dl'):
        name = listing.xpath('.//*[@class="clearfix"]/a/@title')
        place = listing.xpath('.//p[@class="add_shop"]//span/text()')
        house_name = listing.xpath('.//p[@class="add_shop"]/a/@title')
        item = {
            'name': name[0] if name else None,
            'style': listing.xpath('string(.//p[@class="tel_shop"])').strip().replace('\r\n', '').replace(' ', ''),
            'price': listing.xpath('string(.//span[@class="red"])'),
            # Each list is guarded by its own emptiness, not by ``name``'s.
            'place': place[0] if place else None,
            'house_name': house_name[0] if house_name else None,
        }
        print(item)
        items.append(item)
    return items

def main(first_page=1, last_page=100, city='hz'):
    """Scrape a range of second-hand listing pages for one city.

    Args:
        first_page: first page number to fetch (inclusive).
        last_page: last page number to fetch (inclusive).
        city: city sub-domain prefix used in the URL.

    The defaults reproduce the original behaviour: pages 1 to 100 under the
    ``hz`` sub-domain.
    """
    for page in range(first_page, last_page + 1):
        url = 'https://{}.esf.fang.com/house/i3{}/'.format(city, page)
        detail_html(url)


if __name__ == '__main__':
    main()

数据分析:

待续……

 

 

 

你可能感兴趣的:(爬虫,数据分析)