Distributed crawl of Beijing second-hand listings on Fang.com (房天下):
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class FtxEsfSpider(RedisCrawlSpider):
    name = 'ftx'
    allowed_domains = ['esf.fang.com']
    # start_urls is replaced by a Redis list; seed it with: lpush ftx https://esf.fang.com/
    redis_key = 'ftx'

    rules = (
        # listing pages (pagination)
        Rule(LinkExtractor(allow=r'https://esf\.fang\.com/house/i\d+/')),
        # listing detail pages
        Rule(LinkExtractor(allow=r'https://esf\.fang\.com/chushou/.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        item = {}
        # extract_first('') avoids crashing on pages where a field is missing
        item['title'] = response.xpath('//*[@class="title floatl"]/text()') \
            .extract_first('').strip().replace('\r\n', '')
        item['price'] = response.xpath('string(//div[@class="trl-item_top"]/div[1])').extract_first('')
        item['area'] = response.xpath('//div[@class="tt"]/text()') \
            .extract_first('').strip().replace('\r\n', '')
        return item
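For this spider to actually consume URLs from Redis, the project settings have to route scheduling and de-duplication through scrapy-redis. A minimal settings.py sketch (the Redis address is an assumption for a local setup):

# settings.py -- minimal scrapy-redis wiring
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # requests are queued in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # fingerprints shared across workers
SCHEDULER_PERSIST = True                                     # keep queue/fingerprints between runs
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,             # optional: push items into Redis too
}
REDIS_HOST = '127.0.0.1'                                     # assumption: local Redis
REDIS_PORT = 6379

With that in place, each worker runs scrapy crawl ftx and blocks until the start URL is pushed into the queue, e.g. redis-cli lpush ftx https://esf.fang.com/.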
ftx_spider: start from the nationwide city index page, walk the city list to extract each city's URLs, then parse the basic new-house and second-hand-house listings per city:
import re

import scrapy

from fangtx.items import NewHouseItem, ESFHouseItem

# Reference: https://www.cnblogs.com/derek1184405959/p/9446544.html


class FtxSpider(scrapy.Spider):
    name = 'ftx'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        trs = response.xpath('//div[@class="outCont"]//tr')
        province = None
        for tr in trs:
            tds = tr.xpath('.//td[not(@class)]')
            province_td = tds[0]
            province_text = province_td.xpath('.//text()').get() or ''
            province_text = re.sub(r'\s', '', province_text)
            # the province cell is only filled on the first row of each group
            if province_text:
                province = province_text
            # skip the overseas group
            if province == '其它':
                continue
            city_td = tds[1]
            city_links = city_td.xpath('.//a')
            for city in city_links:
                city_name = city.xpath('.//text()').extract_first()
                city_url = city.xpath('.//@href').extract_first()
                url_module = city_url.split('//')
                scheme = url_module[0]   # e.g. 'http:'
                domain = url_module[1]   # e.g. 'cq.fang.com/'
                if 'bj' in domain:
                    # Beijing uses dedicated hosts instead of a city subdomain
                    newhouse_url = 'http://newhouse.fang.com/house/s/'
                    esf_url = 'http://esf.fang.com/'
                else:
                    # new-house listing URL
                    newhouse_url = scheme + '//' + 'newhouse.' + domain + 'house/s/'
                    # second-hand listing URL
                    esf_url = scheme + '//' + 'esf.' + domain + 'house/s/'
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={'info': (province, city_name)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={'info': (province, city_name)})
    def parse_newhouse(self, response):
        # new-house listings
        province, city = response.meta.get('info')
        lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
        for li in lis:
            name = li.xpath(".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()").get()
            if not name:
                continue
            name = re.sub(r"\s", "", name)
            # rooms, e.g. ['2居', '3居']
            house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            house_type_list = [re.sub(r"\s", "", x) for x in house_type_list]
            rooms = [x for x in house_type_list if x.endswith("居")]
            # floor area
            area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # address; strip the '请选择' placeholder characters
            address = li.xpath(".//div[@class='address']/a/@title").get() or ''
            address = re.sub(r"[请选择]", "", address)
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)
            # detail-page URL
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            yield NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                               sale=sale, price=price, origin_url=origin_url,
                               province=province, city=city)
        # next page
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})
    def parse_esf(self, response):
        # second-hand listings
        province, city = response.meta.get('info')
        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            name = dl.xpath(".//span[@class='tit_shop']/text()").get()
            if not name:
                continue
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = [re.sub(r"\s", "", x) for x in infos]
            # the info fields come in no fixed order, so dispatch on keywords
            for info in infos:
                if "厅" in info:
                    item["rooms"] = info
                elif "层" in info:
                    item["floor"] = info
                elif "向" in info:
                    item["toward"] = info
                elif "㎡" in info:
                    item["area"] = info
                elif "年建" in info:
                    item["year"] = re.sub("年建", "", info)
            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            # total price
            item['price'] = "".join(dl.xpath(".//span[@class='red']//text()").getall())
            # price per square metre
            item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
            item['name'] = name
            detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            item['origin_url'] = response.urljoin(detail)
            yield item
        # next page
        next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
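NewHouseItem and ESFHouseItem live in fangtx/items.py, which is not shown here; a minimal sketch consistent with the fields the spider fills (reconstructed, not the original file) would be:

# fangtx/items.py -- sketch reconstructed from the fields used by FtxSpider
import scrapy


class NewHouseItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    name = scrapy.Field()
    rooms = scrapy.Field()      # list like ['2居', '3居']
    area = scrapy.Field()
    address = scrapy.Field()
    sale = scrapy.Field()       # sale status, e.g. '在售'
    price = scrapy.Field()
    origin_url = scrapy.Field()


class ESFHouseItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    name = scrapy.Field()
    rooms = scrapy.Field()
    floor = scrapy.Field()
    toward = scrapy.Field()     # orientation, e.g. '南北向'
    area = scrapy.Field()
    year = scrapy.Field()       # year built
    address = scrapy.Field()
    price = scrapy.Field()      # total price
    unit = scrapy.Field()       # price per ㎡
    origin_url = scrapy.Field()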
Second-hand listings scraped from Fang.com (房天下), Lianjia (链家), Anjuke (安居客), etc., for data analysis and mining.
Data scraping:
import requests
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

def detail_html(url):
    response = requests.get(url, headers=headers)
    html = response.content.decode('gbk')   # esf.fang.com pages are GBK-encoded
    result = etree.HTML(html)
    title_list = result.xpath('//*[@class="shop_list shop_list_4"]/dl')
    for title in title_list:
        item = {}
        name = title.xpath('.//*[@class="clearfix"]/a/@title')
        item['name'] = name[0] if len(name) > 0 else None
        item['style'] = title.xpath('string(.//p[@class="tel_shop"])').strip().replace('\r\n', '').replace(' ', '')
        item['price'] = title.xpath('string(.//span[@class="red"])')
        place = title.xpath('.//p[@class="add_shop"]//span/text()')
        item['place'] = place[0] if len(place) > 0 else None
        house_name = title.xpath('.//p[@class="add_shop"]/a/@title')
        item['house_name'] = house_name[0] if len(house_name) > 0 else None
        print(item)

def main():
    # listing pages are paginated as .../house/i31/, .../house/i32/, ...
    for i in range(1, 101):
        url = 'https://hz.esf.fang.com/house/i3{}/'.format(i)
        detail_html(url)

if __name__ == '__main__':
    main()
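Since the analysis step below is still pending, the scraped dicts currently only get printed. A small sketch for persisting them to CSV so they can be loaded later (the file name and field list are assumptions; detail_html would return its item dicts instead of printing them):

# persist scraped item dicts to CSV for later analysis (file name is an assumption)
import csv

FIELDS = ['name', 'style', 'price', 'place', 'house_name']

def save_items(items, path='esf_hz.csv'):
    # utf-8-sig so the Chinese text opens cleanly in Excel
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        writer.writeheader()
        writer.writerows(items)

main() would then collect the lists returned by detail_html across pages and call save_items once at the end.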
Data analysis:
To be continued...