In the previous section we collected the new-house and second-hand-house links for every city and handed each one to its corresponding callback. In this section we will parse the content of the new-house and second-hand-house pages.
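As a quick recap, those requests carried the `(province, city)` pair along via `meta`, which is how the callbacks below recover it. A sketch of the hand-off (the variable names `newhouse_url` and `esf_url` are assumed from the previous section):

```python
# Sketch: each request carries (province, city) in meta so that the
# callback can attach the location to every item it produces.
yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                     meta={"info": (province, city)})
yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                     meta={"info": (province, city)})
```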
First, open a new-house listing page.
In items.py, define the fields we want to scrape.
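Based on the fields used in the parsing code below, a minimal sketch of items.py could look like this (the field names come straight from the code; the comments are my reading of each field):

```python
# items.py -- field definitions inferred from the parse code below
import scrapy


class NewHouseItem(scrapy.Item):
    province = scrapy.Field()    # province the city belongs to
    city = scrapy.Field()        # city name
    name = scrapy.Field()        # name of the development
    price = scrapy.Field()       # listed price
    rooms = scrapy.Field()       # layouts on offer, e.g. ["3居", "4居"]
    area = scrapy.Field()        # floor-area range
    address = scrapy.Field()     # street address
    district = scrapy.Field()    # district, taken from the [...] in the address
    sale = scrapy.Field()        # sale status, e.g. 在售
    origin_url = scrapy.Field()  # detail-page URL


class EsfItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    name = scrapy.Field()        # name of the complex
    rooms = scrapy.Field()       # layout, e.g. 3室2厅
    area = scrapy.Field()        # floor area
    floor = scrapy.Field()       # floor description
    toward = scrapy.Field()      # orientation
    year = scrapy.Field()        # year built
    address = scrapy.Field()
    price = scrapy.Field()       # total price
    unit = scrapy.Field()        # price per square metre
    origin_url = scrapy.Field()  # detail-page URL
```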
Then go back to the new-house page, press F12 to inspect the page structure, and use XPath to extract the information we need. Here is the code first:
```python
# Assumed at the top of the spider file:
#   import re
#   import scrapy
#   from fang.items import NewHouseItem, EsfItem  # "fang" is the assumed project name

def parse_newhouse(self, response):
    # (province, city) was attached to the request via meta in the previous section
    province, city = response.meta.get("info")
    lis = response.xpath("//div[contains(@class, 'nl_con')]/ul//li")
    for li in lis:
        name = li.xpath(".//div[@class='nlcd_name']/a//text()").get()
        if name is None:
            continue  # skip ad <li> elements that have no development name
        name = name.strip()
        price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
        price = re.sub(r'\s|广告', '', price)  # drop whitespace and the "广告" (ad) label
        sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
        address = li.xpath(".//div[@class='address']/a/@title").get()
        house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
        house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
        rooms = list(filter(lambda x: x.endswith("居"), house_type_list))  # keep "3居" etc.
        area = "".join(li.xpath(".//div[contains(@class, 'house_type')]/text()").getall())
        area = re.sub(r'\s|-|/', "", area)
        origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
        origin_url = "https:" + origin_url  # hrefs on the page are protocol-relative
        district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
        # the district name sits inside square brackets, e.g. "[昌平]..."
        district = re.search(r'.*?\[(.*?)\].*?', district_text)
        if district:
            district = district.group(1)
        item = NewHouseItem(
            area=area, rooms=rooms, address=address, origin_url=origin_url,
            name=name, sale=sale, price=price, province=province,
            city=city, district=district
        )
        print(item)
        yield item  # without this yield, the pipeline never receives the item
```
Many of these values come back with stray whitespace and junk characters mixed in, so we can use re.sub from the regular-expression module to strip them out and keep only the useful text (a short standalone illustration follows). The second-hand-house parser works the same way as the new-house one, so I won't repeat the explanation; its code comes right after.
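Here is that cleanup in isolation (the sample string is made up):

```python
import re

raw = "  1.2万/㎡  广告  "
# \s matches any whitespace; the alternation also deletes the literal "广告" (ad) tag
print(re.sub(r"\s|广告", "", raw))  # -> 1.2万/㎡
```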
```python
def parse_esf(self, response):
    province, city = response.meta.get("info")
    dls = response.xpath("//div[contains(@class, 'shop_list')]/dl")
    for dl in dls:
        # build a fresh item for every listing; reusing a single item across
        # the loop would leak fields from one listing into the next
        item = EsfItem(province=province, city=city)
        item["name"] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
        infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
        infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
        # the info fragments come in no fixed order, so classify each one by a
        # character unique to its kind (厅 layout, ㎡ area, 层 floor, 向 orientation, 年 year)
        for info in infos:
            if "厅" in info:
                item["rooms"] = info
            elif "㎡" in info:
                item["area"] = info
            elif "层" in info:
                item["floor"] = info
            elif "向" in info:
                item["toward"] = info
            elif "年" in info:
                item["year"] = info
        price_str = "".join(dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall())
        item["unit"] = "".join(dl.xpath(".//dd[@class='price_right']/span[2]//text()").getall())
        item["price"] = re.sub(r"\s", "", price_str)
        item["address"] = dl.xpath(".//p[@class='add_shop']/span//text()").get()
        detail_url = dl.xpath(".//dt[@class='floatl']/a/@href").get()
        item["origin_url"] = response.urljoin(detail_url)
        print(item)
        yield item
    # follow the next-page link, passing (province, city) along again
    url = response.xpath("//div[@id='list_D10_15']/p[1]/a/@href").get()
    if url:
        yield scrapy.Request(url=response.urljoin(url), callback=self.parse_esf,
                             meta={"info": (province, city)})
```
At this point the spider code is essentially finished. Let's save the data in a pipeline so we can inspect the results.
We save the data as JSON; the code in pipelines.py is:
```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter

from fang.items import NewHouseItem  # "fang" is the assumed project name


class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open("newhouse.json", "wb")
        self.esfhouse_fp = open("esfhouse.json", "wb")
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        # route each item to its own file by type; exporting every item to
        # both files would mix the two datasets together
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
```
Enable the pipeline in settings.py (registration shown below), then run the spider and check the resulting local files.
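Registering the pipeline looks like this (the module path assumes the project is named fang, matching the FangPipeline class above):

```python
# settings.py
ITEM_PIPELINES = {
    "fang.pipelines.FangPipeline": 300,
}
```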
Here is the second-hand-house data:
And here is the new-house data: