1.items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pic = scrapy.Field()
    title = scrapy.Field()
    detail_url = scrapy.Field()
    price = scrapy.Field()
    publish_info = scrapy.Field()
    pic_list = scrapy.Field()
    house_code = scrapy.Field()
    ucid = scrapy.Field()
    agent_name = scrapy.Field()
    agent_phone = scrapy.Field()
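
A LianjiaItem behaves like a dict restricted to the declared fields; a quick sketch (the values are made up for illustration):

from LianJia.items import LianjiaItem

item = LianjiaItem()
item["title"] = "example title"   # declared field: works like a normal dict key
item["price"] = "5500"
print(dict(item))                 # {'title': 'example title', 'price': '5500'}
# item["city"] = "bj"             # undeclared field -> KeyError, which catches typos early
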
2.lianjia.py
# -*- coding: utf-8 -*-
import scrapy
import re
import json

from LianJia.items import LianjiaItem


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']

    def parse(self, response):
        # The city list page links to each city's new-home site
        city_url_list = response.xpath("//div[@class='city_province']//li/a/@href").extract()
        city_name_list = response.xpath("//div[@class='city_province']//li/a/text()").extract()
        for city_name, city_url in zip(city_name_list, city_url_list):
            # City subdomain prefix, e.g. "bj" for Beijing
            city_alp = re.findall(r"https://(\w+)\.", city_url)[0]
            # Rebuild the url to point at the city's rental listings
            city_url = "https://" + city_alp + ".lianjia.com/zufang/"
            # print("-------------------- start crawling {} -------------------------------".format(city_name))
            yield scrapy.Request(url=city_url, callback=self.get_area_url)
    def get_area_url(self, response):
        # District urls (position()>1 skips the first "all districts" option)
        area_url_list = response.xpath("//li[@data-type='district'][position()>1]/a/@href").extract()
        base_url = re.findall(r"(.*)/zufang/", response.url)[0]
        for area_url in area_url_list:
            yield scrapy.Request(url=base_url + area_url, callback=self.get_business_url)

    def get_business_url(self, response):
        # Business-circle urls within the district (again skipping the first "all" option)
        business_url_list = response.xpath("//li[@data-type='bizcircle'][position()>1]/a/@href").extract()
        base_url = re.findall(r"(.*)/zufang/", response.url)[0]
        for business_url in business_url_list:
            yield scrapy.Request(url=base_url + business_url, callback=self.get_page_url)
    def get_page_url(self, response):
        # Read the maximum page number for this business circle
        max_page = response.xpath("//div[@class='content__pg']/@data-totalpage").extract()
        max_page = int(max_page[0]) if max_page else 0
        # Build one url per page; when max_page is 0 the loop body never runs
        for page in range(max_page):
            page_url = response.url + "pg{}/#contentList".format(page + 1)
            yield scrapy.Request(url=page_url, callback=self.get_page_data)
    def get_page_data(self, response):
        # Narrow the scope to one listing card per iteration
        fang_xml_list = response.xpath("//div[@class='content__list']/div")
        for fang_xml in fang_xml_list:
            # Cover image (lazy-loaded, so the url lives in data-src)
            pic = fang_xml.xpath(".//img/@data-src").extract()
            pic = pic[0] if pic else ''
            # Title
            title = fang_xml.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0].strip()
            # Detail url is relative, so resolve it against the current city's domain
            detail_url = fang_xml.xpath(".//p[@class='content__list--item--title twoline']/a/@href").extract()[0]
            detail_url = response.urljoin(detail_url)
            # Price
            price = fang_xml.xpath(".//em/text()").extract()[0]
            item = LianjiaItem()
            item["pic"] = pic
            item["title"] = title
            item["detail_url"] = detail_url
            item["price"] = price
            yield scrapy.Request(url=detail_url, callback=self.get_detail_data, meta={"data": item}, dont_filter=True)
    def get_detail_data(self, response):
        item = response.meta["data"]
        # Publish info (the <li> whose text contains '发布')
        publish_info = response.xpath("//ul/li[contains(text(), '发布')]/text()").extract()
        publish_info = publish_info[0] if publish_info else ''
        # Photo gallery urls
        pic_list = response.xpath("//ul[@class='content__article__slide__wrapper']/div/img/@data-src").extract()
        # House code, taken from the detail-page url
        house_code = re.findall(r"/zufang/(.*?)\.html", response.url)[0]
        # ucid identifies the agent in the brokers api
        ucid = response.xpath("//span[@class='contact__im im__online']/@data-im_id").extract()
        # Listings without an online agent contact are skipped here
        if ucid:
            ucid = ucid[0]
            # Build the agent (brokers) api url on the same city subdomain as the detail page
            brokers_url = re.findall(r"(https://.*?)/zufang/", response.url)[0] + "/zufang/aj/house/brokers?"
            agent_api = brokers_url + "house_codes={}&position=bottom&ucid={}".format(house_code, ucid)
            item["publish_info"] = publish_info
            item["pic_list"] = pic_list
            item["house_code"] = house_code
            item["ucid"] = ucid
            yield scrapy.Request(url=agent_api, callback=self.get_agent_data, meta={"data": item}, dont_filter=True)
    # Agent information
    def get_agent_data(self, response):
        # Parse the json body of the brokers api response
        json_data = json.loads(response.body.decode("utf-8"))
        item = response.meta["data"]
        house_code = item.get("house_code")
        # The payload nests the house code twice: data -> house_code -> house_code -> fields
        agent_info = json_data.get("data").get(house_code).get(house_code)
        # Agent name
        agent_name = agent_info.get("contact_name")
        # Agent phone number
        agent_phone = agent_info.get("tp_number")
        item["agent_name"] = agent_name
        item["agent_phone"] = agent_phone
        yield item
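
For reference, the way get_agent_data indexes the payload implies a brokers-api response shaped roughly like the sketch below. The structure is inferred from the parsing code, not from any official documentation, and all keys and values here are placeholders:

# Assumed response shape (inferred from get_agent_data; values are placeholders)
assumed_payload = {
    "data": {
        "BJ0000000000000001": {              # outer key: house_code
            "BJ0000000000000001": {          # nested again under the same house_code
                "contact_name": "example agent",   # -> item["agent_name"]
                "tp_number": "400-000-0000",       # -> item["agent_phone"]
            }
        }
    }
}
house_code = "BJ0000000000000001"
agent = assumed_payload["data"][house_code][house_code]
print(agent["contact_name"], agent["tp_number"])
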
3.pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import pymysql
import redis
import json
import time
# class LianjiaPipeline(object):
#     def __init__(self):
#         self.count = 1
#
#     def process_item(self, item, spider):
#         print(self.count, dict(item))
#         self.count += 1
#         return item
# Write items into Redis
class RedisPipeline(object):
    def __init__(self):
        self.count = 1
        self.r = redis.Redis(host="localhost", port=6379, db=3)

    def process_item(self, item, spider):
        item_dict = dict(item)
        print(self.count, item_dict)
        # Serialize the item and push it onto the "lianjia" list
        item_str = json.dumps(item_dict)
        self.r.lpush("lianjia", item_str)
        self.count += 1
        return item
# Write items into MongoDB
class MongodbPipeline(object):
    def __init__(self):
        mongo_client = pymongo.MongoClient("localhost", 27017)
        self.db = mongo_client.lianjia

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.db.lianjia.insert_one(dict(item))
        return item
# Write items into MySQL
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root", password="123456", database="lianjia")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        title = item["title"]
        price = item["price"]
        agent_name = item["agent_name"]
        agent_phone = item["agent_phone"]
        # Unix timestamp, used later for data monitoring
        refresh_time = int(time.time())
        # Parameterized query: the driver handles quoting and escaping
        sql = "insert into fang(title, price, agent_name, agent_phone, refresh_time) values(%s, %s, %s, %s, %s)"
        try:
            self.cursor.execute(sql, (title, price, agent_name, agent_phone, refresh_time))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def __del__(self):
        self.cursor.close()
        self.conn.close()
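
MysqlPipeline assumes the fang table already exists. A minimal one-off setup sketch matching the INSERT columns above; the column types and sizes are assumptions, adjust to your data:

import pymysql

# One-off setup script; column types are assumptions based on the INSERT in MysqlPipeline
conn = pymysql.connect(host="localhost", user="root", password="123456", database="lianjia")
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists fang(
            id int primary key auto_increment,
            title varchar(255),
            price varchar(32),
            agent_name varchar(64),
            agent_phone varchar(32),
            refresh_time int
        ) default charset=utf8mb4
    """)
conn.commit()
conn.close()
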
4.main.py (project launch script)
from scrapy import cmdline

# Run the spider without Scrapy's log output; use the commented line instead to keep the logs
cmdline.execute("scrapy crawl lianjia --nolog".split())
# cmdline.execute("scrapy crawl lianjia".split())
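
None of the pipelines run until they are registered in the project's settings.py. A sketch assuming the package is named LianJia (as in the spider's import); the priority numbers are arbitrary choices, lower values run first:

# settings.py (excerpt); priorities are arbitrary, lower numbers run first
ITEM_PIPELINES = {
    "LianJia.pipelines.RedisPipeline": 300,
    "LianJia.pipelines.MongodbPipeline": 301,
    "LianJia.pipelines.MysqlPipeline": 302,
}
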