This post uses Python with Selenium and Chrome Headless to crawl Ziroom (自如) room listings and stores the results in MongoDB. The trickiest part is each room's price: Ziroom displays it with a single digit image plus an offset for every digit, so the price cannot be read from the page directly. We can still recover it by converting the image to text and then assembling the digits into the price according to the offsets.
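As an illustration of that decoding step, here is a minimal sketch; decode_price and its arguments are names introduced here, and the 9 - offset indexing mirrors what the full script does further down:
def decode_price(digits, offset):
    # digits: the ten numbers recognised in the price image, left to right
    # offset: one room's offset list taken from the ROOM_PRICE variable in the page source
    # Each offset entry picks one digit, counted from the right-hand end of the image.
    return "".join(str(digits[9 - o]) for o in offset)

# e.g. decode_price([4, 1, 0, 8, 6, 2, 9, 7, 5, 3], [2, 0, 1, 0]) returns "7353"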
We use Chrome Headless here instead of PhantomJS, mainly because it needs no window size to be set and is more stable; with PhantomJS the values retrieved kept differing from the original element tags. The example crawls '友家' shared rooms along Metro Line 2 in Nanshan District, Shenzhen.
When scraping, prefer reverse-engineering the URLs over parsing dynamically rendered pages whenever possible, since it is far more stable. Clicking through the Ziroom site, you will find that the only part of each listing URL that changes is the metro station name, so appending a station name to a fixed base URL is enough to reach the rooms near any station.
Pagination follows the same pattern: every page other than the first is just a page number appended to the same URL, and we can use this to page through the results, as the sketch below shows.
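For example (a sketch only; listing_url is a helper name introduced here, and STATION is the base URL defined in the config file at the end of this post):
STATION = "http://sz.ziroom.com/z/nl/z2-s2号线%28蛇口线%29-t"

def listing_url(station_name, page=1):
    # Page 1 is base URL + station name + ".html"; later pages just append "?p=<n>".
    url = STATION + station_name + ".html"
    return url if page == 1 else url + '?p=' + str(page)

print(listing_url("后海"))     # first page of rooms near 后海
print(listing_url("后海", 3))  # third page for the same station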
The full code follows; the price-recognition part, i.e. the getNumbers module, is covered in Part 2 of this series, "python爬取自如房间信息(二)":
# -*- coding:utf-8 -*-
import time
import re
import getNumbers
import pymongo
from config import *
from urllib.request import urlretrieve
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
option = webdriver.ChromeOptions()
# Disable image loading to speed up page fetches
prefs = {'profile.managed_default_content_settings.images': 2}
option.add_argument('--headless')
option.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome('/Users/zhangxin/Downloads/chromedriver', chrome_options=option)
wait = WebDriverWait(browser, 10)
def get_picture(stationname, image_url, page):
    """ Download the price image """
    picture_name = stationname + str(page) + 'ziroom.png'
    urlretrieve(url=image_url, filename='./data/' + picture_name)
    return picture_name
def get_file_content(file_path):
    """ Read an image file as raw bytes """
    with open("./data/" + file_path, 'rb') as fp:
        return fp.read()
def get_price(stationname, page=1):
    """ Pull the price sprite image and per-room digit offsets out of the page source """
    html = browser.page_source
    priceImage = re.compile('var ROOM_PRICE = (.*?);').search(html).group(1)
    # The matched text is a JS object literal; eval it into a dict
    priceImage = eval(priceImage)
    imageUrl = 'http:' + priceImage['image']
    imageOffset = priceImage['offset']
    imageFile = get_picture(stationname, imageUrl, page)
    print(imageFile)
    numbers = getNumbers.getNum("./data/" + imageFile)
    print(imageOffset)
    print(numbers)
    return numbers, imageOffset
def get_rooms_info(newHandle, price):
    """
    :param newHandle: handle of the browser window showing one room's detail page
    :param price: the decoded price string for this room
    """
    browser.switch_to.window(newHandle)
    html = browser.page_source
    ## The html carries an xmlns attribute; strip it so the selectors work
    html = html.replace('xmlns="http://www.w3.org/1999/xhtml"', '')
    print('getting the room information')
    #print(html)
    time.sleep(2)
    doc = pq(html)
    detail = doc('.room_detail_right .detail_room')
    ## Drop the <b> tags; this affects text taken via children(), but not text() called directly
    detail('b').remove()
    #print(detail)
    new_detail = detail('ul').children()
    #print(new_detail[0].text)
    imagesUrl = []
    for item in doc('.lof-navigator-outer ul li .lidiv img').items():
        imagesUrl.append(item.attr('src'))
    rooms = {
        'roomname': doc('.room_name h2').text(),
        'images': imagesUrl,
        'price': price,
        'size': re.compile(r'(\d+.*?)').search(new_detail[0].text).group(1),
        'direction': new_detail[1].text,
        'structure': re.sub(r'\s', '', new_detail[2].text),
        'floor': new_detail[3].text,
        'traffic': doc('.room_detail_right .detail_room .last').text().replace('\n', ' ')
    }
    print(rooms)
    save_to_mongo(rooms)
## Crawl every room listed on the current page
def get_rooms(stationname, page=1):
    wait.until(
        EC.presence_of_element_located((By.ID, "houseList"))
    )
    links = browser.find_elements_by_css_selector("#houseList .clearfix .txt h3 a")
    numbers, offsets = get_price(stationname, page)
    # print(links)
    count = 0
    for link in links:
        print(link.get_attribute('href'))
        ## Handle of the current (listing) window
        handle = browser.current_window_handle
        ## Assemble this room's price from the recognised digits and its offsets
        offset = offsets[count]
        price = []
        for i in range(len(offset)):
            price.append(str(numbers[9 - offset[i]]))
        ## Open the room's detail page in a new window
        link.click()
        time.sleep(2)
        ## Handles of all open windows
        handles = browser.window_handles
        for newHandle in handles:
            if newHandle != handle:
                get_rooms_info(newHandle, "".join(price))
                browser.close()
            else:
                continue
        ## Switch back to the listing window
        browser.switch_to.window(handle)
        count += 1
### Pagination
def next_page(stationname, stationUrl, page=1):
    try:
        # print('turning the page:')
        browser.get(stationUrl + '?p=' + str(page))
        print(stationUrl + '?p=' + str(page))
        print("page:", page)
        get_rooms(stationname, page)
    except Exception:
        # Retry the same page on any failure
        next_page(stationname, stationUrl, page)
def get_page(i):
    ### Build the listing URL for the chosen station
    stationUrl = STATION + i + ".html"
    print(stationUrl)
    browser.get(stationUrl)
    # Get the total number of pages
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#page'))
    )
    html = browser.page_source
    pages = re.compile(r'class="next".*?\w(.*?)\w').search(html).group(1)
    print('total pages:', pages)
    get_rooms(i)
    return pages, stationUrl
def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert(result):
            print('Save Success!!!')
    except Exception:
        print('Fail')
if __name__ == "__main__":
    for station in STATIONLIST:
        pages, stationUrl = get_page(station)
        for page in range(2, int(pages) + 1):
            next_page(station, stationUrl, page)
The configuration file, config.py:
MONGO_URL = 'localhost'
MONGO_DB = 'ziroom'
MONGO_TABLE = 'rooms'
SERVICE_ARGS = ['--disk-cache=true']  # PhantomJS service args, not used by the Chrome Headless script above
STATION = "http://sz.ziroom.com/z/nl/z2-s2号线%28蛇口线%29-t"  # base listing URL (Metro Line 2 / 蛇口线)
STATIONLIST = ["海上世界", "水湾", "东角头", "湾厦", "海月", "登良", "后海", "科苑"]  # stations to crawl
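After a run, the stored documents can be spot-checked straight from MongoDB; a minimal sketch reusing the settings above:
import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MONGO_TABLE]

print(collection.count_documents({}))     # how many rooms were saved
for room in collection.find().limit(3):   # look at a few records
    print(room['roomname'], room['price'])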