python爬取自如房间信息(一)

使用python和selenium+Chrome Headless爬取自如房间信息,并将结果存储在MongoDB中。其中最麻烦的应该是每间房的价格,因为自如是用一张图片和offset来显示价格,所以不能直接获得。但我们可以通过将图片转为文字,再通过偏移量将数字组合为价格。

在这里我们使用的是Chrome Headless而不是PhantomJS,主要是因为前者不需要设置窗口size,同时也更加稳定——PhantomJS经常会获取到与页面实际元素标签不一致的值。该例子主要爬取的是深圳市南山区、地铁2号线沿线、类型为友家合租的房间信息。

在爬取信息时,如果能用逆向解析就最好用逆向解析,因为相比于动态解析,前者更加稳定。我们点进自如的网站,会发现各个链接的不同之处只在于地铁站的名字,所以在爬取时只要在固定链接后加上地铁站名,就可以访问不同地铁站附近的房源。

其次是翻页,我们也能发现同样的规律。除了首页,其他页数都是跟在同样的链接后面。我们可以利用这个规律来进行翻页操作。

 

下面是具体的代码。价格的识别部分(也就是 getNumbers 这个文件)可以在《python爬取自如房间信息(二)》中看到。

# -*- coding:utf-8 -*-
import time
import re
import getNumbers
import pymongo
from config import *
from urllib.request import urlretrieve
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# MongoDB connection and the database that scraped rooms are written to.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Chrome preferences; presumably the value 2 blocks image loading to speed
# up page fetches -- TODO confirm against the Chrome preference reference.
prefs = {'profile.managed.default_content_setting.images': 2}

# Headless Chrome configured with the preferences above.
option = webdriver.ChromeOptions()
option.add_experimental_option('prefs', prefs)
option.add_argument("headless")
browser = webdriver.Chrome(
    '/Users/zhangxin/Downloads/chromedriver',
    chrome_options=option,
)

# Shared explicit wait (timeout argument: 10) used by the page-level helpers.
wait = WebDriverWait(browser, 10)

def get_picture(stationname, image_url, page):
    """Download the price image of a listing page into ``./data/``.

    :param stationname: station name, used as the first part of the file name
    :param image_url: URL of the image to download
    :param page: page number, appended to the station name in the file name
    :return: the bare file name (without the ``./data/`` prefix) that the
        image was saved under
    """
    import os

    picture_name = stationname + str(page) + 'ziroom.png'
    # urlretrieve does not create missing directories; make sure the target
    # folder exists so a fresh checkout does not fail with FileNotFoundError.
    os.makedirs('./data', exist_ok=True)
    urlretrieve(url=image_url, filename='./data/' + picture_name)
    return picture_name

def get_file_content(file_path):
    """Read a file stored under ``./data/`` and return its raw bytes.

    :param file_path: file name relative to the ``./data/`` directory
    :return: the complete file content as ``bytes``
    """
    with open("./data/" + file_path, 'rb') as image_file:
        payload = image_file.read()
    return payload


def get_price(stationname, page = 1):
    """Fetch the price image and per-room digit offsets of the current page.

    The page currently loaded in ``browser`` is searched for the inline
    ``var ROOM_PRICE = ...;`` assignment. Its ``image`` entry is downloaded
    through ``get_picture`` and handed to ``getNumbers.getNum``; its
    ``offset`` entry is returned unchanged.

    :param stationname: station name, forwarded to ``get_picture`` for the
        image file name
    :param page: page number, forwarded to ``get_picture``
    :return: tuple ``(numbers, imageOffset)`` -- the result of
        ``getNumbers.getNum`` and the ``offset`` value from the page
    :raises ValueError: if the page has no ``ROOM_PRICE`` assignment
    """
    import ast

    html = browser.page_source
    match = re.search(r'var ROOM_PRICE = (.*?);', html)
    if match is None:
        raise ValueError('ROOM_PRICE definition not found in page source')
    # SECURITY: the original ran eval() on this text, which comes from a
    # remote page and could execute arbitrary code. literal_eval accepts the
    # same dict/list/str/number literals without executing anything.
    priceImage = ast.literal_eval(match.group(1))
    # The page gives a protocol-relative URL, so the scheme is prepended.
    imageUrl = 'http:' + priceImage['image']
    imageOffset = priceImage['offset']
    imageFile = get_picture(stationname, imageUrl, page)
    print(imageFile)
    numbers = getNumbers.getNum("./data/" + imageFile)
    print(imageOffset)
    print(numbers)
    return numbers, imageOffset


def get_rooms_info(newHandle, price):
    """
    Scrape one room's detail page and store the result via ``save_to_mongo``.

    :param newHandle: window handle of the browser tab showing the room
    :param price: the room price, already decoded by the caller
    :return: None; the collected room dict is printed and saved
    """
    browser.switch_to.window(newHandle)
    print('getting the room information')
    # Wait *before* taking the snapshot. The original slept after reading
    # page_source, so the pause could not help the page finish loading.
    time.sleep(2)
    html = browser.page_source
    # The markup carries an xmlns attribute; strip it (presumably so the
    # plain CSS selectors below match -- TODO confirm).
    html = html.replace('xmlns="http://www.w3.org/1999/xhtml"', '')

    doc = pq(html)
    detail = doc('.room_detail_right .detail_room')
    # Drop the <b> elements; per the original note they interfere with
    # reading .text from the children below.
    detail('b').remove()
    new_detail = detail('ul').children()
    imagesUrl = [
        item.attr('src')
        for item in doc('.lof-navigator-outer ul li .lidiv img').items()
    ]
    rooms = {
            'roomname': doc('.room_name h2').text(),
            'images': imagesUrl,
            'price': price,
            # Capture the full number including an optional decimal part;
            # the original pattern '(\d+.*?)' stopped at the integer digits.
            'size': re.search(r'(\d+(?:\.\d+)?)', new_detail[0].text).group(1),
            'direction': new_detail[1].text,
            'structure': re.sub(r'\s', '', new_detail[2].text),
            'floor': new_detail[3].text,
            'traffic': doc('.room_detail_right .detail_room .last').text().replace('\n', ' ')
        }
    print(rooms)
    save_to_mongo(rooms)

## Collect every room on the current listing page
def get_rooms(stationname, page = 1):
    """Visit every room linked from the listing page currently loaded.

    For each room link the price is decoded from the values returned by
    ``get_price``, the link is clicked, and every window other than the
    listing window is passed to ``get_rooms_info`` and then closed.

    :param stationname: station name, forwarded to ``get_price``
    :param page: page number, forwarded to ``get_price``
    :return: None
    """
    wait.until(
        EC.presence_of_element_located((By.ID, "houseList"))
    )
    # Locator form of find_elements; the find_elements_by_* helpers were
    # removed in Selenium 4.
    links = browser.find_elements(By.CSS_SELECTOR, "#houseList .clearfix .txt h3 a")
    numbers, offsets = get_price(stationname, page)
    for index, link in enumerate(links):
        print(link.get_attribute('href'))
        # Remember the listing window so we can return to it.
        handle = browser.current_window_handle
        # Decode this room's price: each offset entry selects one element of
        # `numbers` via index 9 - entry (mapping kept from the original).
        price = "".join(str(numbers[9 - digit]) for digit in offsets[index])
        # Open the room's detail window.
        link.click()
        time.sleep(2)
        for newHandle in browser.window_handles:
            if newHandle == handle:
                continue
            try:
                get_rooms_info(newHandle, price)
            finally:
                # Always close the detail window and restore focus, even if
                # scraping failed; otherwise windows pile up and later calls
                # run against the wrong window.
                browser.switch_to.window(newHandle)
                browser.close()
                browser.switch_to.window(handle)
        # Back to the listing window (also covers the no-new-window case).
        browser.switch_to.window(handle)


### Pagination
def next_page(stationname, stationUrl, page = 1, max_retries = 3):
    """Load one listing page of a station and scrape its rooms.

    :param stationname: station name, forwarded to ``get_rooms``
    :param stationUrl: base listing URL of the station; ``?p=<page>`` is
        appended to it
    :param page: page number to load
    :param max_retries: how many times to retry after a failure before the
        page is skipped. The original retried through unbounded recursion,
        which ended in ``RecursionError`` on a persistent failure.
    :return: None
    """
    page_url = stationUrl + '?p=' + str(page)
    for attempt in range(max_retries + 1):
        try:
            browser.get(page_url)
            print(page_url)
            print("page:", page)
            get_rooms(stationname, page)
            return
        except Exception as exc:
            # Best-effort crawl: report the failure and try again.
            print('page', page, 'failed on attempt', attempt + 1, ':', exc)
    print('giving up on', page_url)

def get_page(i):
    """Open the first listing page of a station and scrape its rooms.

    :param i: station name; inserted between ``STATION`` and ``".html"`` to
        build the listing URL, and passed to ``get_rooms``
    :return: tuple ``(pages, stationUrl)`` -- the page count as the string
        captured from the page source, and the station's listing URL
    :raises ValueError: if the page-count pattern is not found
    """
    # Build the listing URL for the selected station.
    stationUrl = STATION + i + ".html"
    print(stationUrl)
    browser.get(stationUrl)
    # Wait for the pager element before reading the total page count.
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#page'))
    )
    html = browser.page_source
    # Pattern unchanged from the original, only written as a raw string
    # (the non-raw '\w' is an invalid escape sequence).
    # NOTE(review): verify this pattern against the live pager markup.
    match = re.search(r'class="next".*?\w(.*?)\w', html)
    if match is None:
        raise ValueError('could not find the total page count for ' + stationUrl)
    pages = match.group(1)
    print('total pages:', pages)
    get_rooms(i)
    return pages, stationUrl

def save_to_mongo(result):
    """Insert one scraped room document into the ``MONGO_TABLE`` collection.

    Failures are reported on stdout rather than raised, so one bad document
    does not stop the crawl.

    :param result: dict describing a room
    :return: None
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated and
        # was removed in PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('Fail', exc)
    else:
        print('Save  Success!!!')


if __name__ == "__main__":
    # Crawl every configured station: get_page handles page 1 and reports
    # the page count, next_page handles pages 2..N.
    try:
        for station in STATIONLIST:
            pages, stationUrl = get_page(station)
            for page in range(2, int(pages) + 1):
                # next_page's signature is (stationname, stationUrl, page);
                # the original passed page and stationUrl swapped.
                next_page(station, stationUrl, page)
    finally:
        # Shut down the headless Chrome process and the MongoDB connection
        # even if the crawl aborts.
        browser.quit()
        client.close()

配置文件代码:

# Crawler configuration; the main script pulls these names in with
# `from config import *`.

# Host passed to pymongo.MongoClient.
MONGO_URL = 'localhost'
# Name of the database selected on the client.
MONGO_DB = 'ziroom'
# Name of the collection that scraped rooms are inserted into.
MONGO_TABLE = 'rooms'


# Not referenced by the crawler code shown with this config.
# NOTE(review): looks like a leftover from a PhantomJS setup -- confirm.
SERVICE_ARGS = ['--disk-cache=true']

# Listing URL prefix; the crawler appends a station name and ".html".
STATION = "http://sz.ziroom.com/z/nl/z2-s2号线%28蛇口线%29-t"

# Station names to crawl, visited in this order.
STATIONLIST = ["海上世界", "水湾", "东角头","湾厦","海月", "登良", "后海", "科苑"]

 

 

你可能感兴趣的:(爬虫,Python)