# 使用 Selenium 与无头 Chrome 爬取携程酒店信息（Scrape Ctrip hotel information with Selenium and headless Chrome）

# -*- coding: UTF-8 -*-
import re
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymongo


def element_find(xpath):
    """Wait until the element matching *xpath* is present, then return it.

    Relies on the module-level ``driver`` (created in the ``__main__``
    section) and waits at most 30 seconds for the element to appear in the
    DOM.

    Args:
        xpath: XPath expression identifying the element.

    Returns:
        The element located by the wait.

    Raises:
        Exception: whatever the wait raised. The driver is shut down and the
            timeout message is printed first, then the original exception is
            re-raised.
    """
    try:
        return WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    except Exception:
        # The previous version fell through to `return element` here with
        # `element` never assigned, which hid the real failure behind an
        # UnboundLocalError. Clean up, report, and propagate the real error.
        driver.quit()
        print("元素加载超时")
        raise


def get_hotel_list(city_id, arrive_time, leave_time):
    """Open the hotel listing for a city and submit the stay dates.

    Loads the city page with the module-level ``driver``, overwrites the
    check-in and check-out inputs, then clicks the link inside the
    ``side_inner`` panel (per the original comment this leads to the map
    page). Each interaction is followed by a two-second pause.

    Args:
        city_id: City slug appended to the listing URL.
        arrive_time: Text typed into the check-in field.
        leave_time: Text typed into the check-out field.
    """
    pause_seconds = 2
    driver.get("http://hotels.ctrip.com/international//" + city_id)

    date_inputs = (
        ('//*[@id="txtCheckIn"]', arrive_time),
        ('//*[@id="txtCheckOut"]', leave_time),
    )
    for input_xpath, date_text in date_inputs:
        # The field is looked up again after clearing, as the original did.
        element_find(input_xpath).clear()
        time.sleep(pause_seconds)
        element_find(input_xpath).send_keys(date_text)
        time.sleep(pause_seconds)

    element_find('//*[@id="side_inner"]/div[1]/div[1]/a').click()
    time.sleep(pause_seconds)


# NOTE(review): the regular expression originally compiled in get_info() was
# lost when this file was extracted from a web page (only an empty, broken
# literal survived). It cannot be recovered from this file. Set this to the
# pattern that matches ONE hotel marker in the map page source; each match
# must contain the id=, title=, href=, price=, pos= and score fragments that
# the get_*() helpers below parse.
HOTEL_ITEM_PATTERN = None

# Number of hotels the site lists per result page (the value the original
# code hard-coded in its rank and page-count arithmetic).
HOTELS_PER_PAGE = 10


def get_info(res):
    """Return every hotel snippet found in the page source *res*.

    An empty list means no hotel information is present on the page.

    Args:
        res: Page source as a string.

    Returns:
        List of matched snippets (possibly empty).

    Raises:
        RuntimeError: if ``HOTEL_ITEM_PATTERN`` has not been restored.
    """
    if HOTEL_ITEM_PATTERN is None:
        raise RuntimeError(
            "HOTEL_ITEM_PATTERN is not set: the original hotel-matching "
            "regular expression was lost and must be restored")
    return re.findall(HOTEL_ITEM_PATTERN, res)


def map_biggest():
    """Click the map's second control button six times to zoom the map in.

    The map lives in the first iframe of the page, so the driver switches
    into frame 0 for the clicks and back to the top-level document afterwards.
    A one-second pause separates consecutive clicks.
    """
    zoom_button = '//*[@id="map"]/div/div/div[10]/div[1]/div/button[2]'
    driver.switch_to.frame(0)
    for click_no in range(6):
        if click_no:
            time.sleep(1)
        element_find(zoom_button).click()
    driver.switch_to.default_content()


def get_id(res):
    """Return the value of the first ``id="..."`` attribute in *res*.

    Raises:
        IndexError: if *res* contains no such attribute.
    """
    result = re.findall(r'id=".*?"', res)
    # Strip the leading `id="` and the closing quote.
    return result[0][4:-1]


def get_title(res):
    """Return the hotel name: the ``title`` attribute that precedes ``href``.

    Raises:
        IndexError: if the fragment is not found in *res*.
    """
    result = re.findall(r'title=".*?" href', res)
    # Strip the leading `title="` and the trailing `" href`.
    return result[0][7:-6]


def get_price(res):
    """Return the hotel price: the text between ``price=`` and ``" curr``.

    Raises:
        IndexError: if the fragment is not found in *res*.
    """
    result = re.findall(r'price=.*?" curr', res)
    # Drops 7 leading characters (`price=` plus, presumably, an opening
    # quote) and the trailing `" curr`.
    return result[0][7:-6]


def get_lat(res):
    """Return the latitude: the part of ``pos="..."`` before the ``|``.

    Raises:
        IndexError: if the fragment is not found in *res*.
    """
    result = re.findall(r'pos=".*?\|', res)
    # Strip the leading `pos="` and the trailing `|`.
    return result[0][5:-1]


def get_lng(res):
    """Return the longitude: the text between the first ``|`` and the next quote.

    NOTE(review): this assumes the first ``|`` in *res* is the separator
    inside the ``pos`` attribute - confirm against real page data.

    Raises:
        IndexError: if *res* contains no such fragment.
    """
    result = re.findall(r'\|.*?"', res)
    return result[0][1:-1]


def get_pos(res):
    """Return the raw ``pos`` attribute value (``latitude|longitude``).

    Raises:
        IndexError: if the attribute is not found in *res*.
    """
    result = re.findall(r'pos=".*?"', res)
    return result[0][5:-1]


def get_mark(res):
    """Return the hotel score: the text that follows ``"b">``.

    The surviving pattern ended in a lazy ``.*?`` with nothing after it, so
    it always matched the bare prefix and the ``[4:-7]`` slice always yielded
    an empty string. The 7-character suffix that the slice removed was lost
    from this file (presumably a closing HTML tag), so the score is captured
    up to the next ``<`` instead of guessing the tag name.

    Raises:
        IndexError: if the fragment is not found in *res*.
    """
    result = re.findall(r'"b">([^<]*)<', res)
    return result[0]


def get_url(res):
    """Return the value of the first ``href="..."`` attribute in *res*.

    Raises:
        IndexError: if *res* contains no such attribute.
    """
    result = re.findall(r'href=".*?"', res)
    return result[0][6:-1]


def _total_pages(total_count):
    """Return how many result pages are needed for *total_count* hotels.

    Uses ceiling division so a partially filled last page is counted.
    """
    return (total_count + HOTELS_PER_PAGE - 1) // HOTELS_PER_PAGE


def get_hotel(res, total_count, page):
    """Collect the details of every hotel on one page and store them.

    For each snippet the hotel's map marker is clicked, the picture URL and
    address are read from the popup inside the ``mapIframe`` frame, and the
    assembled record is passed to ``save_to_mongo``.

    Args:
        res: Hotel snippets for the current page, as returned by ``get_info``.
        total_count: Total number of hotels reported by the site.
        page: 1-based number of the current result page.
    """
    popup = '//*[@id="map"]/div/div/div[1]/div[3]/div/div[4]/div/div'
    for index, snippet in enumerate(res):
        hotel_id = get_id(snippet)
        marker = '//*[@id="' + hotel_id + '"]/div/span'

        time.sleep(10)
        element_find(marker).click()
        # The original guard here was `if i == 3 or 4:`, which is always
        # truthy, so this second click has always run for every hotel. It is
        # kept unconditional to preserve the behaviour the script was run
        # with. NOTE(review): if only the 4th and 5th entries need the extra
        # click, guard it with `index in (3, 4)`.
        element_find(marker).click()
        time.sleep(2)

        driver.switch_to.frame("mapIframe")
        time.sleep(2)
        img_url = element_find(popup + '/div[1]/div/img').get_attribute('src')
        time.sleep(2)
        address = element_find(popup + '/div[2]/div[3]').text
        time.sleep(2)
        driver.switch_to.default_content()

        hotel_info = {
            "_id": hotel_id,
            "rank": get_mark(snippet),
            "city_name": "首尔",
            "city_id": "seoul274",
            "pos": get_pos(snippet),
            "address": address,
            "id": hotel_id,
            "link": get_url(snippet),
            "area_name": "首尔",
            "areacode": "kr",
            "citycode": "seoul",
            "pic_url": img_url,
            "title": get_title(snippet),
            # Overall position of the hotel across all pages (1-based).
            "paiming": int(page - 1) * HOTELS_PER_PAGE + index + 1,
            # Previously `total_count // 10`, which dropped a partial last page.
            "total_page": _total_pages(total_count),
            "page": int(page),
            "total_count": total_count,
        }
        time.sleep(2)
        save_to_mongo(hotel_info)
        time.sleep(2)


def save_to_mongo(result):
    """Insert *result* into the MongoDB collection ``db[MONGO_TABLE]``.

    Storage is best effort: a failed insert is reported on stdout, together
    with the exception, and does not stop the crawl.

    Args:
        result: The hotel record (dict) to insert.
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Include the exception so the cause of the failure is visible.
        print('store failed', result, exc)
    else:
        print('stroe succeed', result)


MONGO_URl = 'localhost:27017'
MONGO_DB = 'xiecheng'
client = pymongo.MongoClient(MONGO_URl)
db = client[MONGO_DB]
MONGO_TABLE = 'seoul274'

if __name__ == "__main__":
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed copy of this script - confirm against the author's
    # version if it is available.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # Use webdriver.Chrome() without options to watch the browser while debugging.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.maximize_window()
        get_hotel_list("seoul274", "2019-09-16", "2019-09-17")

        # Switch to the newly opened window (the map page).
        handle = driver.current_window_handle
        for new_handle in driver.window_handles:
            if new_handle != handle:
                driver.switch_to.window(new_handle)

        # Total number of hotels and the number of result pages.
        total_count = int(element_find('//*[@id="J_totalHotel"]').text)
        pages = _total_pages(total_count)
        pag = 1
        # `<=` instead of the former `!=`: the old loop never ended when the
        # page count was 0 and never processed the last page.
        while pag <= pages:
            print("page: " + str(pag))
            time.sleep(2)
            res = get_info(driver.page_source)
            if not res:
                # The hotel markers vanished: reload the current page and retry.
                print("hotels disappear!")
                time.sleep(10)
                driver.get("https://hotels.ctrip.com/international/maplist/seoul274/p" + str(pag))
                time.sleep(10)
                continue

            time.sleep(2)
            map_biggest()
            get_hotel(res, total_count, pag)
            pag = pag + 1
            time.sleep(2)
            if pag <= pages:
                # Only advance while there is a further page to process.
                element_find('//*[@id="c_page_mini_next"]').click()
                time.sleep(10)
    finally:
        # Always release the browser, including when a step above raised.
        driver.quit()

# 你可能感兴趣的：使用 Selenium 与无头 Chrome 爬取携程酒店信息