# Python 实现艺龙爬虫

# 有反爬措施,暂时只能爬取一页

import requests
import re
import pandas as pd
import time

def get_data(page):
    """Fetch one page of the eLong hotel list and extract names, prices, addresses.

    Sends a form-encoded POST to the eLong async search endpoint with a fixed
    query (city "重庆市", check-in 2018-03-19, 20 hotels per page) and parses
    the HTML fragment returned under ``value.hotelListHtml`` with regexes.

    Args:
        page: Value sent as ``listRequest.pageIndex``.

    Returns:
        A tuple ``(hotel_name, hotel_price, hotel_address)`` of three lists of
        strings, in the order the matches appear in the returned HTML.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        KeyError: If the JSON payload has no ``value`` / ``hotelListHtml``
            entry.
    """
    url = "http://hotel.elong.com/ajax/list/asyncsearch"
    data = {
        "code": "7809114",
        "listRequest.areaID": "",
        "listRequest.bookingChannel": 5,
        "listRequest.cardNo": "192928",
        "listRequest.checkInDate": "2018-03-19 00:00:00",
        "listRequest.checkOutDate": "2018-03-20 00:00:00",
        "listRequest.cityID": "0401",
        "listRequest.cityName": "重庆市",
        "listRequest.customLevel": "11",
        "listRequest.distance": "20",
        "listRequest.endLat": 0,
        "listRequest.endLng": 0,
        "listRequest.facilityIds": "",
        "listRequest.highPrice": 0,
        "listRequest.hotelBrandIDs": "",
        "listRequest.isAdvanceSave": "false",
        "listRequest.isAfterCouponPrice": "true",
        "listRequest.isCoupon": "false",
        "listRequest.isDebug": "false",
        "listRequest.isLimitTime": "false",
        "listRequest.isLogin": "false",
        "listRequest.isMobileOnly": "true",
        "listRequest.isNeed5Discount": "true",
        "listRequest.isNeedNotContractedHotel": "false",
        "listRequest.isNeedSimilarPrice": "false",
        "listRequest.isReturnNoRoomHotel": "true",
        "listRequest.isStaySave": "false",
        "listRequest.isTrace": "false",
        "listRequest.isUnionSite": "false",
        "listRequest.keywords": "",
        "listRequest.keywordsType": 0,
        "listRequest.language": "cn",
        "listRequest.listType": 0,
        "listRequest.lowPrice": 0,
        "listRequest.orderFromID": "50793",
        "listRequest.pageIndex": page,
        "listRequest.pageSize": 20,
        "listRequest.payMethod": 0,
        "listRequest.personOfRoom": 0,
        "listRequest.poiId": 0,
        "listRequest.promotionChannelCode": "0000",
        "listRequest.proxyID": "ZD",
        "listRequest.rankType": 0,
        "listRequest.returnFilterItem": "true",
        "listRequest.sellChannel": 1,
        "listRequest.seoHotelStar": 0,
        "listRequest.sortDirection": 1,
        "listRequest.sortMethod": 1,
        "listRequest.starLevels": "",
        "listRequest.startLat": 0,
        "listRequest.startLng": 0,
        "listRequest.taRecommend": "false",
        "listRequest.themeIds": "",
        # NOTE(review): these tokens are hard-coded; presumably captured from
        # a browser session and may expire -- confirm before relying on them.
        "listRequest.ctripToken": "815b07a9-3f97-4ae0-965c-e8d9d3b9a57f",
        "listRequest.elongToken": "jeww06u3-7967-4a47-9e59-91d212f31e82",
    }
    # Content-Length is deliberately not set: requests computes it from the
    # encoded form body, and a hard-coded value would not match that body.
    header = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "hotel.elong.com",
        "Origin": "http://hotel.elong.com",
        "Pragma": "no-cache",
        "Referer": "http://hotel.elong.com/chongqing/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.post(url, data=data, headers=header, timeout=10)
    # Fail loudly on an HTTP error page instead of on a confusing JSON error.
    response.raise_for_status()
    list_html = response.json()['value']['hotelListHtml']

    hotel_name = re.findall(r'target="_blank" title="(.*?)">', list_html)
    # NOTE(review): the pattern previously here was '(.*?)', which matches the
    # empty string at every position and so never yields one price per hotel.
    # The surrounding tags below are a reconstruction of the price markup --
    # confirm against a live response.
    hotel_price = re.findall(r'<span class="h_pri_num ">(.*?)</span>', list_html)
    hotel_address = re.findall(r'data-hoteladdress="(.*?)" >', list_html)
    # Return hotel names, hotel prices, hotel addresses.
    return hotel_name, hotel_price, hotel_address
if __name__ == '__main__':
    # Accumulate the three columns across every requested page.
    all_names, all_prices, all_addresses = [], [], []
    for page_no in range(10):
        page_names, page_prices, page_addresses = get_data(page_no)
        all_names += page_names
        all_prices += page_prices
        all_addresses += page_addresses
        # Pause between requests before reporting progress.
        time.sleep(1)
        print(f"已完成第{page_no}页爬取")

    # Column headers are runtime output and stay in their original form.
    columns = {
        '酒店名称': all_names,
        '酒店价格': all_prices,
        '酒店地址': all_addresses,
    }
    # utf_8_sig writes a BOM at the start of the CSV file.
    pd.DataFrame(columns).to_csv(
        "hotel.csv", index=False, sep=',', encoding="utf_8_sig"
    )


# 你可能感兴趣的:(算法)