同程艺龙的反爬做得非常好,本博主与同程艺龙进行了一整天的殊死搏斗,才将其完全爬下来。本博主以无锡为例,将无锡所有酒店的相关信息都爬了下来,共3399条酒店数据;当然其他城市也是可以的,只需要修改指定字段即可。本博主是先将数据存储到MongoDB中,然后再将数据转存到Excel中,以下是我爬取的数据截图
注:代码中有大量注释,注释中记录了本博主爬取数据以及对数据进行处理的相关手段。
import json
import time
from lxml import html
import requests
import re
from pymongo import MongoClient
class YiLongSpider(object):
    """Crawl hotel list and detail pages from hotel.elong.com into MongoDB.

    The spider POSTs the list endpoint page by page, extracts each hotel's
    numeric code and list price from the returned HTML fragment, fetches the
    hotel's detail page for name / rating / address / services / comment
    counts, and stores one document per hotel in the ``test.yilong_hotel``
    collection.

    The form fields, tokens and cookie below were captured from a browser
    session; replace them with values from your own session before running.
    """

    # Hotels returned per list page (matches the "listRequest.pageSize" field).
    PAGE_SIZE = 20
    # Seconds to wait for a single HTTP response; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 30

    # Browser cookie sent with both list and detail requests.
    # Replace with your own cookie value; avoid committing real session cookies.
    # Split one cookie pair per literal purely for readability - the adjacent
    # literals concatenate into a single header line with no newline in it.
    _COOKIE = (
        'CookieGuid=8340df84-1a62-4c32-8655-0d9f55894610; '
        'H5CookieId=8da60226-1108-4c30-8023-c275b40d9b1b; '
        'firsttime=1612921642621; '
        'CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%23beijing%23%401105%23%E6%97%A0%E9%94%A1%23wuxi%23; '
        '_fid=8340df84-1a62-4c32-8655-0d9f55894610; '
        'SHBrowseHotel=cn=94074245%2C%2C%2C%2C%2C%2C%3B41105032%2C%2C%2C%2C%2C%2C%3B01105111%2C%2C%2C%2C%2C%2C%3B93952280%2C%2C%2C%2C%2C%2C%3B93030245%2C%2C%2C%2C%2C%2C%3B&; '
        'SessionGuid=c84db62e-22b2-43eb-9ea3-fcae7710478d; '
        'Esid=d4899e3e-c273-4a9c-a2d8-a0a67bfe2a64; '
        'com.eLong.CommonService.OrderFromCookieInfo=Orderfromtype=5&Parentid=1500&Status=1&Cookiesdays=30&Coefficient=0.0&Pkid=1003&Priority=9001&Isusefparam=0&Makecomefrom=1&Savecookies=0; '
        'fv=pcweb; '
        'ext_param=bns%3D4%26ct%3D3; '
        's_cc=true; '
        's_visit=1; '
        '__tctma=20377580.1612921545984895.1612921545736.1612942042962.1613021168874.3; '
        '__tctmc=20377580.26050747; '
        '__tctrack=0; '
        '__tctmu=20377580.0.0; '
        '__tctmz=20377580.1613021168874.3.1.utmccn=(organic)|utmcmd=organic|utmEsl=utf-8|utmcsr=baidu|utmctr=%e5%90%8c%e5%9f%8e%e8%89%ba%e9%be%99; '
        'longKey=1612921545984895; '
        'H5SessionId=3F4F4F55BB91AD9A6E6BA459D77ED4F0; '
        'H5Channel=mnoreferseo%2CSEO; '
        '_tcudid_v2=s6-KLlePnKSY5k_e6c8afJ2siIdL6JckHCbGJGn8gZA; '
        'SessionToken=84bf644c-c0e0-40ab-9bfb-2e3399638e1a622; '
        'Lgid=LRpRtrsC3gsExwGXhEk%2FlpaR3waA7McUH7SGryL5%2FSFy7cgKYHvIl7%2BU1ESQ4xjsnKNiBZUvDBjyD47c0BiVo%2BXmTwHlATGgovz4PU04xC8ATdAK4QXdw1ep8TF%2FiKzVyBWhvVjW%2FtZZDQjF7yOBLQ%3D%3D; '
        'tcUser=%7B%22AccessToken%22%3A%22C9B2CD22F9D2F4EB07D6A7EF98A6A971%22%2C%22MemberId%22%3A%221e70de0b8ccf9bf972cb961495a4a42a%22%7D; '
        '__tctmd=20377580.737325; '
        '__tctmb=20377580.1983279497417116.1613021168874.1613021220438.2; '
        's_sq=%5B%5BB%5D%5D; '
        'User-Ref-SessionId=2efe-4f4e-b64e-4b7f-1fea-3649; '
        'businessLine=hotel; '
        'anti_token=27513B19-FAE2-42B3-A54E-02D845BBBD37; '
        '__tctmb=0.2350751285017907.1613021224930.1613021224930.1; '
        '__tccgd=0.0; '
        '__tctmc=0.6528555; '
        '__tctmd=0.252662736; '
        'ShHotel=InDate=2021-02-11&CityID=1105&CityNameEN=wuxi&CityNameCN=%E6%97%A0%E9%94%A1&OutDate=2021-02-12&CityName=%E6%97%A0%E9%94%A1; '
        'trace_extend={"deviceid":"8340df84-1a62-4c32-8655-0d9f55894610","appid":"6","userid":"240000001103487989","orderfromid":1003,"sessionid":"2efe-4f4e-b64e-4b7f-1fea-3649","pvid":"59adeb5b"}; '
        'JSESSIONID=FD9391A57926778C3467896D8DED5039; '
        'lasttime=1613021257330'
    )

    def __init__(self):
        """Set up endpoint URLs, request payloads, headers and the Mongo collection."""
        # List-page endpoint (POST, form-encoded).
        self.list_url_temp = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
        # Detail-page URL template; the placeholder is the hotel code.
        self.detail_url_temp = 'http://hotel.elong.com/{}/?issugtrace=2'
        # Room-price endpoint (kept for reference; not called by this class).
        self.price_url_temp = 'http://hotel.elong.com/ajax/tmapidetail/gethotelroomsetjvajson'
        # Form body for the list-page request.
        self.list_dat = {
            "code": "8013851",
            "listRequest.areaID": "",
            "listRequest.bedLargeTypes": "",
            "listRequest.bookingChannel": "1",
            "listRequest.breakfasts": "0",
            "listRequest.cancelFree": "false",
            "listRequest.cardNo": "240000001103487989",  # must match detail_dat["cardNo"]
            "listRequest.checkInDate": "2021-02-11 00:00:00",  # replace with today's date
            "listRequest.checkOutDate": "2021-02-12 00:00:00",  # replace with tomorrow's date
            "listRequest.cityID": "1105",
            "listRequest.cityName": "无锡",
            "listRequest.crawledFlag": "0",
            "listRequest.customLevel": "11",
            "listRequest.discountIds": "",
            "listRequest.distance": "20000",
            "listRequest.endLat": "0",
            "listRequest.endLng": "0",
            "listRequest.epcCreateOrderGuideVersion": "Z",
            "listRequest.facilityIds": "",
            "listRequest.guokaoFlag": "false",
            "listRequest.highPrice": "0",
            "listRequest.hotelBrandIDs": "",
            "listRequest.hotelIDs": "",
            "listRequest.interceptAction": "0",
            "listRequest.isAdvanceSave": "false",
            "listRequest.isAfterCouponPrice": "true",
            "listRequest.isCoupon": "false",
            "listRequest.isDebug": "false",
            "listRequest.isLimitTime": "false",
            "listRequest.isLogin": "false",
            "listRequest.isMobileOnly": "true",
            "listRequest.isNeed5Discount": "true",
            "listRequest.isNeedNotContractedHotel": "false",
            "listRequest.isNeedSimilarPrice": "false",
            "listRequest.isReturnNoRoomHotel": "true",
            "listRequest.isStaySave": "false",
            "listRequest.isTrace": "false",
            "listRequest.isUnionSite": "false",
            "listRequest.isnstantConfirm": "false",
            "listRequest.keywords": "",
            "listRequest.keywordsType": "0",
            "listRequest.language": "cn",
            "listRequest.lat": "31.497648",
            "listRequest.listType": "0",
            "listRequest.lng": "120.316954",
            "listRequest.lowPrice": "0",
            "listRequest.orderFromID": "1003",
            "listRequest.pageIndex": "1",
            "listRequest.pageSize": "20",
            "listRequest.payMethod": "0",
            "listRequest.personOfRoom": "0",
            "listRequest.poiId": "0",
            "listRequest.poiName": "",
            "listRequest.productTypes": "1, 6, 26",
            "listRequest.promotionChannelCode": "0000",
            "listRequest.promotionSwitch": "-1",
            "listRequest.proxyID": "ZD",
            "listRequest.rankType": "0",
            "listRequest.returnFilterItem": "true",
            "listRequest.sectionId": "",
            "listRequest.sellChannel": "1",
            "listRequest.seoHotelStar": "0",
            "listRequest.sortDirection": "1",
            "listRequest.sortMethod": "1",
            "listRequest.standBack": "-1",
            "listRequest.starLevels": "",
            "listRequest.startLat": "0",
            "listRequest.startLng": "0",
            "listRequest.sug_act_info": "",
            "listRequest.taRecommend": "false",
            "listRequest.themeIds": "",
            "listRequest.traceId": " de063359-ec4e-4c9a-b6e4-381e38079a4d",
            "listRequest.wordId": "",
            "listRequest.wordType": "-1",
            "listRequest.elongToken": "8340df84-1a62-4c32-8655-0d9f55894610",
            "listRequest.trace_token": "|*|cityId:1105|*|qId:41b5a7eb-3c68-4222-b17b-ae94b94c5b17|*|st:city|*|sId:1105|*|",
        }
        # Form body for the room-price request; hotelIDs is a single hotel's code.
        self.detail_dat = {
            "bookingChannel": "1",
            "cardNo": "240000001103487989",  # must match list_dat["listRequest.cardNo"]
            "cheapestPriceFlag": "false",
            "checkInDate": "2021-02-11",  # replace with today's date
            "checkOutDate": "2021-02-12",  # replace with tomorrow's date
            "crawledFlag": "0",
            "customerLevel": "11",
            "hotelIDs": "01105111",
            "interceptAction": "0",
            "isAfterCouponPrice": "true",
            "isDebug": "false",
            "isLogin": "false",
            "isMobileOnly": "false",
            "isNeed5Discount": "false",
            "isTrace": "false",
            "language": "cn",
            "needDataFromCache": "true",
            "needPromotion": "true",
            "orderFromID": "1003",
            "payMethod": "0",
            "productType": "0",
            "promotionChannelCode": "0000",
            "proxyID": "ZD",
            "sellChannel": "1",
            "settlementType": "0",
            "updateOrder": "false",
            "elongToken": "8340df84-1a62-4c32-8655-0d9f55894610",
            "code": "8965113",
        }
        # The list endpoint and the detail page expect different request
        # headers, so each gets its own dict.
        # Headers for the list (AJAX) request.
        self.list_headers = {
            'Cookie': self._COOKIE,
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        # Headers for the detail (HTML page) request.
        self.detail_headers = {
            'Cookie': self._COOKIE,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "hotel.elong.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
        }
        # MongoDB connection (default host/port) and target collection.
        self.client = MongoClient()
        self.collection = self.client['test']['yilong_hotel']

    def getPageIndex(self, count):
        """Return the 1-based page numbers needed to cover ``count`` hotels.

        Each list page holds ``PAGE_SIZE`` hotels, so the page count is the
        ceiling of ``count / PAGE_SIZE``; ``count <= 0`` yields an empty list.
        """
        page_num = (count + self.PAGE_SIZE - 1) // self.PAGE_SIZE
        return list(range(1, page_num + 1))

    def parse(self, page_index):
        """POST the list endpoint for ``page_index`` and return the decoded body.

        Sleeps briefly first to throttle the crawl. Note that this updates
        ``self.list_dat`` in place with the requested page index.
        """
        time.sleep(0.5)
        self.list_dat['listRequest.pageIndex'] = page_index
        resp = requests.post(
            self.list_url_temp,
            data=self.list_dat,
            headers=self.list_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return resp.content.decode()

    @staticmethod
    def _align_prices(codes, prices, page_size=20):
        """Return one price string per hotel code, using '0' where unknown.

        The price regex does not always yield one match per hotel:
        * no match, or more matches than a page can hold -> the matches cannot
          be trusted, so every hotel gets '0';
        * otherwise prices are used positionally and the tail is padded
          with '0'.
        """
        found = len(prices)
        if found == 0 or found > page_size:
            return ['0'] * len(codes)
        padded = list(prices) + ['0'] * max(0, len(codes) - found)
        return padded[:len(codes)]

    def get_content_list(self, str_html):
        """Extract hotel codes and prices from a list response and crawl each hotel.

        ``str_html`` is the JSON text returned by :meth:`parse`; the HTML
        fragment is read from ``value.hotelListHtml``. Each hotel is handed to
        :meth:`parse_detail`, which completes and stores the record.
        """
        json_html = json.loads(str_html)
        hotel_content = json_html['value']['hotelListHtml']
        # Every second match is kept - presumably each hotel's link appears
        # twice in the fragment (TODO confirm against the live markup).
        hotel_code_list = re.findall(r'data-link=\"/(\d+)/\"', hotel_content)[::2]
        hotel_price = re.findall(r'\"h_pri_num.*\">(\d+)<', hotel_content)
        print(len(hotel_price))
        # Pad / reset the price list once, instead of re-appending zeros on
        # every loop iteration.
        prices = self._align_prices(hotel_code_list, hotel_price, self.PAGE_SIZE)
        for code, price in zip(hotel_code_list, prices):
            item = {'hotel_code': code, 'hotel_price': price}
            self.parse_detail(code, item)

    @staticmethod
    def _first(nodes, strip=False):
        """Return the first XPath result (optionally stripped), or None if empty."""
        if not nodes:
            return None
        return nodes[0].strip() if strip else nodes[0]

    def parse_detail(self, hotel_code, item):
        """Fetch a hotel's detail page, fill ``item`` with its fields and save it.

        Missing fields are stored as ``None`` rather than raising, so a layout
        change on one page does not abort the crawl.
        """
        time.sleep(0.2)
        resp = requests.get(
            self.detail_url_temp.format(hotel_code),
            headers=self.detail_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        page = html.etree.HTML(resp.content.decode())

        item['hotel_name'] = self._first(
            page.xpath('//div[contains(@class,"hdetail_main")]//h1/text()'))
        item['hotel_rate'] = self._first(
            page.xpath('//span[@class="comt_nmb"]/text()'))
        item['hotel_area'] = self._first(
            page.xpath('//span[@data-downtownname]/@data-downtownname'))
        # NOTE(review): unlike the other fields, the address keeps the whole
        # list of text nodes rather than the first one - confirm this is intended.
        hotel_addr = page.xpath('//span[@data-downtownname]/text()')
        item['hotel_addr'] = hotel_addr if hotel_addr else None
        # The detail page is served in two layouts, so the facility list is
        # looked up with a second XPath when the first finds nothing.
        hotel_service_list = page.xpath('//span[contains(@class,"icon_faci")]/@title')
        if not hotel_service_list:
            hotel_service_list = page.xpath(
                '//div[@class="facilities"]/ul/li[not(@class="grey")]/span/text()')
        item['hotel_service_list'] = hotel_service_list
        # Total / positive / negative comment counters from the comment tabs.
        item['comment_num'] = self._first(
            page.xpath('//ul[@class="nav_lst"]/li[1]/text()'), strip=True)
        item['good_comment_num'] = self._first(
            page.xpath('//ul[@class="nav_lst"]/li[2]/text()'), strip=True)
        item['bad_comment_num'] = self._first(
            page.xpath('//ul[@class="nav_lst"]/li[3]/text()'), strip=True)
        print(item)
        self.save(item)

    def save(self, item):
        """Insert one hotel document into the Mongo collection."""
        # Collection.insert() was deprecated in PyMongo 3 and removed in 4.
        self.collection.insert_one(item)

    def run(self):
        """Crawl every list page for the configured city."""
        # One initial request just to learn the total number of hotels.
        resp = requests.post(
            self.list_url_temp,
            data=self.list_dat,
            headers=self.list_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        json_html = json.loads(resp.content.decode())
        hotel_count = json_html['value']['hotelCount']
        for page_index in self.getPageIndex(hotel_count):
            self.get_content_list(self.parse(page_index))
if __name__ == '__main__':
    # Script entry point: build the spider and crawl every list page.
    YiLongSpider().run()
import pandas as pd
from pymongo import MongoClient
import numpy as np
def export_excel(export, file_path='yilong_hotel.xlsx'):
    """Write crawled hotel records to an Excel sheet with Chinese column headers.

    Args:
        export: iterable of dicts as stored by the crawler (keys such as
            ``hotel_code``, ``hotel_price``, ``hotel_service_list`` ...).
            Unknown keys are dropped; missing keys become empty cells.
        file_path: destination workbook. Defaults to ``yilong_hotel.xlsx``
            in the working directory. Pass ``None`` to skip writing and only
            get the prepared frame back.

    Returns:
        The ``pandas.DataFrame`` that was (or would have been) written.
    """
    # Output column order.
    order = ['hotel_code', 'hotel_price', 'hotel_area', 'hotel_name',
             'hotel_rate', 'hotel_addr', 'hotel_service_list', 'comment_num',
             'good_comment_num', 'bad_comment_num']
    # Internal field name -> Chinese header shown in the sheet.
    columns_map = {
        'hotel_code': '酒店编号',
        'hotel_price': '酒店价格',
        'hotel_area': '酒店区域',
        'hotel_name': '酒店名称',
        'hotel_rate': '酒店评分',
        'hotel_addr': '酒店地址',
        'hotel_service_list': '酒店服务',
        'comment_num': '酒店评论数量',
        'good_comment_num': '酒店好评数',
        'bad_comment_num': '酒店差评数',
    }
    # reindex (rather than df[order]) tolerates empty input and records that
    # lack some fields: absent columns are created and left empty.
    df = pd.DataFrame(list(export)).reindex(columns=order)
    # hotel_service_list holds a list per hotel; join it with '-' so the cell
    # is a plain string. Anything that is not a list/tuple (None, NaN) is
    # passed through untouched.
    df['hotel_service_list'] = df['hotel_service_list'].apply(
        lambda services: '-'.join(services)
        if isinstance(services, (list, tuple)) else services
    )
    df = df.rename(columns=columns_map)
    if file_path is not None:
        # Missing values (None / NaN) are written as empty cells by to_excel,
        # so no explicit fill is needed. The 'encoding' argument and
        # ExcelWriter.save() used previously were removed in pandas 2.0.
        df.to_excel(file_path, index=False)
    return df
if __name__ == '__main__':
    # Script entry point: read every stored hotel document (leaving out
    # Mongo's _id field) and hand the records to export_excel.
    mongo = MongoClient()
    hotels = mongo['test']['yilong_hotel']
    documents = list(hotels.find({}, {'_id': 0}))
    export_excel(documents)
提醒:
以上就是本博主爬取同程艺龙的全过程啦!
如有疑问,下方评论。