目的:
爬取爱彼迎深圳所有房源的房主,价格,房源介绍等信息
步骤如下:
step1:获取房源页面url
登陆爱彼迎网站,搜索深圳房源
https://zh.airbnb.com/s/homes?refinement_paths%5B%5D=%2Fhomes&adults=0&children=0&infants=0&toddlers=0&query=%E6%B7%B1%E5%9C%B3&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0&allow_override%5B%5D=&s_tag=BoUbRf3d&section_offset=6&items_offset=36
点击翻页
发现每个页面的url有以下规律
第1页url
https://zh.airbnb.com/api/v2/explore_tabs?version=1.4.5&satori_version=1.1.3&_format=for_explore_search_web&experiences_per_grid=20&items_per_grid=18&guidebooks_per_grid=20&auto_ib=false&fetch_filters=true&has_zero_guest_treatment=true&is_guided_search=true&is_new_cards_experiment=true&luxury_pre_launch=true&query_understanding_enabled=false&show_groupings=true&supports_for_you_v3=true&timezone_offset=480&client_session_id=06baebd5-905c-4af7-9b24-d52b1882eaa9&metadata_only=false&is_standard_search=true&refinement_paths%5B%5D=%2Fhomes&selected_tab_id=home_tab&adults=0&children=0&infants=0&toddlers=0&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0&allow_override%5B%5D=&s_tag=BoUbRf3d&section_offset=6&items_offset=18&last_search_session_id=2c09eb30-1002-4335-8cee-2ccc7701234d&federated_search_session_id=ef96172f-305a-497f-819d-c1337aa72434&screen_size=medium&query=%E6%B7%B1%E5%9C%B3&_intents=p1&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh
第2页url
https://zh.airbnb.com/api/v2/explore_tabs?version=1.4.5&satori_version=1.1.3&_format=for_explore_search_web&experiences_per_grid=20&items_per_grid=18&guidebooks_per_grid=20&auto_ib=false&fetch_filters=true&has_zero_guest_treatment=true&is_guided_search=true&is_new_cards_experiment=true&luxury_pre_launch=true&query_understanding_enabled=false&show_groupings=true&supports_for_you_v3=true&timezone_offset=480&client_session_id=06baebd5-905c-4af7-9b24-d52b1882eaa9&metadata_only=false&is_standard_search=true&refinement_paths%5B%5D=%2Fhomes&selected_tab_id=home_tab&adults=0&children=0&infants=0&toddlers=0&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0&allow_override%5B%5D=&s_tag=BoUbRf3d&section_offset=6&items_offset=36&last_search_session_id=92ab000c-cf6b-4983-8c54-bc6a2263094f&federated_search_session_id=ef96172f-305a-497f-819d-c1337aa72434&screen_size=medium&query=%E6%B7%B1%E5%9C%B3&_intents=p1&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh
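通过多次翻页、刷新并对比这些url可以发现:各页之间有规律变化的只有items_offset参数(每翻一页增加18,与items_per_grid=18一致),而last_search_session_id等会话参数在构造url时可以去掉
因此使用以下代码可构造每一个页面的url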
# Search-API URL for result page i (0-based). items_per_grid=18, so the
# offset advances by 18 per page (the captured pages above show 18, then 36).
HouseNumberUrl = "https://zh.airbnb.com/api/v2/explore_tabs?version=1.4.5&satori_version=1.1.3&_format=for_explore_search_web&experiences_per_grid=20&items_per_grid=18&guidebooks_per_grid=20&auto_ib=false&fetch_filters=true&has_zero_guest_treatment=true&is_guided_search=true&is_new_cards_experiment=true&luxury_pre_launch=true&query_understanding_enabled=false&show_groupings=true&supports_for_you_v3=true&timezone_offset=480&client_session_id=53638ce3-becb-444f-bfdd-d6b301b93456&metadata_only=false&is_standard_search=true&refinement_paths%5B%5D=%2Fhomes&selected_tab_id=home_tab&adults=0&children=0&infants=0&toddlers=0&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0&allow_override%5B%5D=&s_tag=BoUbRf3d&section_offset=6&items_offset=" + str(18 * i) + "&screen_size=medium&query=%E6%B7%B1%E5%9C%B3&_intents=p1&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh"
通过以下循环可以迭代构造出各个分页的url(range(18)对应i从0到17,共发起18次分页请求)
for i in range(18):
step2:获取房源编码代号
获取remarketing_ids,里面每一个id即为这里每个房源url的编号,可用于爬取信息构造url
获取房源编号代码
def getHouseNumber(url):
html = requests.get(url)
data=html.text
#构造正则,获取remarketing_ids
urlbase=re.search('"remarketing_ids":\[(.*?)\],',data)
#将其放入一个列表return回去主函数
url=[]
urlNumber=''
for each in urlbase.group(1):
if each==",":
url.append(urlNumber)
urlNumber=''
else:
urlNumber=urlNumber+each
return url
step3:获取房源信息
由于房源信息比较混乱,因此得多步爬取
点击第一个房源进入
可以看到房主、房源介绍、位置、房名、须知这五个信息都在此url返回的数据中
构造url页面方法如下:
# Listing-detail endpoint: the listing id (urlNumber) is spliced between
# the fixed path prefix and the fixed query string.
Url = ''.join([
    'https://zh.airbnb.com/api/v2/pdp_listing_details/',
    str(urlNumber),
    '?_format=for_rooms_show&adults=1&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&',
])
通过以下代码获取上面五个信息
# Fetch the listing-detail JSON for one listing id (urlNumber).
Url = 'https://zh.airbnb.com/api/v2/pdp_listing_details/' + str(urlNumber) + '?_format=for_rooms_show&adults=1&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
html = requests.get(Url)
data = json.loads(html.text)
detail = data['pdp_listing_detail']
# Host name
House_owner = detail['user'].get('host_name')
# Listing description
Introduction_housing = detail['sectioned_description'].get('description')
# Location
Position = detail.get('location_title')
# Listing title
HouseName = detail.get('name')
# Notice: structured house rules, one per line, numbered from 0.
buchong1 = ''.join(
    str(i) + ',' + j + '\n'
    for i, j in enumerate(detail['guest_controls']['p3_structured_house_rules'])
)
buchong2 = '''预订取消政策\n严格政策——48小时内免费取消 · 限时全额退款\n\n交易提示\n为了保护您的账号隐私和付款安全,请不要相信其它任何平台的折扣或礼金券代订,并始终在爱彼迎站内转账和交流'''
# .get() returns None when the key is missing; fall back to '' so the
# concatenation cannot raise TypeError.
Notice = '房屋守则\n' + (detail.get('additional_house_rules') or '') + '\n\n' + '基本要求\n' + buchong1 + '\n\n' + buchong2
价格获取方法
通过以下进行构造链接
# Booking-details endpoint for one listing id; "&currency=CNY" restored
# from the HTML-entity-mangled "¤cy=CNY".
price_url = 'https://zh.airbnb.com/api/v2/pdp_listing_booking_details?force_boost_unc_priority_message_type=&guests=1&listing_id=' + str(urlNumber) + '&show_smart_promotion=0&_format=for_web_dateless&_interaction_type=pageload&_intents=p3_book_it&_parent_request_uuid=4527592d-6c3c-4b64-9c40-b814fb4ca733&_p3_impression_id=p3_1547785606_MzQbhXiFnlGGppGx&number_of_adults=1&number_of_children=0&number_of_infants=0&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh'
通过以下代码获取价格信息
# Price: read rate_with_service_fee.amount_formatted from the first entry
# of the booking-details response.
price_url = 'https://zh.airbnb.com/api/v2/pdp_listing_booking_details?force_boost_unc_priority_message_type=&guests=1&listing_id=' + str(urlNumber) + '&show_smart_promotion=0&_format=for_web_dateless&_interaction_type=pageload&_intents=p3_book_it&_parent_request_uuid=4527592d-6c3c-4b64-9c40-b814fb4ca733&_p3_impression_id=p3_1547785606_MzQbhXiFnlGGppGx&number_of_adults=1&number_of_children=0&number_of_infants=0&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh'
price_html = requests.get(price_url)
price_data = json.loads(price_html.text)
price = price_data['pdp_listing_booking_details'][0]['rate_with_service_fee'].get('amount_formatted')
# Listing description (taken from the listing-detail response fetched earlier)
Introduction_housing = data['pdp_listing_detail']['sectioned_description'].get('description')
获取所有评论
发现评论每页只显示7条,评论数据来自下面这个返回json的接口
通过以下进行构造链接
# Reviews endpoint for one listing; `each` is the 0-based page index and
# every page holds 7 reviews (_limit=7), hence _offset = each * 7.
CommenUrl = 'https://zh.airbnb.com/api/v2/reviews?key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh&listing_id=' + str(urlNumber) + '&role=guest&_format=for_p3&_limit=7&_offset=' + str(each * 7) + '&_order=language_country'
因此我们需要迭代获取所有评论
使用以下代码可以获取所有评论
# Collect the reviews page by page (7 per request, at most 100 requests).
comment_parts = []
for each in range(100):
    CommenUrl = 'https://zh.airbnb.com/api/v2/reviews?key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh&listing_id=' + str(urlNumber) + '&role=guest&_format=for_p3&_limit=7&_offset=' + str(each * 7) + '&_order=language_country'
    CommenHtml = requests.get(CommenUrl)
    if CommenHtml.status_code != 200:
        break
    reviews = json.loads(CommenHtml.text).get('reviews') or []
    if not reviews:
        # An empty page means there is nothing left to read; stop here
        # instead of issuing the remaining requests.
        break
    # A review without a 'comments' value contributes an empty line
    # rather than raising TypeError on None + str.
    comment_parts.extend((review.get('comments') or '') + '\n' for review in reviews)
Commen = ''.join(comment_parts)
获取完全部信息后,将其保存进一个列表
ALL_Information=[]
ALL_Information.append(HouseName)
ALL_Information.append(House_owner)
ALL_Information.append(price)
ALL_Information.append(Introduction_housing)
ALL_Information.append(Position)
ALL_Information.append(Notice)
ALL_Information.append(Commen)
return ALL_Information
完整代码如下:
# -*- coding: utf-8 -*-
import requests,re,json
import csv
def getHouseNumber(url):
html = requests.get(url)
data=html.text
#构造正则,获取remarketing_ids
urlbase=re.search('"remarketing_ids":\[(.*?)\],',data)
#将其放入一个列表return回去主函数
url=[]
urlNumber=''
for each in urlbase.group(1):
if each==",":
url.append(urlNumber)
urlNumber=''
else:
urlNumber=urlNumber+each
return url
def getHouseInformation(urlNumber):
Url = 'https://zh.airbnb.com/api/v2/pdp_listing_details/' + str(urlNumber) + '?_format=for_rooms_show&adults=1&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
html=requests.get(Url)
data = json.loads(html.text)
#房主
House_owner=data['pdp_listing_detail']['user'].get('host_name')
#价格
price_url='https://zh.airbnb.com/api/v2/pdp_listing_booking_details?force_boost_unc_priority_message_type=&guests=1&listing_id='+ str(urlNumber) +'&show_smart_promotion=0&_format=for_web_dateless&_interaction_type=pageload&_intents=p3_book_it&_parent_request_uuid=4527592d-6c3c-4b64-9c40-b814fb4ca733&_p3_impression_id=p3_1547785606_MzQbhXiFnlGGppGx&number_of_adults=1&number_of_children=0&number_of_infants=0&key=d306zoyjsyarp7ifhu67rjxn52tv0t20¤cy=CNY&locale=zh'
price_html = requests.get(price_url)
price_data = json.loads(price_html.text)
price=price_data['pdp_listing_booking_details'][0]['rate_with_service_fee'].get('amount_formatted')
#房源介绍
Introduction_housing=data['pdp_listing_detail']['sectioned_description'].get('description')
#位置
Position=data['pdp_listing_detail'].get('location_title')
#房名
HouseName=data['pdp_listing_detail'].get('name')
#须知
buchong1=''
for i,j in enumerate(data['pdp_listing_detail']['guest_controls']['p3_structured_house_rules']):
buchong1 = buchong1+str(i)+','+j+'\n'
buchong2='''预订取消政策\n严格政策——48小时内免费取消 · 限时全额退款\n\n交易提示\n为了保护您的账号隐私和付款安全,请不要相信其它任何平台的折扣或礼金券代订,并始终在爱彼迎站内转账和交流'''
Notice = '房屋守则\n' + data['pdp_listing_detail'].get('additional_house_rules') +'\n\n'+ '基本要求\n' + buchong1+'\n\n'+buchong2
# 评论
Commen = ''
for each in range(100):
CommenUrl = 'https://zh.airbnb.com/api/v2/reviews?key=d306zoyjsyarp7ifhu67rjxn52tv0t20¤cy=CNY&locale=zh&listing_id='+str(urlNumber)+'&role=guest&_format=for_p3&_limit=7&_offset=' + str(each * 7) + '&_order=language_country'
CommenHtml=requests.get(CommenUrl)
if CommenHtml.status_code==200:
Commendata = json.loads(CommenHtml.text)
for i in Commendata['reviews']:
Commen = Commen + i.get('comments') + '\n'
else:
break
ALL_Information=[]
ALL_Information.append(HouseName)
ALL_Information.append(House_owner)
ALL_Information.append(price)
ALL_Information.append(Introduction_housing)
ALL_Information.append(Position)
ALL_Information.append(Notice)
ALL_Information.append(Commen)
return ALL_Information
def main(i):
    """Scrape search-result page *i* and append its listings to Stu_csv.csv.

    Args:
        i: 0-based page index of the Shenzhen search results.

    Each listing id found on the page is scraped with getHouseInformation
    and written as one CSV row; "write over" is printed after every row.
    """
    # items_per_grid=18 in the query, so page i starts at offset 18 * i;
    # a smaller step would make consecutive pages overlap.
    HouseNumberUrl = "https://zh.airbnb.com/api/v2/explore_tabs?version=1.4.5&satori_version=1.1.3&_format=for_explore_search_web&experiences_per_grid=20&items_per_grid=18&guidebooks_per_grid=20&auto_ib=false&fetch_filters=true&has_zero_guest_treatment=true&is_guided_search=true&is_new_cards_experiment=true&luxury_pre_launch=true&query_understanding_enabled=false&show_groupings=true&supports_for_you_v3=true&timezone_offset=480&client_session_id=53638ce3-becb-444f-bfdd-d6b301b93456&metadata_only=false&is_standard_search=true&refinement_paths%5B%5D=%2Fhomes&selected_tab_id=home_tab&adults=0&children=0&infants=0&toddlers=0&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0&allow_override%5B%5D=&s_tag=BoUbRf3d&section_offset=6&items_offset=" + str(18 * i) + "&screen_size=medium&query=%E6%B7%B1%E5%9C%B3&_intents=p1&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=CNY&locale=zh"
    # The context manager flushes and closes the file even if a listing fails.
    with open('Stu_csv.csv', 'a', newline='', encoding='utf-8-sig') as out:
        csv_write = csv.writer(out, dialect='excel')
        for number in getHouseNumber(HouseNumberUrl):
            stu2 = getHouseInformation(number)
            csv_write.writerow(stu2)
            # Progress marker: printed once per listing written.
            print("write over")
if __name__ == '__main__':
    # Column headers, in the same order as the rows built by getHouseInformation.
    stu1 = ['房名', '房主', '价格', '房源介绍', '位置', '须知', '所有评论']
    # Write the header and close the file before scraping starts: main()
    # reopens the same file in append mode, so a header still sitting in an
    # unflushed buffer could end up after the data rows.
    with open('Stu_csv.csv', 'a', newline='', encoding='utf-8-sig') as out:
        csv.writer(out, dialect='excel').writerow(stu1)
    # 18 result pages, indices 0..17.
    for i in range(18):
        main(i)