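# Airbnb (爱彼迎) pages are rendered with JavaScript, so fetching the static HTML does not work; Selenium is used to load the rendered page instead.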
from selenium import webdriver
from bs4 import BeautifulSoup
import bs4
import time
from selenium.webdriver.firefox.options import Options
def getHTMLText(url):
    """Load *url* in a headless Firefox and return the rendered page source.

    The page is opened in a real browser through Selenium and given a fixed
    amount of time to finish rendering before its HTML is read.

    Args:
        url: Address of the page to load.

    Returns:
        The rendered HTML as a string, or ``None`` if the browser could not
        be started or the page could not be loaded.
    """
    driver = None
    try:
        options = Options()
        # Run Firefox without opening a visible window.
        options.add_argument('-headless')
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        # Fixed wait so the page's scripts can finish rendering.
        time.sleep(10)
        # page_source reflects the DOM after rendering, not the raw response.
        text = driver.page_source
        print("解析成功")
        return text
    except Exception as exc:
        # Best effort: report the failure and let the caller skip this page.
        print("解析失败", repr(exc))
        return None
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Firefox process running.
        if driver is not None:
            driver.quit()
def parsePage(html):
    """Extract listing names and prices from a rendered results page.

    Args:
        html: Page source to parse. ``None`` (a page that failed to load)
            is reported as a failure instead of being parsed.

    Returns:
        A list of ``[name, price]`` pairs of strings in document order, or
        ``None`` if the page could not be parsed.
    """
    if html is None:
        # Nothing was downloaded, so there is nothing to parse.
        print("信息存储失败")
        return None
    try:
        soup = BeautifulSoup(html, 'html.parser')
        # Listing names: text of every <div class="_qrfr9x5">.
        names = [tag.text for tag in soup.find_all("div", class_="_qrfr9x5")]
        # Prices: text of the second <span> inside every
        # <div class="_1ixtnfc">.
        prices = [
            tag.find_all('span')[1].text
            for tag in soup.find_all("div", class_="_1ixtnfc")
        ]
    except Exception as exc:
        # Best effort: an unexpected page layout (e.g. a price block with
        # fewer than two spans) discards this page rather than aborting.
        print("信息存储失败", repr(exc))
        return None
    # Pair names and prices positionally. zip stops at the shorter list, so
    # a page with unequal counts still yields the pairs that do line up.
    result = [[name, price] for name, price in zip(names, prices)]
    print("信息存储成功")
    return result
def save_to_txt(result, path='price_village5.txt'):
    """Append scraped rows to a UTF-8 text file, one row per line.

    Each row is written as ``"<row[0]> <row[1]>\\n"``. The file is opened in
    append mode so rows saved by earlier calls are kept.

    Args:
        result: Sequence of rows, each indexable at positions 0 and 1.
            ``None`` or an empty sequence writes nothing.
        path: Destination file; defaults to the original hard-coded name.
    """
    if not result:
        # A failed or empty page has nothing to save.
        return
    # The with-block closes the file even if a write raises.
    with open(path, 'a+', encoding='utf-8') as f:
        f.writelines(f"{row[0]} {row[1]}\n" for row in result)
def main():
    """Scrape successive result pages and append their listings to disk.

    Builds one URL per page by appending an ``items_offset`` value that
    advances by 20 per page, then fetches, parses and saves each page. A
    page that fails at any step is skipped so the remaining pages are still
    processed.
    """
    # The query separators were previously corrupted by HTML-entity decoding
    # ("&curren" -> "¤", "&sect" -> "§"); the intended parameters are
    # current_tab_id and section_offset.
    start_url = 'https://www.airbnb.cn/s/深圳/homes?refinement_paths[]=%2Fhomes&current_tab_id=home_tab&selected_tab_id=home_tab&screen_size=large&hide_dates_and_guests_filters=false&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0&s_tag=6kIFXvRQ&section_offset=4&items_offset='
    depth = 15  # number of result pages to fetch
    for i in range(depth):
        url = start_url + str(20 * i)
        try:
            html = getHTMLText(url)
            if html is None:
                # Page could not be loaded; move on to the next one.
                continue
            infoList = parsePage(html)
            if not infoList:
                # Nothing was extracted from this page.
                continue
            save_to_txt(infoList)
        except Exception as exc:
            # Keep going on per-page errors, but say what went wrong.
            # KeyboardInterrupt is not caught, so Ctrl-C still stops the run.
            print("page", i, "skipped:", repr(exc))
            continue
# Run the scraper only when this file is executed as a script, so that
# importing the module does not start a crawl.
if __name__ == "__main__":
    main()