"""爬取安居客长沙新房的位置、户型、面积等信息。(Scrape location, unit layout, floor area and other info for new homes in Changsha from Anjuke.)"""

import requests
import bs4
import time
import random
import pandas as pd
import os

# Accumulate one dict per listing; a single DataFrame is built at the very
# end, which is cheaper than concatenating many one-row frames and also
# works when zero listings were parsed (pd.concat([]) would raise).
house_info = []

# Desktop Chrome user-agent: Anjuke blocks requests with no/odd UA.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
}

# Create the dump directory once, up front; exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir.
os.makedirs('anjukecs', exist_ok=True)

# Pages 1-4; raise the upper bound (e.g. to 50) to crawl more pages.
for i in range(1, 5):
    url = "https://cs.fang.anjuke.com/loupan/all/p{}/#filtersort".format(i)

    print("开始爬取安居客平台长沙新房第%s页信息....." % (str(i)))
    response = requests.get(url=url, headers=headers)

    # Keep a raw HTML copy for debugging / offline re-parsing.
    # Mode 'w' overwrites a stale copy from an earlier run instead of
    # appending a duplicate page to the same file ('a+' did).
    with open('anjukecs/page{}.html'.format(i), 'w', encoding='utf-8') as f:
        f.write(response.text)

    # Parse the page; each listing lives in a <div class="infos">.
    bsoup = bs4.BeautifulSoup(response.text, 'lxml')
    house_list = bsoup.find_all('div', class_="infos")

    for house in house_list:
        # The first <a> holds the development name; assumed always present.
        title = house.find('a').text.strip()

        # The remaining fields are optional: when the tag is missing,
        # .find() returns None and the attribute access raises
        # AttributeError — catch exactly that instead of a bare except,
        # so real bugs (e.g. NameError) are not silently swallowed.
        try:
            house_type = house.find('a', class_='huxing').text.replace('\t', '').replace('\n', '').strip()
        except AttributeError:
            house_type = ''

        try:
            area = house.find('span', class_='building-area').text
        except AttributeError:
            area = ''

        try:
            address = house.find('a', class_='address').span.text.replace(" ", "").strip()
        except AttributeError:
            address = ''

        # 'title' fixes the original 'titile' typo in the output column.
        house_info.append({'title': title, 'house_type': house_type,
                           'area': area, 'address': address})

    # Random 3-4s pause between pages to reduce the risk of rate-limiting.
    time.sleep(random.randrange(3, 5))

# Build the result table in one shot and export without the index column.
house_info2 = pd.DataFrame(house_info,
                           columns=['title', 'house_type', 'area', 'address'])
house_info2.to_excel('cs_house_info.xlsx', index=False)

# 爬取安居客长沙新房的位置、户型、面积等信息。
# 你可能感兴趣的:(爬虫)