Scraping Anjuke (安居客) Property Listings

Written in my spare time; still working as of this writing.

Aimed at beginners — feel free to message me with questions.


```python
import requests
from bs4 import BeautifulSoup
from loguru import logger
import time

# Build the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Cookie': '',  # Put your own cookie here; the site will not return data without it. Search online if you are unsure how to get it.
}

# Collect the detail-page URLs from one listing page
def get_url(base_url):
    resp = requests.get(base_url, headers=headers)
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')

    # Each property card is a <div class="property"> inside <section class="list">
    mes_divs = soup.find('section', class_='list').find_all('div', class_='property')
    link_urls = []
    for mes_div in mes_divs:
        link = mes_div.find('a')['href']
        link_urls.append(link)
    return link_urls

# Parse one detail page and append a row to the open CSV file
def parse_url(link_url, f):
    resp = requests.get(link_url, headers=headers)
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')

    try:
        # Title
        title = list(soup.find('h1', class_='title').stripped_strings)
        title = ''.join(title)
        print(title)
        # Total price
        price_num = list(soup.find('div', class_='maininfo-price-wrap').stripped_strings)
        price_num = ''.join(price_num)

        # Average price (per square metre)
        price_avg = list(soup.find('div', class_='maininfo-avgprice-price').stripped_strings)
        price_avg = ''.join(price_avg)

        # Layout (number of rooms)
        house_model = list(soup.find('div', class_='maininfo-model-item maininfo-model-item-1').stripped_strings)
        house_model = ''.join(house_model)

        # Floor area
        house_num = list(soup.find('div', class_='maininfo-model-item maininfo-model-item-2').stripped_strings)
        house_num = ''.join(house_num)

        # Orientation
        house_des = list(soup.find('div', class_='maininfo-model-item maininfo-model-item-3').stripped_strings)
        house_des = ''.join(house_des)

        # Address
        house_add = list(soup.find('div', class_='maininfo-meta').stripped_strings)
        house_add = ''.join(house_add)

        # Detailed house-info block; the labels are turned into line prefixes
        house_info = list(soup.find('div', class_='houseInfo').stripped_strings)
        house_info = ''.join(house_info).replace('房源信息', '').replace('交易信息', '').replace('物业类型', '\n物业类型: ').replace('产权年限', '\n产权年限: ').replace('参考预算', '\n参考预算: ').replace('房贷计算发布信息', '').replace('发布公司', '\n发布公司: ').replace('发布时间', '\n发布时间: ').replace('营业执照', '\n营业执照: ').replace('官方核验房源', '').replace('核验编码', '\n核验编码: ')

        f.write('{},{},{},{},{}\n'.format(title, price_num, price_avg, house_model, house_num))

        # Throttle requests so the IP does not get banned for hammering the site
        time.sleep(1)
    except Exception as e:
        # Most failures here are pages that demand slider (CAPTCHA) verification; not handled yet.
        # The idea is to re-request the URL when verification is detected (see the sketch after this listing).
        logger.error('Failed to parse {}: {}'.format(link_url, e))


def start():
    # loguru log sink; rotate the log file once it reaches 500 MB
    logger.add("runtime_err.log", rotation="500 MB")
    # Output file name, stamped with today's date
    time_name = 'ajk_jm_price' + str(time.strftime("%Y-%m-%d", time.localtime())) + '.csv'
    # Keep the file open for the whole run instead of reopening it for every row
    with open(time_name, 'a', encoding='utf-8') as f:
        f.write('{},{},{},{},{}\n'.format('标题', '总价', '均价', '房型', '面积'))
        # Only crawl the first 10 listing pages (pagination starts at p1)
        for i in range(1, 11):
            base_url = 'https://xm.anjuke.com/sale/jimei/p' + str(i) + '/'
            link_urls = get_url(base_url)
            for link_url in link_urls:
                parse_url(link_url, f)

if __name__ == '__main__':
    start()
```
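
The except branch in parse_url currently only logs pages that trigger Anjuke's slider verification; the comments mention re-requesting the URL when that happens. A minimal sketch of that idea, assuming a normal detail page always contains an `<h1 class="title">` element (the helper name and retry count are made up here, not part of the original script):

```python
import time

import requests
from bs4 import BeautifulSoup


def get_soup_with_retry(link_url, headers, max_retries=3):
    """Re-request a detail page a few times when it looks like a verification page."""
    for attempt in range(max_retries):
        resp = requests.get(link_url, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        # A normal detail page has an <h1 class="title">; the verification page does not.
        if soup.find('h1', class_='title') is not None:
            return soup
        # Back off a little before retrying, growing the delay each attempt.
        time.sleep(2 * (attempt + 1))
    return None  # the caller decides how to log/skip pages that never load
```

parse_url could then call this helper instead of requests.get directly and simply skip the row when it returns None.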
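
One more caveat: rows are written with a plain format string, so a title that itself contains a comma will shift the columns. If that becomes a problem, the standard csv module quotes such fields automatically; a small sketch (the file should then be opened with newline='', and write_row is a hypothetical helper, not from the script above):

```python
import csv


def write_row(f, row):
    # csv.writer quotes fields that contain commas, so column alignment is preserved
    csv.writer(f).writerow(row)

# Usage, keeping the same field order as the script:
# write_row(f, ['标题', '总价', '均价', '房型', '面积'])
# write_row(f, [title, price_num, price_avg, house_model, house_num])
```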
