# 小白专属,私信可回。 (Beginner-friendly; DM for questions.)
import requests
from bs4 import BeautifulSoup
from loguru import logger
import time
# Request headers: a desktop-browser User-Agent plus the session cookie.
# Anjuke rejects requests without a valid logged-in cookie, so the
# placeholder below MUST be replaced with your own cookie string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    # Paste your own cookie value here; an empty string will not work
    # against the live site, but keeps the module importable.
    'Cookie': '',
}
# Collect the detail-page URLs from one paginated listing page.
def get_url(base_url):
    """Fetch one listing page and return the detail-page URLs on it.

    Args:
        base_url: URL of a single paginated listing page.

    Returns:
        A list of href strings, one per property card. Returns an empty
        list when the listing section is absent — typically because the
        site served a slider-captcha page instead of real results —
        rather than crashing with AttributeError.
    """
    resp = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    listing = soup.find('section', class_='list')
    if listing is None:
        # Anti-bot page (or layout change): nothing to harvest here.
        return []
    return [div.find('a')['href']
            for div in listing.find_all('div', class_='property')]
# Parse one detail page and append a CSV row to the output file.
def parse_url(link_url, f):
    """Scrape one property detail page and append a CSV row to *f*.

    Columns written: title, total price, average price, layout, area
    (matching the header row written by start()). Sleeps one second
    after each successful scrape to avoid tripping the site's
    rate-limiting / IP ban.

    Args:
        link_url: detail-page URL produced by get_url().
        f: open text-mode file handle the CSV row is appended to.
    """
    resp = requests.get(link_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')

    def _text(tag, cls):
        # Join every stripped text fragment of the first matching element.
        # Raises AttributeError when the element is missing (find -> None).
        return ''.join(soup.find(tag, class_=cls).stripped_strings)

    try:
        title = _text('h1', 'title')
        print(title)
        price_num = _text('div', 'maininfo-price-wrap')        # total price
        price_avg = _text('div', 'maininfo-avgprice-price')    # price per m^2
        house_model = _text('div', 'maininfo-model-item maininfo-model-item-1')  # layout
        house_num = _text('div', 'maininfo-model-item maininfo-model-item-2')    # area
        f.write('{},{},{},{},{}\n'.format(title, price_num, price_avg, house_model, house_num))
        # Throttle: avoid a burst of requests getting the IP banned.
        time.sleep(1)
    except AttributeError:
        # A selector found nothing — almost always because the site served
        # a slider-captcha verification page instead of the listing.
        # Log and skip; re-fetching the URL after verification would be
        # the proper fix (TODO).
        logger.warning('failed to parse {}', link_url)
        print('==========err')
def start():
    """Scrape the first 10 Jimei listing pages into a dated CSV file."""
    # Route loguru output to a rotating log file (rotates at 500 MB).
    logger.add("runtime_err.log", rotation="500 MB")
    # One dated output file per run, e.g. ajk_jm_price2024-01-01.csv.
    # (strftime already returns str; no extra str() needed.)
    time_name = 'ajk_jm_price' + time.strftime("%Y-%m-%d", time.localtime()) + '.csv'
    # Keep a single file handle open for the whole run instead of
    # reopening it for every row.
    with open(time_name, 'a', encoding='utf-8') as f:
        f.write('{},{},{},{},{}\n'.format('标题', '总价', '均价', '房型', '面积'))
        # Site pages are 1-indexed (p1..): the original range(0, 10)
        # requested a bogus p0 and missed the 10th page.
        for page in range(1, 11):
            base_url = 'https://xm.anjuke.com/sale/jimei/p' + str(page) + '/'
            for link_url in get_url(base_url):
                parse_url(link_url, f)
if __name__ == '__main__':
    # (Stray markdown fence backticks after this call were removed —
    # they made the file a syntax error.)
    start()