Scraping Anjuke for second-hand housing in Xiaoshan, Hangzhou.
For each listing, grab the link, community name, address, listing details, and price, and save them to a CSV file.
The code is as follows:
# -*- coding: utf-8 -*-
import csv
import io
import sys
import time

import requests
from lxml import etree

# Re-wrap stdout so Chinese text prints correctly on a GBK Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
def anju():
    # Request headers; fill in your own cookie and user-agent
    headers = {
        'authority': 'hangzhou.anjuke.com',
        'method': 'GET',
        'path': '/community/xiaoshan/?tdsourcetag=s_pctim_aiomsg',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'your own cookie',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'your own user-agent'
    }
    # Loop over the listing pages
    for i in range(1, 25):
        res = requests.get(f'https://hangzhou.anjuke.com/community/xiaoshan/p{i}/', headers=headers)
        # Hand the page HTML to the parser
        parse(res.text)
        # Pause between pages to avoid triggering anti-scraping limits
        time.sleep(12)
def parse(html):
    root = etree.HTML(html)
    # Append to the CSV; newline='' prevents blank lines on Windows
    with open("杭州萧山二手房.csv", 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        # Write the header row only once, when the file is still empty
        if f.tell() == 0:
            writer.writerow(['链接', '小区', '地址', '房源信息', '价格'])
        # Extract each listing block on the page
        for i in range(2, 31):
            try:
                url = ''.join(root.xpath(f'//*[@id="list-content"]/div[{i}]/div[1]/h3/a/@href'))
                name = ''.join(root.xpath(f'//*[@id="list-content"]/div[{i}]/div[1]/h3/a/text()'))
                site = ''.join(root.xpath(f'//*[@id="list-content"]/div[{i}]/div[1]/address/text()'))
                house = ''.join(root.xpath(f'//*[@id="list-content"]/div[{i}]/div[1]/p[2]/span/text()'))
                price = ''.join(root.xpath(f'//*[@id="list-content"]/div[{i}]/div[2]/p[1]/strong/text()')) + '元/平米'
                # Write one listing per row
                writer.writerow([url, name, site, house, price])
                print(url, name, site, house, price)
            except Exception:
                # On a parse error, record a placeholder row of zeros
                writer.writerow([0, 0, 0, 0, 0])
if __name__ == '__main__':
    anju()
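
To sanity-check the output, you can load the CSV back and inspect a few rows. This is a minimal sketch assuming pandas is installed (the standard csv module works just as well); the column names match the header row written above:

import pandas as pd

# Load the scraped data; utf-8-sig matches the encoding used when writing
df = pd.read_csv("杭州萧山二手房.csv", encoding='utf-8-sig')
print(df.shape)   # (rows, columns); up to 29 listings per page
print(df.head())  # first rows: 链接, 小区, 地址, 房源信息, 价格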