# Environment: PyCharm
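'''
Site: Lianjia (链家网)  URL: https://sh.lianjia.com/ershoufang/pudong/pg2/
A simple XPath-based scraper for the second-hand housing (ershoufang) listings on Lianjia.
For each listing it collects the title, the location, the house layout (e.g. three bedrooms
and one living room), the number of followers, the unit price and the total price.
'''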
# 用xpath,需要用到lxml,利用etree.HTML,将字符串转化为Element对象,Element对象具有xpath的方法,返回结果的列表,能够接受bytes类型的数据和str类型的数据
import requests
from lxml import etree
import re
import json
# Scrape one Lianjia listing page and dump the extracted records to house.json.

# Seconds to wait for the server; without a timeout requests.get() may block forever.
REQUEST_TIMEOUT = 10

# Leading run of digits in the follow-info text. The pattern always matches
# (possibly the empty string), so .group() is safe to call on the result.
_LEADING_DIGITS = re.compile(r'\d*')

# Common XPath prefix shared by every field of a listing.
_INFO = './/div[@class="info clear"]'


def _first(node, path):
    """Return the first result of evaluating XPath ``path`` on ``node``, or None if empty."""
    found = node.xpath(path)
    return found[0] if found else None


def _stripped(text):
    """Return ``text`` with surrounding whitespace removed, passing None through."""
    return None if text is None else text.strip()


def _parse_listing(li):
    """Build the record dict for one ``<li>`` of the listing page.

    Returns None when the element has no title node, so the caller can skip it
    (presumably a non-listing entry such as an advert; the original code raised
    IndexError on it). Any other field whose node is missing is stored as None.
    """
    title = _first(li, _INFO + '/div[@class="title"]/a/text()')
    if title is None:
        return None

    address = _first(li, _INFO + '/div[@class="address"]//text()')
    if address is not None:
        # Keep only the part before the first ' | ' separator.
        address = address.split(' | ')[0]

    person = _first(li, _INFO + '/div[@class="followInfo"]//text()')
    if person is not None:
        # Take the part before ' / ' and keep only its leading digits.
        person = _LEADING_DIGITS.match(person.split(' / ')[0]).group()

    return {
        'title': title.strip(),
        'position': _stripped(_first(li, _INFO + '/div[@class="flood"]//a[1]/text()')),
        'address': _stripped(address),
        'person': person,
        'unitPrice': _stripped(
            _first(li, _INFO + '/div[@class="priceInfo"]/div[@class="unitPrice"]//text()')
        ),
        'totalPrice': _stripped(
            _first(li, _INFO + '/div[@class="priceInfo"]'
                               '/div[@class="totalPrice totalPrice2"]/span/text()')
        ),
    }


house = list()
url = 'https://sh.lianjia.com/ershoufang/pudong/pg2/'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}

page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of parsing an error page and
# overwriting house.json with an empty list.
page.raise_for_status()
response = page.text

html = etree.HTML(response)
lis = html.xpath('//ul[@class="sellListContent"]/li')
# Debug output kept from the original script: shows the matched <li> elements.
print(lis)

for li in lis:
    item = _parse_listing(li)
    if item is not None:
        house.append(item)

# Write once, after all listings have been collected.
with open("house.json", 'w', encoding='utf-8') as f:
    json.dump(house, f, ensure_ascii=False, indent=2)
# See the accompanying resources.