A spider for crawling Lianjia rental listings in a given region (Hangzhou, Nanjing, etc.). Lianjia only exposes the first 100 result pages, at 30 listings per page, so in practice at most the first 3000 listings can be crawled.
The project calls for analyzing rental listings published in a given region over a given period. For each listing the spider extracts the name (name), district (dist), floor area (square), price (price), and remarks (detail), then saves everything to an Excel file with pandas.DataFrame.to_excel(). Concurrent downloading uses futures.ThreadPoolExecutor, as sketched below.
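In skeleton form the flow looks like this (a minimal sketch; fetch_page and its placeholder row are hypothetical stand-ins for the real crawl() shown in the full listing further down):

from concurrent import futures
import pandas as pd

rows = []

def fetch_page(page):
    # hypothetical stand-in: the real crawl() parses up to 30 listings per page
    rows.append(['some name', 'some dist', '60', '3000', 'some detail'])

with futures.ThreadPoolExecutor(max_workers=10) as e:
    e.map(fetch_page, range(1, 101))  # Lianjia exposes pages 1..100

df = pd.DataFrame(rows, columns=['名称', '地区', '面积', '价格', '备注'])
df.to_excel('./ans.xls', index=False)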
For the time analysis, each listing's detail-page URL is needed: the publication date sits on the detail page, and time = re.sub('[\u4e00-\u9fa5]+', '', time) strips the Chinese characters to leave just the date string, which is then parsed with pd.to_datetime().
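For example (a minimal sketch; the raw string is a made-up stand-in for the text scraped from a detail page):

import re
import pandas as pd

raw = '发布于2019-05-20'                      # hypothetical detail-page text
clean = re.sub('[\u4e00-\u9fa5]+', '', raw)  # -> '2019-05-20'
print(pd.to_datetime(clean))                 # 2019-05-20 00:00:00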
GitHub repo: Joovo/lianjia_spider
Code:
import re
from concurrent import futures
from threading import Lock

import pandas as pd
import requests
from lxml import etree

root_url = 'https://hz.lianjia.com'
s = requests.session()
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
csv = pd.DataFrame(columns=['名称', '地区', '面积', '价格', '备注'])
csv_lock = Lock()  # pd.concat on a shared DataFrame is not thread-safe


def crawl(page):
    """Parse one listing page (up to 30 listings) and append rows to the global DataFrame."""
    global csv
    page_url = root_url + '/zufang/pg' + str(page) + '/#contentList'
    r = s.get(page_url, headers=header)
    r.encoding = r.apparent_encoding
    tree = etree.HTML(r.text)
    for num in range(1, 31):
        num = str(num)
        name_xpath = '//*[@id="content"]/div[1]/div[1]/div[' + num + ']/div/p[1]/a//text()'
        detail_xpath = '//*[@id="content"]/div[1]/div[1]/div[' + num + ']/div/p[2]//text()'
        detail_url_xpath = '//*[@id="content"]/div[1]/div[1]/div[' + num + ']/div/p[1]/a/@href'
        price_xpath = '//*[@id="content"]/div[1]/div[1]/div[' + num + ']/div/span/em//text()'
        detail_url = tree.xpath(detail_url_xpath)
        if not detail_url:  # page holds fewer than 30 listings
            break
        detail_r = s.get(root_url + detail_url[0], headers=header)
        # skip listings published outside the target period
        if not check_time(detail_r):
            continue
        # name
        name = tree.xpath(name_xpath)[0].strip()
        # detail: join the stripped text fragments of the remarks line
        detail = ''.join(_.strip() for _ in tree.xpath(detail_xpath))
        # dist: the district is the first '/'-separated field when all five are present
        dist = detail.split('/')[0] if detail.count('/') == 4 else ''
        # square: floor area, digits only (drop the trailing ㎡)
        square_match = re.search(r'\d+㎡', detail)
        square = square_match.group()[:-1] if square_match else ''
        # price
        price = tree.xpath(price_xpath)[0]
        new_line = pd.DataFrame([[name, dist, square, price, detail]],
                                columns=['名称', '地区', '面积', '价格', '备注'])
        with csv_lock:
            csv = pd.concat([csv, new_line])
    print(page)


def check_time(r):
    """Return True if the listing's publication date falls in the target window."""
    r.encoding = r.apparent_encoding
    tree = etree.HTML(r.text)
    time_xpath = '/html/body/div[3]/div[1]/div[3]/div[1]/text()'
    time = ''.join(_.strip() for _ in tree.xpath(time_xpath))
    # strip the Chinese characters, leaving only the date string
    time = re.sub('[\u4e00-\u9fa5]+', '', time)
    date = pd.to_datetime(time, errors='coerce')  # NaT if the date can't be parsed
    if pd.isna(date):
        return False
    return pd.to_datetime('2018-07-01') <= date <= pd.to_datetime('2019-08-01')


if __name__ == '__main__':
    with futures.ThreadPoolExecutor(max_workers=10) as e:
        e.map(crawl, range(1, 101))
    csv.to_excel('./ans.xls', index=False)
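One caveat on the output format: writing .xls files through to_excel() depends on the xlwt package, and recent pandas releases have dropped .xls write support entirely, so on a current environment saving to ans.xlsx (backed by openpyxl) is the safer choice.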