Python链家租房信息爬虫

用于爬取链家某地区(杭州、南京等)租房信息的爬虫。链家只开放了前100页供查看,每页30条,因此实际上最多只能爬取前3000条信息。

项目需求是分析某地区某段时间内发布的租房信息,因此需要爬取每条房源的名称 name、地区 dist、面积 square、价格 price、备注 detail,并用 pandas.DataFrame.to_excel() 保存为 Excel 文件。并发下载使用 futures.ThreadPoolExecutor。

针对时间分析,需要得到具体的页面url信息,在详情页内找到对应的时间,并用time = re.sub('[\u4e00-\u9fa5]*', '', time)去掉中文获取时间戳。时间戳的分析使用pd.to_datetime()

Github地址:Joovo/lianjia_spider

代码:

import re
import threading
from concurrent import futures

import pandas as pd
import requests
from lxml import etree

# Site root that every listing-page and detail-page path is appended to.
root_url = 'https://hz.lianjia.com'

# One HTTP session shared by all requests the crawler makes.
s = requests.Session()

# Request headers: a desktop-browser User-Agent sent with every request.
header = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.121 Safari/537.36'
    ),
}

# Accumulated result table; crawl() rebinds this name as rows are added.
# Columns, in order: name, district, area, price, remarks.
csv = pd.DataFrame(
    columns=[
        '名称',
        '地区',
        '面积',
        '价格',
        '备注',
    ],
)


# crawl() runs on several worker threads at once; this lock serialises the
# read-modify-write of the shared ``csv`` frame so no page's rows are lost.
_csv_lock = threading.Lock()


def crawl(page):
    """Scrape one listing page and append its accepted rentals to ``csv``.

    Downloads ``<root_url>/zufang/pg<page>/`` and walks the 30 listing slots
    of the page. For every slot that has a detail link, the detail page is
    fetched and passed to ``check_time``; listings it rejects are skipped.
    For the remaining ones the name, district, area, price and the joined
    remarks line are extracted from the listing page itself.

    Args:
        page: Page number to fetch; converted to ``str`` before use.

    Returns:
        None. The rows are appended to the module-level ``csv`` DataFrame
        (one ``pd.concat`` per page, done under ``_csv_lock``) and the page
        number is printed when the page is finished.
    """
    global csv

    page = str(page)
    page_url = root_url + '/zufang/pg' + page + '/#contentList'
    r = s.request('GET', headers=header, url=page_url)
    r.encoding = r.apparent_encoding
    tree = etree.HTML(r.text)

    rows = []
    for num in range(1, 31):
        # All fields of one listing live under the same container node.
        item_xpath = '//*[@id="content"]/div[1]/div[1]/div[' + str(num) + ']/div'

        detail_urls = tree.xpath(item_xpath + '/p[1]/a/@href')
        if not detail_urls:
            # Slot without a listing link (e.g. a page with fewer than 30
            # entries): nothing to fetch, so skip instead of raising.
            continue

        detail_r = s.get(headers=header, url=root_url + detail_urls[0])
        # Drop listings whose publish time is outside the accepted window.
        if not check_time(detail_r):
            continue

        # name
        name_parts = tree.xpath(item_xpath + '/p[1]/a//text()')
        name = name_parts[0].strip() if name_parts else ''

        # detail: every text node of the description line, whitespace removed
        detail_parts = tree.xpath(item_xpath + '/p[2]//text()')
        detail = ''.join(part.strip() for part in detail_parts)

        # dist: only present when the description has exactly five fields
        if detail.count('/') == 4:
            dist = detail.split('/')[0]
        else:
            dist = ''

        # square: digits in front of the unit sign; empty when not listed
        square_match = re.search(r'\d+㎡', detail)
        square = square_match.group()[:-1] if square_match else ''

        # price
        price_parts = tree.xpath(item_xpath + '/span/em//text()')
        price = price_parts[0] if price_parts else ''

        rows.append([name, dist, square, price, detail])

    if rows:
        new_lines = pd.DataFrame(rows,
                                 columns=['名称', '地区', '面积', '价格', '备注'])
        # Rebinding the shared frame must be atomic with respect to the
        # other workers, otherwise concurrent pages overwrite each other.
        with _csv_lock:
            csv = pd.concat([csv, new_lines])
    print(page)


def check_time(r, start='2018-07-01', end='2019-08-01'):
    """Tell whether a listing's detail page shows a time inside a window.

    The text nodes at a fixed XPath of the detail page are joined, all
    characters in the CJK range ``\\u4e00-\\u9fa5`` are removed, and the
    remainder is parsed with ``pd.to_datetime``.

    Args:
        r: Response of the detail page; its ``encoding`` is overwritten
            with ``apparent_encoding`` before ``text`` is read.
        start: Inclusive lower bound, anything ``pd.to_datetime`` accepts.
        end: Inclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        True when the parsed time lies within ``[start, end]``; False when
        it lies outside, or when no parseable time is found on the page.
    """
    r.encoding = r.apparent_encoding
    tree = etree.HTML(r.text)
    if tree is None:
        # Nothing could be parsed from the response body.
        return False

    time_xpath = '/html/body/div[3]/div[1]/div[3]/div[1]/text()'
    pieces = [piece.strip() for piece in tree.xpath(time_xpath)]
    stamp = re.sub('[\u4e00-\u9fa5]*', '', ''.join(pieces))

    # errors='coerce' turns unparseable text into NaT instead of raising,
    # so one odd detail page cannot abort the crawl of a whole listing page.
    date = pd.to_datetime(stamp, errors='coerce')
    if pd.isna(date):
        return False
    return bool(pd.to_datetime(start) <= date <= pd.to_datetime(end))


if __name__ == '__main__':
    # Crawl pages 1..100 on a pool of 10 threads, then dump the collected
    # rows to an Excel file.
    with futures.ThreadPoolExecutor(max_workers=10) as e:
        jobs = {e.submit(crawl, page): page for page in range(1, 101)}
        # Executor.map() only re-raises a worker's exception when its result
        # is consumed, so failures used to vanish silently. Inspect every
        # future and report the pages that failed.
        for job in futures.as_completed(jobs):
            error = job.exception()
            if error is not None:
                print('page %s failed: %r' % (jobs[job], error))
    # NOTE(review): writing the legacy .xls format needs an engine that
    # recent pandas releases may no longer provide; confirm for the
    # installed version.
    csv.to_excel('./ans.xls', index=False)

你可能感兴趣的:(※,Python,爬虫)