爬取链家网房价数据

感觉最近做的东西好菜~~随便了。下面的脚本用 requests 和 lxml 抓取链家新房楼盘列表的第 1~18 页,并把楼盘名、地址、户型、面积、价格等信息写入 CSV 文件。
爬取链家网房价数据_第1张图片

import atexit
import csv

import requests
from lxml import etree

# HTTP headers attached to every page request: a Referer pointing at the
# first listing page and a desktop Chrome User-Agent string.
headers = {
    'Referer': 'https://zs.fang.lianjia.com/loupan/nht1pg1/',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
}

# Output CSV shared by the whole script; newline='' lets the csv module
# control line endings itself.
fp = open('D://链家房价数据.csv', 'wt', newline='', encoding='utf8')
# The handle lives at module level and is used by get_info, so it cannot be
# wrapped in a `with` block here. Close it at interpreter exit instead so
# buffered rows are always flushed to disk.
atexit.register(fp.close)
writer = csv.writer(fp)
# Header row; column order must match the rows written by get_info.
writer.writerow(('楼盘名', '地址', '房间格式', '房间面积', '价格', '起价', '优点'))

def get_html(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The decoded page text, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid UTF-8.
    """
    try:
        # An explicit timeout keeps a stalled server from blocking the run.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print('Request failed for {}: {}'.format(url, exc))
        return None

    if response.status_code != 200:
        print('Unexpected status {} for {}'.format(response.status_code, url))
        return None

    try:
        return response.content.decode('utf8')
    except UnicodeDecodeError as exc:
        print('Could not decode {} as UTF-8: {}'.format(url, exc))
        return None


def _parse_listing(node):
    """Build one CSV row from a single ``resblock-desc-wrapper`` element.

    Args:
        node: Element whose children hold the listing's name, location,
            room layout, area, prices and tags.

    Returns:
        A list ``[name, address, rooms, area, price, starting_price, tags]``.

    Raises:
        IndexError: If any required field is absent from *node*.
    """
    name = node.xpath(
        "div[@class='resblock-name']/a[@class='name ']/text()")[0]

    # The address is three separate nodes joined into "a/b/c".
    address = '/'.join((
        node.xpath("div[@class='resblock-location']/span[1]/text()")[0],
        node.xpath("div[@class='resblock-location']/span[2]/text()")[0],
        node.xpath("div[@class='resblock-location']/a/text()")[0],
    ))

    # The first room span is required; the second is optional.
    rooms = node.xpath("a[@class='resblock-room']/span[1]/text()")[0]
    extra_rooms = node.xpath("a[@class='resblock-room']/span[2]/text()")
    if extra_rooms:
        rooms = rooms + '/' + extra_rooms[0]

    area = node.xpath("div[@class='resblock-area']/span/text()")[0]

    price = node.xpath(
        "div[@class='resblock-price']/div[@class='main-price']/span[@class='number']/text()")[0]
    price += '元/平(均价)'
    starting_price = node.xpath(
        "div[@class='resblock-price']/div[@class='second']/text()")[0]

    # Tag text nodes include whitespace-only entries; drop those.
    tag_texts = (
        text.strip() for text in node.xpath("div[@class='resblock-tag']//text()")
    )
    tags = ','.join(text for text in tag_texts if text)

    return [name, address, rooms, area, price, starting_price, tags]


def get_info(html):
    """Extract every listing from a page and append it to the CSV file.

    Each complete listing is printed and written through the module-level
    ``writer``. A listing with a missing required field is skipped with a
    notice; the rest of the page is still processed.

    Args:
        html: Page markup as text. ``None`` or an empty string (for example
            from a failed download) is ignored.

    Returns:
        None.
    """
    if not html:
        # Nothing to parse, e.g. the download failed upstream.
        return

    selector = etree.HTML(html)
    if selector is None:
        # Defensive: no root element means there is nothing to query.
        return

    wrappers = selector.xpath(
        '//li[contains(@class, "resblock-list")]/div[@class="resblock-desc-wrapper"]')
    for wrapper in wrappers:
        try:
            row = _parse_listing(wrapper)
        except IndexError:
            # An XPath query matched nothing for a required field.
            print('Skipping a listing with a missing field')
            continue
        print(row)
        # Kept outside the try so write failures are not silently hidden.
        writer.writerow(row)

if __name__ == '__main__':
    # Listing pages 1 through 18 of the new-home index.
    urls = ['https://zs.fang.lianjia.com/loupan/nht1pg{}/'.format(i) for i in range(1,19)]
    try:
        for url in urls:
            html = get_html(url)
            if html is None:
                # get_html returns None when a page could not be fetched;
                # skip it rather than handing None to the parser.
                continue
            get_info(html)
    finally:
        # Close the CSV handle so buffered rows reach disk even if a page
        # raises part-way through the run.
        fp.close()
        

结果
爬取链家网房价数据_第2张图片

你可能感兴趣的:(爬虫)