Scraping Lianjia rental data, cleaning it, and visualizing the results

 lianjiaspider.py

import asyncio
import aiohttp
import pandas as pd
from lxml import etree


class LianjiaSpider(object):

    def __init__(self):
        self._headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"}
        self._data = list()

    async def get(self, url):
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, headers=self._headers, timeout=3) as resp:
                    if resp.status == 200:
                        result = await resp.text()
                        return result
            except Exception as e:
                print(e.args)

    async def parse_html(self):
        for page in range(1, 77):
            url = "https://sjz.lianjia.com/zufang/pg{}/".format(page)
            print("Crawling {}".format(url))
            html = await self.get(url)  # fetch the page source
            if html is None:  # the request failed or timed out, skip this page
                continue
            html = etree.HTML(html)
            self.parse_page(html)
            print("Saving data....")
            data = pd.DataFrame(self._data)
            data.to_csv("lianjia.csv", encoding='utf_8_sig')  # rewrite the full CSV after every page

    def parse_page(self, html):
        info_panel = html.xpath("//div[@class='info-panel']")
        for info in info_panel:
            region = info.xpath(".//span[@class='region']/text()")
            zone = info.xpath(".//span[@class='zone']/span/text()")
            meters = info.xpath(".//span[@class='meters']/text()")
            where = info.xpath(".//div[@class='where']/span[4]/text()")

            con = info.xpath(".//div[@class='con']/text()")
            floor = con[0]       # floor level, e.g. 高楼层(共33层)
            house_type = con[1]  # building style, e.g. 板楼 / 塔楼 (renamed to avoid shadowing the built-in type)

            agent = info.xpath(".//div[@class='con']/a/text()")[0]

            has = info.xpath(".//div[@class='left agency']//text()")

            price = info.xpath(".//div[@class='price']/span/text()")[0]
            price_pre = info.xpath(".//div[@class='price-pre']/text()")[0]
            look_num = info.xpath(".//div[@class='square']//span[@class='num']/text()")[0]

            one_data = {
                "region": region,
                "zone": zone,
                "meters": meters,
                "where": where,
                "louceng": floor,
                "type": type,
                "xiaoshou": agent,
                "has": has,
                "price": price,
                "price_pre": price_pre,
                "num": look_num
            }
            self._data.append(one_data)  # add this listing's record

    def run(self):
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(self.parse_html())]
        loop.run_until_complete(asyncio.wait(tasks))


if __name__ == '__main__':
    Lian_jia = LianjiaSpider()
    Lian_jia.run()
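
Note that parse_html still awaits each page one at a time, so the asyncio setup buys little here. A minimal sketch of downloading the listing pages concurrently with asyncio.gather, reusing the LianjiaSpider.get coroutine above (fetch_all is a hypothetical helper, not part of the original script):

import asyncio

async def fetch_all(spider, last_page=76):
    # hypothetical helper: download every listing page concurrently via spider.get()
    urls = ["https://sjz.lianjia.com/zufang/pg{}/".format(p) for p in range(1, last_page + 1)]
    return await asyncio.gather(*(spider.get(url) for url in urls))

# pages = asyncio.get_event_loop().run_until_complete(fetch_all(LianjiaSpider()))
# each entry is the page HTML (or None on failure) and can be fed to parse_page via etree.HTML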

lianjia.csv

  ,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,['集中供暖'],高楼层(共33层),['127.86平米\xa0\xa0'],0,2300,2018.11.08 更新,['凤凰城梧桐苑\xa0\xa0'],板楼,['南'],南焦租房,['3室2厅\xa0\xa0']
1,['集中供暖'],中楼层(共6层),['55平米\xa0\xa0'],0,1200,2018.11.04 更新,['华兴小区\xa0\xa0'],板楼,['南'],世纪公园租房,['1室1厅\xa0\xa0']
2,['集中供暖'],中楼层(共6层),['138平米\xa0\xa0'],0,2400,2018.11.04 更新,['河冶小区\xa0\xa0'],板楼,['南 北'],跃进租房,['3室2厅\xa0\xa0']
3,['集中供暖'],低楼层(共6层),['90平米\xa0\xa0'],1,1500,2018.11.06 更新,['瑞国花园\xa0\xa0'],板楼,['南'],跃进租房,['2室2厅\xa0\xa0']
4,['集中供暖'],低楼层(共14层),['180平米\xa0\xa0'],0,3500,2018.11.13 更新,['华脉新村\xa0\xa0'],板楼,['南 北'],四十中学租房,['4室2厅\xa0\xa0']
5,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['57平米\xa0\xa0'],0,3000,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['西'],南长租房,['1室1厅\xa0\xa0']
6,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['42.56平米\xa0\xa0'],0,2200,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
7,['集中供暖'],中楼层(共34层),['148平米\xa0\xa0'],0,2500,2018.11.08 更新,['北城国际B区\xa0\xa0'],板楼,['南 北'],沿东租房,['3室2厅\xa0\xa0']
8,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['40.09平米\xa0\xa0'],0,2100,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
9,"['近地铁', '集中供暖']",低楼层(共33层),['185平米\xa0\xa0'],0,22000,2018.11.10 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']
10,"['近地铁', '集中供暖']",低楼层(共33层),['242平米\xa0\xa0'],0,29000,2018.11.05 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']

Removing the unwanted characters (the literal \xa0 text, the list brackets and quotes, and the 平米 unit)

import re

# pandas wrote the raw xpath lists with repr(), so lianjia.csv contains the literal
# text "\xa0", list brackets/quotes and the 平米 unit; strip them line by line
pattern = re.compile(r'xa0')

with open('lianjia.csv', 'r', encoding='utf-8') as src, \
        open('lian_jia.csv', 'w', encoding='utf-8') as dst:
    for line in src:
        out = pattern.sub('', line)  # drop the "xa0" text, leaving stray backslashes
        # remove the leftover backslashes and the ['  '] list markers
        s = "".join("".join("".join(out.split("\\\\")).split("']")).split("['"))
        d = "".join(s.split("平米"))  # drop the 平米 (square metre) unit so meters is purely numeric
        dst.write(d)

lian_jia.csv

 ,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,集中供暖,高楼层(共33层),127.86,0,2300,2018.11.08 更新,凤凰城梧桐苑,板楼,南,南焦租房,3室2厅
1,集中供暖,中楼层(共6层),55,0,1200,2018.11.04 更新,华兴小区,板楼,南,世纪公园租房,1室1厅
2,集中供暖,中楼层(共6层),138,0,2400,2018.11.04 更新,河冶小区,板楼,南 北,跃进租房,3室2厅
3,集中供暖,低楼层(共6层),90,1,1500,2018.11.06 更新,瑞国花园,板楼,南,跃进租房,2室2厅
4,集中供暖,低楼层(共14层),180,0,3500,2018.11.13 更新,华脉新村,板楼,南 北,四十中学租房,4室2厅
5,"近地铁', '随时看房', '集中供暖",中楼层(共40层),57,0,3000,2018.11.09 更新,华润大厦,塔楼,西,南长租房,1室1厅
6,"近地铁', '随时看房', '集中供暖",中楼层(共40层),42.56,0,2200,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
7,集中供暖,中楼层(共34层),148,0,2500,2018.11.08 更新,北城国际B区,板楼,南 北,沿东租房,3室2厅
8,"近地铁', '随时看房', '集中供暖",中楼层(共40层),40.09,0,2100,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
9,"近地铁', '集中供暖",低楼层(共33层),185,0,22000,2018.11.10 更新,青鸟中山华府,板楼,北,大经租房,1室1厅
10,"近地铁', '集中供暖",低楼层(共33层),242,0,29000,2018.11.05 更新,青鸟中山华府,板楼,北,大经租房,1室1厅

Visualizing the number of listings per layout (户型)

import pandas as pd
import matplotlib.pyplot as plt

# header=0 skips the header row already present in lian_jia.csv while applying these column names
house = pd.read_csv('lian_jia.csv', header=0,
                    names=['', 'has', 'louceng', 'meters', 'num', 'price', 'price_pre',
                           'region', 'type', 'where', 'xiaoshou', 'zone'])
zone = house['zone'].value_counts()  # number of listings per layout, e.g. 3室2厅
plt.rcParams['font.sans-serif'] = ['FangSong']  # use a Chinese font so the labels render
fig, ax = plt.subplots(1, 1, dpi=200)  # set up the canvas
zone.head(10).plot(kind='bar', title='户型数量分布', ax=ax)  # top 10 layouts
plt.legend(['数量'])
plt.show()

Bar chart

[Figure 1: bar chart of listing counts for the top 10 layouts]
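
The same cleaned file can be used for other quick views as well; for instance, a minimal sketch that plots the average rent of the ten most common layouts:

import pandas as pd
import matplotlib.pyplot as plt

house = pd.read_csv('lian_jia.csv', header=0,
                    names=['', 'has', 'louceng', 'meters', 'num', 'price', 'price_pre',
                           'region', 'type', 'where', 'xiaoshou', 'zone'])
top_layouts = house['zone'].value_counts().head(10).index  # ten most common layouts
avg_price = house.groupby('zone')['price'].mean()          # mean monthly rent per layout
plt.rcParams['font.sans-serif'] = ['FangSong']
fig, ax = plt.subplots(1, 1, dpi=200)
avg_price[top_layouts].plot(kind='bar', title='各户型平均租金', ax=ax)
plt.legend(['平均租金'])
plt.show()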

 
