lianjiaspider.py
import asyncio
import aiohttp
import pandas as pd
from lxml import etree
class LianjiaSpider(object):
    """Crawl the rental listing pages of sjz.lianjia.com and dump them to CSV.

    Pages are fetched one after another with aiohttp, parsed with lxml and
    accumulated in memory; the collected rows are written out once at the end.

    Args:
        first_page: Number of the first listing page to fetch (inclusive).
        last_page: Number of the last listing page to fetch (inclusive).
        output_path: Destination of the CSV file.
    """

    def __init__(self, first_page=1, last_page=76, output_path="lianjia.csv"):
        self._headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"}
        self._first_page = first_page
        self._last_page = last_page
        self._output_path = output_path
        self._data = list()  # one dict per listing

    async def get(self, url, session=None):
        """Fetch *url* and return the response body as text.

        Args:
            url: Address to request.
            session: Optional aiohttp session to reuse. When omitted, a
                throwaway session is opened for this single request.

        Returns:
            The page text for an HTTP 200 response, otherwise ``None``
            (non-200 status, or the request failed).
        """
        if session is None:
            async with aiohttp.ClientSession() as own_session:
                return await self.get(url, session=own_session)
        try:
            async with session.get(url, headers=self._headers, timeout=3) as resp:
                if resp.status == 200:
                    return await resp.text()
        except Exception as e:
            # Deliberate best effort: one failing page must not stop the
            # crawl. Report which URL failed; a timeout has empty ``args``.
            print(url, repr(e))
        return None

    async def parse_html(self):
        """Fetch every configured page, parse it and write the CSV file."""
        # One session for the whole crawl instead of one per page.
        async with aiohttp.ClientSession() as session:
            for page in range(self._first_page, self._last_page + 1):
                url = "https://sjz.lianjia.com/zufang/pg{}/".format(page)
                print("正在爬取{}".format(url))
                text = await self.get(url, session=session)  # page content
                if not text:
                    # get() returned nothing usable; skip this page rather
                    # than handing None to the HTML parser.
                    continue
                tree = etree.HTML(text)
                if tree is None:
                    continue
                self.parse_page(tree)
        print("正在存储数据....")
        data = pd.DataFrame(self._data)
        data.to_csv(self._output_path, encoding='utf_8_sig')  # write the file

    @staticmethod
    def _pick(values, index=0):
        """Return ``values[index]``, or ``None`` when the list is too short."""
        return values[index] if len(values) > index else None

    def parse_page(self, html):
        """Extract one record per ``info-panel`` div and append it to the data.

        Args:
            html: Parsed document (any object exposing ``xpath``).

        Fields missing from a listing are stored as ``None`` instead of
        raising ``IndexError``, so one incomplete listing does not abort
        the crawl.
        """
        pick = self._pick
        for info in html.xpath("//div[@class='info-panel']"):
            con = info.xpath(".//div[@class='con']/text()")
            one_data = {
                "region": info.xpath(".//span[@class='region']/text()"),
                "zone": info.xpath(".//span[@class='zone']/span/text()"),
                "meters": info.xpath(".//span[@class='meters']/text()"),
                "where": info.xpath(".//div[@class='where']/span[4]/text()"),
                "louceng": pick(con, 0),  # floor
                "type": pick(con, 1),  # building style
                "xiaoshou": pick(info.xpath(".//div[@class='con']/a/text()")),
                "has": info.xpath(".//div[@class='left agency']//text()"),
                "price": pick(info.xpath(".//div[@class='price']/span/text()")),
                "price_pre": pick(info.xpath(".//div[@class='price-pre']/text()")),
                "num": pick(info.xpath(".//div[@class='square']//span[@class='num']/text()")),
            }
            self._data.append(one_data)  # collect the record

    def run(self):
        """Run the whole crawl to completion on a fresh event loop."""
        # asyncio.run propagates exceptions from parse_html; the previous
        # asyncio.wait() call left them unretrieved on the task.
        asyncio.run(self.parse_html())
if __name__ == '__main__':
    # Script entry point: build a spider and start the crawl.
    spider = LianjiaSpider()
    spider.run()
lianjia.csv
,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,['集中供暖'],高楼层(共33层),['127.86平米\xa0\xa0'],0,2300,2018.11.08 更新,['凤凰城梧桐苑\xa0\xa0'],板楼,['南'],南焦租房,['3室2厅\xa0\xa0']
1,['集中供暖'],中楼层(共6层),['55平米\xa0\xa0'],0,1200,2018.11.04 更新,['华兴小区\xa0\xa0'],板楼,['南'],世纪公园租房,['1室1厅\xa0\xa0']
2,['集中供暖'],中楼层(共6层),['138平米\xa0\xa0'],0,2400,2018.11.04 更新,['河冶小区\xa0\xa0'],板楼,['南 北'],跃进租房,['3室2厅\xa0\xa0']
3,['集中供暖'],低楼层(共6层),['90平米\xa0\xa0'],1,1500,2018.11.06 更新,['瑞国花园\xa0\xa0'],板楼,['南'],跃进租房,['2室2厅\xa0\xa0']
4,['集中供暖'],低楼层(共14层),['180平米\xa0\xa0'],0,3500,2018.11.13 更新,['华脉新村\xa0\xa0'],板楼,['南 北'],四十中学租房,['4室2厅\xa0\xa0']
5,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['57平米\xa0\xa0'],0,3000,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['西'],南长租房,['1室1厅\xa0\xa0']
6,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['42.56平米\xa0\xa0'],0,2200,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
7,['集中供暖'],中楼层(共34层),['148平米\xa0\xa0'],0,2500,2018.11.08 更新,['北城国际B区\xa0\xa0'],板楼,['南 北'],沿东租房,['3室2厅\xa0\xa0']
8,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['40.09平米\xa0\xa0'],0,2100,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
9,"['近地铁', '集中供暖']",低楼层(共33层),['185平米\xa0\xa0'],0,22000,2018.11.10 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']
10,"['近地铁', '集中供暖']",低楼层(共33层),['242平米\xa0\xa0'],0,29000,2018.11.05 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']
去除无用字符
import re
# Residue of Python list reprs written into CSV cells: the literal text
# "\xa0", the list brackets "['" / "']", and the unit suffix "平米".
# Compiled once, outside the per-line loop.
_NOISE = re.compile(r"\\xa0|\['|'\]|平米")


def _strip_noise(line):
    """Return *line* with every list-repr artifact and unit suffix removed."""
    return _NOISE.sub("", line)


filename = 'lianjia.csv'
# The spider writes the file as utf_8_sig; reading it back with utf-8-sig
# drops the BOM instead of copying it into the cleaned file. Both handles
# live in one `with`, so the output is flushed and closed when the loop ends.
# NOTE(review): cells holding several tags still keep their inner "', '"
# separators, as before.
with open(filename, 'r', encoding='utf-8-sig') as file, \
        open("lian_jia.csv", 'w', encoding='utf-8') as f:
    for i in file:
        f.write(_strip_noise(i))
lian_jia.csv
,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,集中供暖,高楼层(共33层),127.86,0,2300,2018.11.08 更新,凤凰城梧桐苑,板楼,南,南焦租房,3室2厅
1,集中供暖,中楼层(共6层),55,0,1200,2018.11.04 更新,华兴小区,板楼,南,世纪公园租房,1室1厅
2,集中供暖,中楼层(共6层),138,0,2400,2018.11.04 更新,河冶小区,板楼,南 北,跃进租房,3室2厅
3,集中供暖,低楼层(共6层),90,1,1500,2018.11.06 更新,瑞国花园,板楼,南,跃进租房,2室2厅
4,集中供暖,低楼层(共14层),180,0,3500,2018.11.13 更新,华脉新村,板楼,南 北,四十中学租房,4室2厅
5,"近地铁', '随时看房', '集中供暖",中楼层(共40层),57,0,3000,2018.11.09 更新,华润大厦,塔楼,西,南长租房,1室1厅
6,"近地铁', '随时看房', '集中供暖",中楼层(共40层),42.56,0,2200,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
7,集中供暖,中楼层(共34层),148,0,2500,2018.11.08 更新,北城国际B区,板楼,南 北,沿东租房,3室2厅
8,"近地铁', '随时看房', '集中供暖",中楼层(共40层),40.09,0,2100,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
9,"近地铁', '集中供暖",低楼层(共33层),185,0,22000,2018.11.10 更新,青鸟中山华府,板楼,北,大经租房,1室1厅
10,"近地铁', '集中供暖",低楼层(共33层),242,0,29000,2018.11.05 更新,青鸟中山华府,板楼,北,大经租房,1室1厅
可视化户型数量分布
import pandas as pd
import matplotlib.pyplot as plt
# Load the cleaned listings. header=0 marks the file's first row as a header
# that `names` replaces; without it that row is read as data and the literal
# "zone" shows up as a layout in the counts.
house = pd.read_csv(
    'lian_jia.csv',
    header=0,
    names=['', 'has', 'louceng', 'meters', 'num', 'price', 'price_pre',
           'region', 'type', 'where', 'xiaoshou', 'zone'],
)
# Number of listings per layout; value_counts sorts most frequent first.
zone = house['zone'].value_counts()
plt.rcParams['font.sans-serif'] = ['FangSong']  # default font, so the CJK labels render
fig, ax = plt.subplots(1, 1, dpi=200)  # set up the canvas
# Plot the 10 most frequent layouts. The former x=/y= arguments are dropped:
# they appear to be unused when plotting a Series — confirm against the
# installed pandas.
zone.head(10).plot(kind='bar', title='户型数量分布', ax=ax)
plt.legend(['数量'])
plt.show()
柱状图