# 运行环境:
# 操作系统:Win10 专业版
# IDE:PyCharm Professional 2019.1
# Python 3.8
# requests == 2.23.0
# random(Python 内置模块)
import requests
from lxml import etree
import random
import pandas as pd
import numpy as np
import csv
# Scraper for the Shenzhen rental listings on sz.lianjia.com.
#
# Every result page in ``urllist`` is downloaded, six columns are extracted
# with XPath (district, community, area, room layout, direction, price) and
# the rows are appended to CSV_PATH.

# Start URLs: result pages 1 through 100 (range's upper bound is exclusive).
urllist = ['https://sz.lianjia.com/zufang/pg{}/#contentList'.format(i) for i in range(1, 101)]
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"
}
# Proxy pool: "host:port" entries.
paramslist = ['121.232.199.237:9000', '125.108.67.254:9000', '123.163.118.102:9999',
              '125.108.67.254:9000', '171.35.172.151:9999', '123.101.231.234:9999',
              '125.108.67.254:9000', '123.163.118.102:9999', '171.35.172.151:9999',
              '123.101.231.234:9999', '113.195.16.16:9999', '175.44.109.145:9999',
              '125.108.85.141:9000', '175.43.32.21:9999', '183.166.103.22:9999',
              '125.110.96.80:9000', '123.160.69.100:9999', '112.111.217.69:9999',
              '1.199.30.133:9999', '123.55.102.150:9999', '120.83.104.196:9999',
              '180.118.128.138:9000', '163.204.95.253:9999', '113.195.18.89:9999',
              '113.195.16.191:9999', '175.42.129.76:9999', '125.110.124.214:9000',
              '125.110.102.54:9000', '36.249.119.16:9999', '125.110.89.240:9000',
              '171.35.146.70:9999', '124.93.201.59:42672', '171.35.173.112:9999']

# Output file and its header row (district, community, area, layout,
# direction, price). These strings are written to the CSV as-is.
CSV_PATH = "./data/shenzhenlianjia.csv"
CSV_HEADER = ['城区', '小区', '面积', '房间格局', '方向', '价格']


def build_proxies(address):
    """Return a ``requests`` proxies mapping that routes traffic via ``address``.

    ``address`` is a "host:port" string such as the entries of ``paramslist``.
    requests looks proxies up by lowercase URL scheme, and the listing URLs
    are https, so both schemes are mapped.
    NOTE(review): assumes the pool entries are plain HTTP proxies - confirm.
    """
    proxy_url = 'http://' + address
    return {'http': proxy_url, 'https': proxy_url}


# Proxy used for this run. random.choice replaces
# paramslist[random.randint(0, len(paramslist))]: randint includes its upper
# bound, so that expression could index one past the end of the list.
param = build_proxies(random.choice(paramslist))


def split_description_nodes(nodes, stride=7):
    """Split the flat description node list into area/room/direction columns.

    ``nodes`` is the concatenation, over all listings on a page, of the nodes
    that follow the first ``<i>`` of each description paragraph. The layout
    relied on here is ``stride`` nodes per listing, with the area text at
    offset 0, the direction text at offset 2 and the room layout text at
    offset 4.

    Returns a tuple ``(area, room, direction)`` of equally long lists holding
    the first whitespace-separated token of each text node. A node that is
    not text, or that contains only whitespace, yields ``''`` so the three
    columns stay aligned. A trailing group too short to contain offset 4 is
    ignored instead of raising IndexError.
    """
    def first_token(node):
        # Text results are str instances; element nodes are not.
        if not isinstance(node, str):
            return ''
        tokens = node.split()
        return tokens[0] if tokens else ''

    area, room, direction = [], [], []
    # start + 4 must be a valid index, hence the "- 4" on the upper bound.
    for start in range(0, len(nodes) - 4, stride):
        area.append(first_token(nodes[start]))
        direction.append(first_token(nodes[start + 2]))
        room.append(first_token(nodes[start + 4]))
    return area, room, direction


def parse_listings(page_html):
    """Extract one row per listing from the HTML of a result page.

    Returns a list of ``(district, community, area, room, direction, price)``
    tuples. Raises ValueError when the extracted columns differ in length,
    because pairing them by position would then produce misaligned rows.
    """
    tree = etree.HTML(page_html)
    if tree is None:
        # Defensive: nothing to query if no tree was built.
        return []

    content = '//div[@class="content__list"]'
    description = content + '//p[@class="content__list--item--des"]'

    price = tree.xpath(content + '//span[@class="content__list--item-price"]/em/text()')
    district = tree.xpath(description + '/a[1]/text()')
    community = tree.xpath(description + '/a[3]/text()')
    detail_nodes = tree.xpath(description + '/i[1]/following-sibling::node()')
    area, room, direction = split_description_nodes(detail_nodes)

    columns = (district, community, area, room, direction, price)
    lengths = [len(column) for column in columns]
    if len(set(lengths)) > 1:
        raise ValueError('column lengths differ: {}'.format(lengths))
    return list(zip(*columns))


def fetch_html(url, proxies=None, timeout=10):
    """Download ``url`` and return the response body decoded as text.

    ``proxies`` is a requests proxies mapping or None for a direct
    connection. Raises ``requests.RequestException`` on connection errors,
    timeouts and HTTP error statuses.
    """
    response = requests.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    response.raise_for_status()
    return response.content.decode()


def main():
    """Scrape every URL in ``urllist`` and append the rows to CSV_PATH."""
    proxies = param
    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on Windows. The encoding is left at the
    # platform default because the file is appended to and may already
    # contain rows written with that encoding.
    with open(CSV_PATH, 'a+', newline='') as f:
        writer = csv.writer(f)
        # Append mode starts at the end of the file, so offset 0 means the
        # file is new or empty: write the header exactly once.
        if f.tell() == 0:
            writer.writerow(CSV_HEADER)
        for url in urllist:
            try:
                try:
                    page_html = fetch_html(url, proxies)
                except requests.RequestException as exc:
                    if proxies is None:
                        raise
                    # The proxy failed: stop using it for the rest of the
                    # run and retry this page over a direct connection.
                    print('proxy failed ({}); switching to a direct connection'.format(exc))
                    proxies = None
                    page_html = fetch_html(url)
                rows = parse_listings(page_html)
            except (requests.RequestException, ValueError) as exc:
                print('skipping {}: {}'.format(url, exc))
                continue
            writer.writerows(rows)


if __name__ == "__main__":
    main()
个人记录,新手入门,多多学习,欢迎大家交流探讨!
来深圳一段时间,租房是个问题,闲来无事就做了一个网络爬虫。后面会对爬虫爬取到的数据进行简单的数据分析,
主要包括以下几个维度:
1、各个城区房源数量对比(横柱状图)
2、各种户型之间数量对比(横柱状图)
3、各个城区租金对比(柱状图和折线统计图)
4、面积区间分布图(饼状图)
5、房屋朝向不同的房屋价格对比(柱状图和折线统计图)