Leading off with a screenshot of the results. No picture, no truth.... heh heh.
I referenced an article on scraping Lianjia's Shijiazhuang listings, but the site's rules have since changed and that code no longer works, so I rewrote it:
https://blog.csdn.net/hihell/article/details/84029492
import re
from fake_useragent import UserAgent
from lxml import etree
import asyncio
import aiohttp
import pandas as pd
# Define a class: the variables it uses, plus a get() method that makes requests through a connection pool.
class LianjiaSpider(object):
    def __init__(self):
        self.ua = UserAgent()                       # UserAgent generator
        self.head = {"User-Agent": self.ua.random}  # random User-Agent for the request headers
        self._data = list()                         # collected listing rows
    async def get_page_count(self):
        # Fetch page 1 and read the total page count from the pager element.
        result = await self.get("https://bj.lianjia.com/zufang/pg1")
        page_html = etree.HTML(result)  # parse the page
        pageCount = page_html.xpath(".//div[@class='content__pg']/@data-totalpage")
        pageCount = list(map(int, pageCount))  # list of str -> list of int
        return pageCount[0]
    async def get(self, url):  # coroutine: suspends at each await and resumes when the I/O finishes
        async with aiohttp.ClientSession() as session:  # session with a connection pool
            try:
                async with session.get(url, headers=self.head, timeout=3) as resp:
                    if resp.status == 200:
                        result = await resp.text()
                        return result
            except Exception as e:
                print(e.args)
    async def parse_html(self):
        count = await self.get_page_count()
        for page in range(1, count + 1):  # range() stops before the end value, so add 1 to reach the last page
            url = "https://bj.lianjia.com/zufang/pg{}/".format(page)
            print("Crawling {}".format(url))
            html = await self.get(url)   # download the page
            html = etree.HTML(html)      # parse the HTML
            await self.parse_page(html)  # extract the fields we want
        print("Saving the data....")
        print(len(self._data))
        # ---------- write the data ----------
        data = pd.DataFrame(self._data)
        data.to_csv("链家网租房数据.csv", encoding='utf_8_sig')  # utf_8_sig so Excel displays the Chinese headers correctly
        # ---------- end of data write ----------
    def run(self):
        loop = asyncio.get_event_loop()                     # get the event loop
        tasks = [asyncio.ensure_future(self.parse_html())]  # wrap the coroutine in a task
        loop.run_until_complete(asyncio.wait(tasks))        # block until the crawl finishes
    async def parse_page(self, html):
        # ".//div[...]" searches at any depth for the div nodes with the listing class.
        rst = html.xpath(".//div[@class='content__list--item']")
        print(rst)  # a list of <Element div> nodes, one per listing
        for div in rst:
            imgurl = div.xpath(".//a[@class='content__list--item--aside']/img/@src")
            title = div.xpath(".//a[@class='content__list--item--aside']/img/@alt")
            floor = div.xpath(".//span[@class='hide']/text()")
            price = div.xpath(".//span[@class='content__list--item-price']/em/text()")
            des = div.xpath(".//p[@class='content__list--item--des']/text()")
            if len(floor) > 1:  # some listings omit the floor; guard before indexing floor[1]
                currentFloor = floor[1].replace("\n", "").replace(" ", "")
            else:
                currentFloor = ''
            strinfo = []  # holds the size (㎡), the orientation, and the layout (e.g. 5室1厅2卫)
            for piece in des:
                info = piece.replace(" ", "").replace("\n", "").replace("-", "")
                if info != '':
                    strinfo.append(info)
                    print(info)
            if len(strinfo) < 3:  # skip listings whose description is missing fields
                continue
            size = strinfo[0]       # e.g. 30㎡
            direction = strinfo[1]  # e.g. 南
            structure = re.findall(r'\d+', strinfo[2])  # e.g. 5室1厅2卫 -> ['5', '1', '2']
            print("imgurl:" + imgurl[0])           # image URL
            print("title:" + title[0])             # title
            print("price:" + price[0])             # price
            print("currentFloor:" + currentFloor)  # floor
            print(structure)                       # room / living room / bathroom counts
            if len(structure) == 3:  # room, living room and bathroom counts are all present
                one_data = {
                    "图片地址": imgurl[0],
                    "标题": title[0],
                    "价格": price[0],
                    "楼层": currentFloor,
                    "大小": size,
                    "朝向": direction,
                    "室": structure[0],
                    "厅": structure[1],
                    "卫": structure[2]
                }
            elif len(structure) == 2:  # the living-room count is missing, record it as 0
                one_data = {
                    "图片地址": imgurl[0],
                    "标题": title[0],
                    "价格": price[0],
                    "楼层": currentFloor,
                    "大小": size,
                    "朝向": direction,
                    "室": structure[0],
                    "厅": 0,
                    "卫": structure[1]
                }
            else:  # unexpected layout string, skip it so we never append stale data
                continue
            self._data.append(one_data)  # collect the row
if __name__ == '__main__':
    l = LianjiaSpider()
    l.run()
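A note on the entry point: run() drives the crawl with asyncio.get_event_loop() and run_until_complete(), which works, but calling get_event_loop() outside a running loop is flagged as deprecated on newer Python versions (3.10+). A minimal alternative sketch, assuming the same LianjiaSpider class as above:

import asyncio

if __name__ == '__main__':
    spider = LianjiaSpider()
    # asyncio.run() (Python 3.7+) creates the event loop, runs the coroutine
    # to completion, and closes the loop for us.
    asyncio.run(spider.parse_html())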
Things this script touches on (a short standalone sketch of several of them follows below):
for loops over the search results
string replacement with str.replace()
the len() length function
parsing elements by class with etree / XPath
the \d regular expression for pulling digits out of a string
fake_useragent for faking the request headers
how coroutines are used
list(map(int, strList)) to turn a list of str into int, and list.clear() to empty a list
if / elif usage
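To make a few of those easier to remember, here is a small standalone sketch; the sample strings in it are made up purely for illustration:

import re
from fake_useragent import UserAgent

# list(map(int, ...)) turns a list of digit strings into ints
pages = list(map(int, ["1", "2", "3"]))        # [1, 2, 3]

# re.findall with \d+ pulls the numbers out of a layout string such as "5室1厅2卫"
rooms = re.findall(r'\d+', "5室1厅2卫")         # ['5', '1', '2']

# chained str.replace() strips whitespace; len() guards against empty results
raw = "\n 南 \n"
clean = raw.replace("\n", "").replace(" ", "")
print(clean if len(clean) > 0 else "unknown")  # 南

# list.clear() empties a list in place
buf = ["tmp"]
buf.clear()                                    # buf is now []

# fake_useragent gives a random User-Agent string for the request headers
headers = {"User-Agent": UserAgent().random}
print(pages, rooms, headers["User-Agent"])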