# Asynchronously scrape Lianjia rental listings with coroutines and write them to CSV with pandas.
import asyncio
import aiohttp
import pandas
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class LJSpider(object):
    """Scrape rental listings from bj.lianjia.com and save them as a CSV file.

    Listing pages are downloaded with aiohttp, parsed with BeautifulSoup, and
    the extracted rows are accumulated in ``self._data_list`` before being
    written out through a pandas DataFrame.
    """

    def __init__(self):
        self._ua = UserAgent()
        # A single random User-Agent is chosen here and reused for every request.
        self._headers = {"User-Agent": self._ua.random}
        # One dict per parsed listing; turned into a DataFrame by parse().
        self._data_list = []

    async def get(self, url, session=None):
        """Fetch *url* and return the response body as text.

        :param url: address of the page to download.
        :param session: optional ``aiohttp.ClientSession`` to reuse; when
            omitted a temporary session is opened for this single request.
        :return: the body text on HTTP 200, otherwise ``None`` (non-200
            status, timeout, or any other request error).
        """
        if session is None:
            async with aiohttp.ClientSession() as own_session:
                return await self.get(url, session=own_session)

        # Passing a bare number as ``timeout`` is deprecated in aiohttp.
        timeout = aiohttp.ClientTimeout(total=3)
        try:
            async with session.get(url, headers=self._headers, timeout=timeout) as response:
                if response.status == 200:
                    return await response.text()
                print('{} returned HTTP {}'.format(url, response.status))
        except Exception as e:
            # Best-effort scraping: report the failure and let the caller
            # skip this page instead of aborting the whole run.
            print(e)
        return None

    async def parse(self):
        """Download every listing page, parse it, and write the CSV file.

        Pages are requested concurrently over one shared session and parsed
        in page order. The CSV is written even when no rows were collected.
        """
        # NOTE(review): the range starts at pg0, which may serve the same
        # content as pg1 and yield duplicate rows -- confirm against the site.
        urls = ['https://bj.lianjia.com/zufang/pg{}'.format(i) for i in range(0, 6)]
        async with aiohttp.ClientSession() as session:
            pages = await asyncio.gather(
                *(self.get(url, session=session) for url in urls)
            )

        # Parse the downloaded pages.
        for resp in pages:
            self.parse_details(resp)

        # Persist the collected rows.
        data = pandas.DataFrame(self._data_list)
        data.to_csv("链家网租房数据.csv", encoding='utf_8_sig')

    @staticmethod
    def _text_of(node):
        """Return ``node.text``, or an empty string when the lookup found nothing."""
        return node.text if node is not None else ''

    def parse_details(self, resp):
        """Extract every listing from one page of HTML into ``self._data_list``.

        :param resp: HTML text of a listing page; ``None`` or an empty string
            (a failed download) is ignored.
        :return: ``None``; rows are appended to ``self._data_list``.

        Elements missing from a listing are recorded as empty strings rather
        than raising.
        """
        if not resp:
            # get() returns None for failed requests; there is nothing to parse.
            return

        bs = BeautifulSoup(resp, 'html.parser')
        for item in bs.find_all('div', attrs={'class': 'content__list--item'}):
            # Title
            title = self._text_of(item.find('a', attrs={'class': 'twoline'}))
            # Rent
            price = self._text_of(
                item.find('span', attrs={'class': 'content__list--item-price'})
            )
            # Description: '/'-separated fields, read positionally.
            des_parts = self._text_of(
                item.find('p', attrs={'class': 'content__list--item--des'})
            ).split('/')
            # Pad so that listings with fewer than five fields do not raise.
            des_parts += [''] * (5 - len(des_parts))
            part, area, direction, house_type, storey = (
                self.delete_space(piece) for piece in des_parts[:5]
            )
            # Listing type
            owner_reco = self._text_of(
                item.find('i', attrs={'class': 'content__item__tag--owner_reco'})
            )
            print('房源类型:', owner_reco)
            # Heating type
            central_h = self._text_of(
                item.find('i', attrs={'class': 'content__item__tag--central_heating'})
            )
            # Tags
            tag_has = self._text_of(
                item.find('p', attrs={'class': 'content__list--item--bottom oneline'})
            )
            # Information source
            brand = self._text_of(item.find('span', attrs={'class': 'brand'}))
            # Maintenance time
            time_onl = self._text_of(
                item.find('span', attrs={'class': 'content__list--item--time oneline'})
            )

            # Key order determines the CSV column order.
            one_data = {
                "title": self.delete_space(title),
                "price": self.delete_space(price),
                'tag_has': self.delete_space(tag_has),
                'part': part,
                'area': area,
                'direction': direction,
                "house_type": house_type,
                'storey': storey,
                "owner_reco": self.delete_space(owner_reco),
                "central_h": self.delete_space(central_h),
                "brand": self.delete_space(brand),
                "time_onl": self.delete_space(time_onl),
            }
            print('房源信息。。。。。:', one_data)
            self._data_list.append(one_data)

    def delete_space(self, str=''):
        """Clean a scraped text fragment.

        Removes every newline and space character, then strips any remaining
        leading/trailing whitespace (e.g. tabs).

        :param str: text to clean; ``None`` or empty yields ``''``. The
            parameter name is kept for backward compatibility.
        :return: the cleaned text.
        """
        if not str:
            return ''
        return str.replace('\n', '').replace(' ', '').strip()

    def run(self):
        """Run the whole scrape synchronously until it finishes."""
        # asyncio.run creates and closes the event loop itself; calling
        # get_event_loop() without a running loop is deprecated.
        asyncio.run(self.parse())
# Script entry point: build the spider and run one full scrape.
if __name__ == '__main__':
    LJSpider().run()