链家网开源java_异步协程爬取链家租房信息

使用 asyncio + aiohttp 异步协程抓取链家租房数据,用 BeautifulSoup 解析页面,并通过 pandas 写入 CSV 文件。

import asyncio

import aiohttp

import pandas

from bs4 import BeautifulSoup

from fake_useragent import UserAgent

class LJSpider(object):
    """Scrape Lianjia (Beijing) rental listings and export them to a CSV file.

    Listing pages are downloaded with aiohttp, parsed with BeautifulSoup and
    accumulated in ``self._data_list`` as one dict per listing; ``parse``
    finally writes the accumulated rows with pandas.
    """

    # The description paragraph is split on "/" and its first five pieces are
    # stored as: part, area, direction, house_type, storey.
    _DES_FIELD_COUNT = 5

    def __init__(self):
        """Pick a random User-Agent header and start with no scraped rows."""
        self._ua = UserAgent()
        self._headers = {"User-Agent": self._ua.random}
        self._data_list = []

    async def get(self, url, session=None):
        """Fetch *url* and return the response body as text.

        :param url: address of the page to download.
        :param session: optional ``aiohttp.ClientSession`` to reuse; when
            omitted a temporary session is opened for this single request.
        :return: the page text on HTTP 200, otherwise ``None`` (non-200
            status, network error, timeout or undecodable body).
        """
        if session is None:
            async with aiohttp.ClientSession() as own_session:
                return await self.get(url, own_session)

        # aiohttp expects a ClientTimeout object; a bare number is deprecated.
        timeout = aiohttp.ClientTimeout(total=3)
        try:
            async with session.get(
                url, headers=self._headers, timeout=timeout
            ) as response:
                if response.status == 200:
                    return await response.text()
                print('{} -> HTTP {}'.format(url, response.status))
        except (aiohttp.ClientError, asyncio.TimeoutError,
                UnicodeDecodeError) as e:
            # Best effort: report the failure and let the caller skip the page.
            print('{} -> {!r}'.format(url, e))
        return None

    async def parse(self):
        """Download the listing pages, parse them and write the CSV file."""
        # NOTE(review): the original loop also requested "pg0", which
        # presumably serves the same content as page 1 and would duplicate
        # rows -- confirm against the site before widening the range again.
        urls = [
            'https://bj.lianjia.com/zufang/pg{}'.format(page)
            for page in range(1, 6)
        ]

        # One shared session for all pages; the requests run concurrently and
        # gather() returns the bodies in the same order as ``urls``.
        async with aiohttp.ClientSession() as session:
            pages = await asyncio.gather(
                *(self.get(url, session) for url in urls)
            )

        # Parse each downloaded page (failed downloads are skipped inside).
        for resp in pages:
            self.parse_details(resp)

        # Persist everything collected so far.
        data = pandas.DataFrame(self._data_list)
        data.to_csv("链家网租房数据.csv", encoding='utf_8_sig')

    def parse_details(self, resp):
        """Extract every listing found in *resp* into ``self._data_list``.

        :param resp: HTML text of one listing page; ``None`` or an empty
            string (a failed download) is ignored.
        :return: None
        """
        if not resp:
            return

        bs = BeautifulSoup(resp, 'html.parser')
        for item in bs.find_all('div', attrs={'class': 'content__list--item'}):
            # Title
            title = self._node_text(item.find('a', attrs={'class': 'twoline'}))
            # Rent
            price = self._node_text(
                item.find('span', attrs={'class': 'content__list--item-price'})
            )
            # Description, "/"-separated
            des_text = self._node_text(
                item.find('p', attrs={'class': 'content__list--item--des'})
            )
            part, area, direction, house_type, storey = (
                self._split_description(des_text)
            )
            # Listing-type tag
            owner_reco = self._node_text(
                item.find('i', attrs={'class': 'content__item__tag--owner_reco'})
            )
            print('房源类型:', owner_reco)
            # Heating-type tag
            central_h = self._node_text(
                item.find(
                    'i', attrs={'class': 'content__item__tag--central_heating'}
                )
            )
            # Tag line
            tag_has = self._node_text(
                item.find(
                    'p', attrs={'class': 'content__list--item--bottom oneline'}
                )
            )
            # Information source
            brand = self._node_text(item.find('span', attrs={'class': 'brand'}))
            # Maintenance time
            time_onl = self._node_text(
                item.find(
                    'span', attrs={'class': 'content__list--item--time oneline'}
                )
            )

            one_data = {
                "title": self.delete_space(title),
                "price": self.delete_space(price),
                'tag_has': self.delete_space(tag_has),
                'part': part,
                'area': area,
                'direction': direction,
                "house_type": house_type,
                'storey': storey,
                "owner_reco": self.delete_space(owner_reco),
                "central_h": self.delete_space(central_h),
                "brand": self.delete_space(brand),
                "time_onl": self.delete_space(time_onl),
            }
            print('房源信息。。。。。:', one_data)
            self._data_list.append(one_data)

    @staticmethod
    def _node_text(node):
        """Return ``node.text``, or '' when the element was not found."""
        return node.text if node is not None else ''

    def _split_description(self, text):
        """Split a "/"-separated description into exactly five cleaned fields.

        Missing trailing fields become '' and surplus fields are dropped, so
        an irregular listing cannot raise ``IndexError``.
        """
        fields = [self.delete_space(piece) for piece in text.split('/')]
        fields += [''] * (self._DES_FIELD_COUNT - len(fields))
        return fields[:self._DES_FIELD_COUNT]

    def delete_space(self, str=''):
        """Clean a scraped value: drop all newlines and spaces, then strip.

        The parameter keeps its original name ``str`` (although it shadows
        the builtin) so existing keyword callers keep working.

        :param str: raw text; ``None`` or '' yields ''.
        :return: the cleaned text.
        """
        if not str:
            return ''
        return str.replace('\n', '').replace(' ', '').strip()

    def run(self):
        """Run the whole crawl to completion on a fresh event loop.

        ``asyncio.run`` re-raises any exception from ``parse`` instead of
        leaving it unretrieved on a task.
        """
        asyncio.run(self.parse())

if __name__ == '__main__':
    # Script entry point: build the spider and run one full crawl.
    LJSpider().run()

你可能感兴趣的:(链家网开源java)