My company is still on Python 2.x, so I have been learning Python 3.x in my spare time and found it quite interesting, so I wrote a crawler to see how fast it is, haha...
aiohttp is a third-party asynchronous HTTP library and it works quite well; the main motivation is that requests is blocking, while aiohttp is not.
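To see the difference in isolation, a minimal aiohttp fetch looks like this (a sketch only; the full crawler below wraps the same calls with a semaphore and callbacks):

import asyncio
import aiohttp

async def fetch(url):
    # the coroutine suspends at each await instead of blocking the whole thread
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()

html = asyncio.get_event_loop().run_until_complete(fetch("http://blog.csdn.net"))
print(len(html))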
The code is simple; as before, my own blog is the example:
# -*-coding:utf-8-*-
"""
ayou
"""
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import time
# a shared semaphore so at most 5 requests run at the same time;
# it has to live at module level, otherwise each call would get its own
# semaphore and nothing would actually be limited
sem = asyncio.Semaphore(5)

# async def defines a coroutine
async def getPage(url, res_list, callback=None):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # asyncio.Semaphore() limits how many coroutines run at the same time
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as resp:
                # make sure the page was fetched successfully
                assert resp.status == 200
                # dispatch depending on which callback was passed in
                if callback == grabPage:
                    body = await resp.text()
                    callback(res_list, body)
                elif callback == grabPage1:
                    body = await resp.text()
                    callback(body)
                else:
                    return await resp.text()
                # no explicit session.close() needed: the async with block closes the session
# parse a list page and collect the blog post URLs
def grabPage(res_list, body):
    page = bs(body, "lxml")
    articles = page.find_all('div', attrs={'class': 'article_title'})
    for a in articles:
        x = a.find('a')['href']
        # print('http://blog.csdn.net' + x)
        res_list.add('http://blog.csdn.net' + x)

# print the title of a blog post page
def grabPage1(body):
    page = bs(body, "lxml")
    articles = page.find("title")
    print(articles.text)
start = time.time()
# total number of list pages on the blog
page_num = 4
# base URL of the list pages
page_url_base = 'http://blog.csdn.net/u013055678/article/list/'
# URLs of all list pages
page_urls = [page_url_base + str(i+1) for i in range(page_num)]
# asyncio.get_event_loop() gets the event loop
loop = asyncio.get_event_loop()
# set used to store the URLs of all blog post pages
ret_list = set()
# coroutine tasks: fetch every list page and collect the post URLs into the set
tasks = [getPage(host, ret_list, callback=grabPage) for host in page_urls]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# coroutine tasks: fetch every post page and print its title
tasks = [getPage(url, ret_list, callback=grabPage1) for url in ret_list]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# close the event loop
loop.close()
print("Elapsed Time: %s" % (time.time() - start))
Elapsed time:
It is about as fast as the multithreaded crawler from my earlier post, and a bit slower than the multiprocess one!
In Python 3.5+, async replaces the @asyncio.coroutine decorator from Python 3.4, and await replaces yield from.
Similarly, async replaces @gen.coroutine in Tornado, and await replaces yield.
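For reference, a minimal side-by-side of the two asyncio styles (the decorator form was removed in Python 3.11, so it only runs on older interpreters; the Tornado case is the same idea with @gen.coroutine and yield):

import asyncio

# Python 3.4 style: generator-based coroutine
@asyncio.coroutine
def wait_old():
    yield from asyncio.sleep(1)

# Python 3.5+ style: native coroutine doing the same thing
async def wait_new():
    await asyncio.sleep(1)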
Now let's use Redis. First, a small wrapper class around the redis client:
# -*-coding:utf-8-*-
"""
ayou
"""
import redis

class redisTools(object):
    def __init__(self, **key):
        self.pool = redis.ConnectionPool(**key)
        self.r = redis.StrictRedis(connection_pool=self.pool)

    # store a value under a key (any type redis-py can serialize)
    def setData(self, keyname, data):
        data = self.r.set(keyname, data)
        return data

    # read a value
    def getData(self, keyname, coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        return data

    # read a value and delete the key
    def getDataDel(self, keyname, coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        # delete the key
        self.r.delete(keyname)
        return data

    # store values only: one key holds a list of values
    def setValue(self, keyname, data):
        data = self.r.lpush(keyname, data)
        return data

    # pop a value from the list (blocking) and remove it
    def getValue(self, keyname, coding="utf-8"):
        data = self.r.brpop(keyname, 0)[1]
        data = data.decode(coding)
        return data

    # store field/value pairs in a hash: one key holds several fields
    def setKeyValue(self, keyname, datakey, data):
        # hset returns 0 when an existing field is updated, 1 when a new field is created
        state = self.r.hset(keyname, datakey, data)
        if state == 0:
            return True
        else:
            return False

    # read a field's value
    def getKeyValue(self, keyname, datakey, coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        return data

    # read a field's value and delete the field
    def getKeyValueDel(self, keyname, datakey, coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        # delete the field
        self.r.hdel(keyname, datakey)
        return data

    # delete a field by name
    def delAttribute(self, keyname, datakey):
        hdel = self.r.hdel(keyname, datakey)
        if hdel == 1:
            return True
        else:
            return False

    # get all field names under a key
    def getKeyAllAttribute(self, keyname):
        hkeys = self.r.hkeys(keyname)
        return hkeys

    # get the names of all keys
    def getKey(self):
        keys = self.r.keys()
        return keys

    # get how many items are left under a list key
    def getLen(self, keyname):
        llen = self.r.llen(keyname)
        return llen

    # check whether a key exists
    def getExists(self, keyname):
        exists = self.r.exists(keyname)
        return exists

    # get the number of keys
    def getDbsize(self):
        dbsize = self.r.dbsize()
        return dbsize

    # delete a key
    def deleteKy(self, keyname):
        delete = self.r.delete(keyname)
        if delete == 1:
            return True
        else:
            return False

    # flush all data in the current database
    def flushDB(self):
        flushdb = self.r.flushdb()
        return flushdb

    # ========== sets ==========
    # add members; a set deduplicates automatically, returns how many were added
    def setSets(self, keyname, *data):
        return self.r.sadd(keyname, *data)

    # read a set: return a string if there is one member, otherwise a list
    def getSets(self, keyname, coding="utf-8"):
        data = self.r.smembers(keyname)
        if len(data) == 1:
            return list(data)[0].decode(coding)
        else:
            data = [d.decode(coding) for d in data]
            return data

    # read a set (string for one member, list for several), then delete the members
    def getSetsDel(self, keyname, coding="utf-8"):
        data = self.r.smembers(keyname)
        if len(data) == 1:
            data = list(data)[0].decode(coding)
            self.r.srem(keyname, data)
            return data
        else:
            data = [d.decode(coding) for d in data]
            [self.r.srem(keyname, d) for d in data]
            return data

    # remove members from a set, returns how many were removed
    def setsDel(self, keyname, *data):
        return self.r.srem(keyname, *data)

    # check whether a member is in a set
    def isExist(self, keyname, data):
        return self.r.sismember(keyname, data)

    # number of members in a set
    def setsLen(self, keyname):
        return self.r.scard(keyname)

    # intersection of several sets, returned as a list
    def setsIntersection(self, *keyname):
        data = self.r.sinter(keyname)
        data = [d.decode("utf-8") for d in data]
        return data

    # union of several sets, returned as a list
    def setsAndSet(self, *keyname):
        data = self.r.sunion(keyname)
        data = [d.decode("utf-8") for d in data]
        return data
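A quick usage sketch of the wrapper (assuming a local Redis on the default port; the key names here are only for illustration):

rt = redisTools()                  # connects to localhost:6379 by default
rt.setSets("demo_urls", "http://blog.csdn.net/a", "http://blog.csdn.net/b")
print(rt.setsLen("demo_urls"))     # 2
print(rt.getSetsDel("demo_urls"))  # returns the members, then removes them

Below, the crawler is rewritten so the page bodies, post URLs, and titles are staged in Redis instead of an in-memory set: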
# -*-coding:utf-8-*-
"""
ayou
"""
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import time
from redisTools import redisTools
# a shared semaphore so at most 5 requests run at the same time;
# module level so it actually caps concurrency across all tasks
sem = asyncio.Semaphore(5)

# async def defines a coroutine
async def getPage(url, res_list, body_list, callback=None):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # asyncio.Semaphore() limits how many coroutines run at the same time
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as resp:
                # make sure the page was fetched successfully
                assert resp.status == 200
                # dispatch depending on which callback was passed in
                if callback == grabPage:
                    body = await resp.text()
                    body_list.setSets("bodylist", body)
                    callback(res_list)
                elif callback == grabPage1:
                    body = await resp.text()
                    body_list.setSets("bodyxx", body)
                    callback(body_list)
                else:
                    return await resp.text()
                # no explicit session.close() needed: the async with block closes the session
# parse a list page and store the blog post URLs in Redis
def grabPage(res_list):
    body = res_list.getSetsDel("bodylist")
    # print(type(body))
    page = bs(body, "lxml")
    articles = page.find_all('div', attrs={'class': 'article_title'})
    for a in articles:
        x = a.find('a')['href']
        # print('http://blog.csdn.net' + x)
        res_list.setSets("xxurl", 'http://blog.csdn.net' + x)

# grab the title of a blog post page and store it in Redis
def grabPage1(res_list):
    body = res_list.getSetsDel("bodyxx")
    # print(body)
    page = bs(body, "lxml")
    articles = page.find("title")
    print(articles.text)
    res_list.setSets("title", articles.text)
start = time.time()
# total number of list pages on the blog
page_num = 4
# base URL of the list pages
page_url_base = 'http://blog.csdn.net/u013055678/article/list/'
# URLs of all list pages
page_urls = [page_url_base + str(i+1) for i in range(page_num)]
# asyncio.get_event_loop() gets the event loop
loop = asyncio.get_event_loop()
# Redis client used to store the intermediate data (bodies, URLs, titles)
rt = redisTools()
# coroutine tasks: fetch every list page and store the post URLs in the "xxurl" set
tasks = [getPage(host, rt, rt, callback=grabPage) for host in page_urls]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# pull the collected post URLs back out of Redis (a list when there are several)
ret_list = rt.getSetsDel("xxurl")
# coroutine tasks: fetch every post page and grab its title
tasks = [getPage(url, ret_list, rt, callback=grabPage1) for url in ret_list]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# close the event loop
loop.close()
print("Elapsed Time: %s" % (time.time() - start))