Crawling blog post titles from detail pages (Python 3.5+, async/await, aiohttp)

My company still uses Python 2.x, but I have been learning Python 3.x in my spare time and find it quite interesting, so I wrote a small crawler to see how it performs.

aiohttp is a third-party asynchronous HTTP library and it works quite well; the main motivation for using it is that requests is blocking.
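
For reference, here is a minimal sketch of the difference (the example URL is only a placeholder): requests.get() blocks the calling thread until the whole response arrives, while the aiohttp version suspends at each await so the event loop can service other requests in the meantime:

import asyncio
import aiohttp
import requests

# blocking: the thread sits here until the response is fully received
def fetch_blocking(url):
    return requests.get(url).text

# non-blocking: the coroutine is suspended at each await,
# letting the event loop run other coroutines while waiting
async def fetch_async(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()

# example usage ("http://example.com" is only a placeholder)
# print(fetch_blocking("http://example.com")[:15])
# print(asyncio.get_event_loop().run_until_complete(
#     fetch_async("http://example.com"))[:15])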

The code is simple; as before, I use my own blog as the example:

# -*-coding:utf-8-*-
"""
ayou
"""
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import time

# Shared semaphore to limit how many coroutines run at the same time.
# It has to be created once and shared by all calls; creating it inside
# getPage would give every request its own semaphore and limit nothing.
sem = asyncio.Semaphore(5)

# "async def" defines a coroutine
async def getPage(url,res_list,callback=None):
    print(url)
    headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url,headers=headers) as resp:
                # assert that the page responded with status 200
                assert resp.status==200
                # dispatch on which callback was passed in
                if callback==grabPage:
                    body = await resp.text()
                    callback(res_list,body)
                elif callback==grabPage1:
                    body = await resp.text()
                    callback(body)
                else:
                    return await resp.text()
                # no explicit session.close() needed: "async with" closes the session

# parse a list page and collect the blog detail-page URLs
def grabPage(res_list,body):
    page = bs(body,"lxml")
    articles = page.find_all('div', attrs={'class': 'article_title'})
    for a in articles:
        x = a.find('a')['href']
        # print('http://blog.csdn.net' + x)
        res_list.add('http://blog.csdn.net' + x)

# extract the title of a blog detail page
def grabPage1(body):
    page = bs(body,"lxml")
    articles = page.find("title")
    print(articles.text)

start = time.time()

# total number of blog list pages
page_num = 4
# base URL of the list pages
page_url_base = 'http://blog.csdn.net/u013055678/article/list/'
# build the list-page URLs
page_urls = [page_url_base + str(i+1) for i in range(page_num)]
# asyncio.get_event_loop(): get the event loop
loop = asyncio.get_event_loop()
# set used to store the URLs of all blog detail pages
ret_list = set()
# coroutine tasks: fetch every list page and collect the detail-page URLs into the set
tasks = [getPage(host,ret_list, callback=grabPage) for host in page_urls]
# run the coroutines on the event loop
loop.run_until_complete(asyncio.gather(*tasks))

# coroutine tasks: fetch every detail page and print its title
tasks = [getPage(url, ret_list, callback=grabPage1) for url in ret_list]
# run the coroutines on the event loop
loop.run_until_complete(asyncio.gather(*tasks))

# close the event loop
loop.close()

print("Elapsed Time: %s" % (time.time() - start))

Elapsed time:



Compared with the multithreaded crawler from my earlier post, the speed is about the same, and a little slower than the multiprocess version!
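
For context, a comparable thread-based version of the fetching step (a rough sketch, not the code from that earlier post; it reuses page_urls from the script above and a hypothetical fetch_blocking helper) would look roughly like this:

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_blocking(url):
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    # each worker thread blocks on its own request
    return requests.get(url, headers=headers).text

# run up to 5 downloads at a time on worker threads
with ThreadPoolExecutor(max_workers=5) as pool:
    bodies = list(pool.map(fetch_blocking, page_urls))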

async replaces Python 3.4's @asyncio.coroutine decorator, and await replaces yield from.

Likewise, async replaces tornado's @gen.coroutine decorator, and await replaces yield.
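
Side by side, the same trivial coroutine in both asyncio styles (a minimal sketch; note that @asyncio.coroutine was deprecated in Python 3.8 and removed in 3.11, so the old form only runs on older interpreters):

import asyncio

# Python 3.4 style: generator-based coroutine
@asyncio.coroutine
def wait_old(seconds):
    yield from asyncio.sleep(seconds)
    return seconds

# Python 3.5+ style: native coroutine
async def wait_new(seconds):
    await asyncio.sleep(seconds)
    return seconds

loop = asyncio.get_event_loop()
print(loop.run_until_complete(wait_new(0.1)))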


Now let's use redis. First, a small wrapper class (saved as redisTools.py, since the crawler below imports it with "from redisTools import redisTools"):

# -*-coding:utf-8-*-
"""
ayou
"""
import redis

class redisTools(object):
    def __init__(self, **key):
        self.pool = redis.ConnectionPool(**key)
        self.r = redis.StrictRedis(connection_pool=self.pool)

    # store a value of any format under a key
    def setData(self,keyname,data):
        data = self.r.set(keyname,data)
        return data

    # get a value
    def getData(self,keyname,coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        return data

    # get a value, then delete the key
    def getDataDel(self, keyname, coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        # delete the key
        self.r.delete(keyname)
        return data

    # store values only (a list); one key holds several values
    def setValue(self,keyname,data):
        data = self.r.lpush(keyname, data)
        return data

    # pop a value from the list (blocking) and remove it
    def getValue(self,keyname,coding="utf-8"):
        data = self.r.brpop(keyname, 0)[1]
        data = data.decode(coding)
        return data

    # store field/value pairs in a hash; one key holds several fields
    def setKeyValue(self,keyname,datakey, data):
        state = self.r.hset(keyname, datakey, data)
        if state==0:
            return True
        else:
            return False

    # get a field's value
    def getKeyValue(self,keyname, datakey,coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        return data

    # get a field's value, then delete the field
    def getKeyValueDel(self,keyname, datakey,coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        # delete the field
        self.r.hdel(keyname, datakey)
        return data

    # delete a field (and its value) by field name
    def delAttribute(self,keyname,datakey):
        hdel = self.r.hdel(keyname,datakey)
        if hdel==1:
            return True
        else:
            return False

    # get all field names under a key
    def getKeyAllAttribute(self,keyname):
        hkeys = self.r.hkeys(keyname)
        return hkeys

    # get the names of all keys
    def getKey(self):
        keys = self.r.keys()
        return keys

    # get how many items are left under a list key
    def getLen(self,keyname):
        llen = self.r.llen(keyname)
        return llen

    # check whether a key exists
    def getExists(self,keyname):
        exists = self.r.exists(keyname)
        return exists

    # get the number of keys in the current database
    def getDbsize(self):
        dbsize = self.r.dbsize()
        return dbsize

    # delete a key
    def deleteKy(self,keyname):
        delete = self.r.delete(keyname)
        if delete==1:
            return True
        else:
            return False

    # delete all data in the current database
    def flushDB(self):
        flushdb = self.r.flushdb()
        return flushdb

    #====== sets ==========
    # add members; the set deduplicates automatically; returns how many were added
    def setSets(self,keyname,*data):
        return self.r.sadd(keyname,*data)

    # read the set: returns a string if there is one member, otherwise a list
    def getSets(self,keyname, coding="utf-8"):
        data = self.r.smembers(keyname)
        if len(data) == 1:
            return list(data)[0].decode(coding)
        else:
            data = [d.decode(coding) for d in data]
            return data

    # read the set (string for one member, list for several), then remove what was read
    def getSetsDel(self,keyname, coding="utf-8"):
        data = self.r.smembers(keyname)
        if len(data) == 1:
            data = list(data)[0].decode(coding)
            self.r.srem(keyname, data)
            return data
        else:
            data = [d.decode(coding) for d in data]
            [self.r.srem(keyname, d) for d in data]
            return data

    # remove members from the set, returns how many were removed
    def setsDel(self,keyname,*data):
        return self.r.srem(keyname, *data)

    # check whether a member is in the set
    def isExist(self, keyname, data):
        return self.r.sismember(keyname, data)

    # size of the set
    def setsLen(self , keyname):
        return self.r.scard(keyname)

    # intersection of several sets, returned as a list
    def setsIntersection(self, *keyname):
        data = self.r.sinter(keyname)
        data = [d.decode("utf-8") for d in data]
        return data

    # union of several sets, returned as a list
    def setsAndSet(self,*keyname):
        data = self.r.sunion(keyname)
        data = [d.decode("utf-8") for d in data]
        return data
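
A quick sanity check of the wrapper (a sketch; it assumes a redis server is running on the default localhost:6379, and should be run in the same module or after "from redisTools import redisTools"):

rt = redisTools(host="localhost", port=6379, db=0)

# plain key/value
rt.setData("greeting", "hello")
print(rt.getDataDel("greeting"))           # hello

# set operations, which the crawler below relies on
print(rt.setSets("demo", "a", "b", "b"))   # 2, duplicates are dropped
print(rt.getSetsDel("demo"))               # ['a', 'b'] (order not guaranteed)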

The crawler using redis:

# -*-coding:utf-8-*-
"""
ayou
"""
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import time
from redisTools import redisTools

# Shared semaphore to limit how many coroutines run at the same time
# (creating it inside getPage would give every request its own semaphore).
sem = asyncio.Semaphore(5)

# "async def" defines a coroutine
async def getPage(url,res_list,body_list,callback=None):
    print(url)
    headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url,headers=headers) as resp:
                # assert that the page responded with status 200
                assert resp.status==200
                # dispatch on which callback was passed in
                if callback==grabPage:
                    body = await resp.text()
                    body_list.setSets("bodylist",body)
                    callback(res_list)
                elif callback==grabPage1:
                    body = await resp.text()
                    body_list.setSets("bodyxx", body)
                    callback(body_list)
                else:
                    return await resp.text()
                # no explicit session.close() needed: "async with" closes the session

# parse a list page and store the blog detail-page URLs in redis
def grabPage(res_list):
    body = res_list.getSetsDel("bodylist")
    # print(type(body))
    page = bs(body,"lxml")
    articles = page.find_all('div', attrs={'class': 'article_title'})
    for a in articles:
        x = a.find('a')['href']
        # print('http://blog.csdn.net' + x)
        res_list.setSets("xxurl",'http://blog.csdn.net' + x)

# extract the title of a blog detail page and store it in redis
def grabPage1(res_list):
    body = res_list.getSetsDel("bodyxx")
    # print(body)
    page = bs(body,"lxml")
    articles = page.find("title")
    print(articles.text)
    res_list.setSets("title",articles.text)

start = time.time()

# total number of blog list pages
page_num = 4
# base URL of the list pages
page_url_base = 'http://blog.csdn.net/u013055678/article/list/'
# build the list-page URLs
page_urls = [page_url_base + str(i+1) for i in range(page_num)]
# asyncio.get_event_loop(): get the event loop
loop = asyncio.get_event_loop()
# redis connection, used to store page bodies and all blog detail-page URLs
rt = redisTools()

# coroutine tasks: fetch every list page and store the detail-page URLs in redis
tasks = [getPage(host,rt, rt,callback=grabPage) for host in page_urls]
# run the coroutines on the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# pull the collected detail-page URLs back out of redis
ret_list = rt.getSetsDel("xxurl")
# coroutine tasks: fetch every detail page, print its title and store it in redis
tasks = [getPage(url, ret_list,rt, callback=grabPage1) for url in ret_list]
# run the coroutines on the event loop
loop.run_until_complete(asyncio.gather(*tasks))

# close the event loop
loop.close()

print("Elapsed Time: %s" % (time.time() - start))




