Python异步并发爬虫-----gevent库爬取知名问答网站问题列表

掌握 Gevent库的使用。

经验:
1、猴子补丁的使用。
from gevent import monkey;monkey.patch_all() 必须写上,否则效果同单线程。
2、知乎网站采用了Ajax技术动态传递数据,Chrome浏览器F12,在Network菜单下的XHR栏下找到相应的链接,其返回数据为字典和列表多重嵌套的字典。乍看有点复杂,需要理清结构。
3、需要加入cookies信息,并且请求时需要传递一些参数,才可以获得数据。详见如下代码。

import json
import time
from gevent import monkey
monkey.patch_all()
import gevent
import requests
# Request headers: a desktop Chrome User-Agent plus a Referer so the API
# treats the request like a normal browser session.
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','referer': 'https://www.zhihu.com/'}
# Session cookies copied from a logged-in browser session (DevTools -> Network).
# NOTE(review): these are hard-coded, account-specific credentials that expire;
# replace them with your own values before running, and avoid committing real
# tokens (z_c0 is the auth token) to source control.
cookies={'zap':'da8ecdcd-d34f-4a63-b3d1-359c9bac8b39','xsrf':'13d0d229-8dcb-433f-b9d7-0915f123a85d','d_c0':"AIDvme8HYhCPTkMTA2o7iEEo-wjVcXhxhHE=|1574236655",
         'l_n_c':'1','o_act':'login','ref_source':"zhuanlan",'n_c':'1','z_c0':"2|1:0|10:1575538804|4:z_c0|92:Mi4xbzFhekRRQUFBQUFBZ08tWjd3ZGlFQ1lBQUFCZ0FsVk5kQjdXWGdDVjdPYWJjd19qYzVFZ2VmaG9CLUk1N2g3Qm9n|429fa560ec2c44f45d04e07115865de1c924457fcc036dc5ffc6ab09396ddbe0",
         'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49':'1580886183,1582197434,1582555670,1582556436', 'tst':'r', 'q_c1':'14a6c46b4b1b4bb6a33b13a47f75dbb5|1582763822000|1575538817000',
'KLBRSID':'4843ceb2c0de43091e0ff7c22eadca8c|1582768246|1582762619', 'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49':'1582768257'}
# Ajax endpoint backing Zhihu's home-page "recommend" feed (found via the
# XHR tab in browser DevTools).
url='https://www.zhihu.com/api/v3/feed/topstory/recommend'


def downloader(page):
    """Fetch one page of the Zhihu recommend feed and pass it to get_data.

    Args:
        page: Page number of the feed to request (0-based as called below).

    Raises:
        requests.HTTPError: if the API responds with a non-2xx status.
        requests.Timeout: if the request takes longer than the timeout.
    """
    params = {'session_token': '6efd64f3eb88474475e2314d0703846c',
              'desktop': 'true',
              'page_number': page,
              'limit': '6',
              'action': 'down',
              # Pagination cursor the API expects; formula observed from the
              # site's own XHR requests — TODO confirm it still matches.
              'after_id': 2 * (page - 1) - 1}
    print("开始下载")
    # timeout prevents a single hung connection from blocking its greenlet
    # forever; raise_for_status fails loudly on 403/429 instead of crashing
    # later with an opaque JSON decode error.
    html = requests.get(url, params=params, headers=headers,
                        cookies=cookies, timeout=10)
    html.raise_for_status()
    # Polite throttle; with monkey.patch_all() this yields to other greenlets.
    time.sleep(1)
    get_data(html.json())

def get_data(resp):
    """Parse one page of recommend-feed JSON and print each answer entry.

    Args:
        resp: Decoded JSON dict from the recommend API; expected to contain
              a 'data' list of feed-entry dicts (tolerates a missing key).

    Returns:
        None. Output is printed, one line per answer.
    """
    for entry in resp.get('data', []):
        # The feed mixes in non-answer cards (ads, articles, etc.), so only
        # process entries whose target is a regular answer; .get() also
        # tolerates targets with no 'type' field.
        if 'target' not in entry or entry['target'].get('type') != 'answer':
            continue
        target = entry['target']
        question = target['question']
        comment_count = target['comment_count']
        author = target['author']['name']
        voteup_count = target['voteup_count']
        answer_count = question['answer_count']
        # Renamed from 'id'/'url' to avoid shadowing the builtin and the
        # module-level endpoint URL.
        question_id = question['id']
        title = question['title']
        answer_url = ('https://www.zhihu.com/question/' + str(question_id)
                      + '/answer/' + str(target['id']))
        print("作者", ':', author,
              '发布时间:', time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(entry['created_time'])),
              "问题:", title, '问题编号:', question_id,
              "评价数:", comment_count, "回答数:", answer_count,
              "赞同数:", voteup_count, '问题链接:', answer_url)

if __name__=="__main__":
    t1 = time.time()
    greenlets=[gevent.spawn(downloader,i) for i in range(60)]
    gevent.joinall(greenlets)
    # for i in range(60): #单线程
    #     downloader(i)
    t2=time.time()
    print("Time Used:",t2-t1) #Time: 3.8012173175811768  单线程:Time Used: 101.60381150245667

你可能感兴趣的:(python爬虫)