利用协程爬取某网站

爬取详情页

import gevent
from gevent import monkey
monkey.patch_all()
import requests
import redis
from queue import Queue
import json

# Request the comment pages of every shop id, one greenlet per page.
def request_detail(poiId_queue,comment_queue):
    """Download all comment pages for the given shops concurrently.

    Each shop's comment total is split into full pages of 300 comments plus,
    if the total is not a multiple of 300, one shorter trailing page.  One
    greenlet running ``func`` is spawned per page, and the greenlets of a
    shop are joined before the next shop is started.

    Args:
        poiId_queue: Sequence of shop ids (inserted into the URL with ``%d``).
        comment_queue: Sequence of comment totals, parallel to ``poiId_queue``.
            Extra entries in the longer sequence are ignored.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    # Comment API template: shop id, offset of the first comment, page size.
    comment_url = "https://www.meituan.com/meishi/api/poi/getMerchantComment?id=%d&offset=%d&pageSize=%d"
    # Number of comments requested per full page.
    page_size = 300

    for poi_id, comment_total in zip(poiId_queue, comment_queue):
        full_pages, remainder = divmod(comment_total, page_size)

        # (offset, size) of every page to fetch for this shop.
        pages = [(page * page_size, page_size) for page in range(full_pages)]
        if remainder:
            # The trailing partial page starts right after the last full page.
            # Computing the offset from full_pages (rather than from a loop
            # variable) keeps it correct when there are no full pages at all.
            pages.append((full_pages * page_size, remainder))

        jobs = []
        for offset, size in pages:
            page_url = comment_url % (poi_id, offset, size)
            print("正在请求:",page_url)
            jobs.append(gevent.spawn(func, page_url, headers, poi_id))

        # Wait for this shop's pages before moving on to the next shop.
        gevent.joinall(jobs)
# Greenlet worker: fetch one page of comments and store them in Redis.
def func(url,headers,poiId):
    """Fetch one comment page and push each comment onto a Redis list.

    Every comment is stored as a JSON object with the keys ``poiId``,
    ``user`` and ``comment`` on the Redis list ``foodComments``.  The Redis
    client is the module-level name ``rds``, which the ``__main__`` block
    creates before any greenlet is spawned.

    Args:
        url: Fully formatted comment API URL.
        headers: HTTP headers sent with the request.
        poiId: Shop id recorded with every stored comment.

    Redis errors are not caught here, so a storage failure shows up as a
    real error instead of being reported as "no comments".
    """
    try:
        # A timeout keeps one stalled connection from hanging its greenlet
        # (and therefore the joinall that waits for it) forever.
        res = requests.get(url=url,headers=headers,timeout=10)
    except requests.RequestException as e:
        print("请求失败:%s,原因:%s"%(url,e))
        return

    # Only the payload lookup is guarded: a body that is not JSON, or that
    # lacks data/comments, means there is nothing to store for this page.
    try:
        c_list = json.loads(res.text)["data"]["comments"]
    except (ValueError, KeyError, TypeError):
        print("该商家暂无评论信息!")
        return
    if not c_list:
        print("该商家暂无评论信息!")
        return

    for c in c_list:
        try:
            item = {
                "poiId": poiId,  # shop id
                "user": c["userName"],
                "comment": c["comment"],
            }
        except (KeyError, TypeError):
            # Skip a malformed entry instead of dropping the rest of the page.
            print("评论数据格式异常,已跳过:%r"%(c,))
            continue
        print(item)
        rds.lpush("foodComments",json.dumps(item))
        # Report success only after the push has actually happened.
        print("%s已经存入redis数据库!"%item["user"])


if __name__ == '__main__':
    # Step 1: read every shop record from the Redis list "foodlist".
    # `rds` stays a module-level name because func() stores the parsed
    # comments through it.
    rds = redis.StrictRedis(host="*******", port=6379, db=8)
    foodlist = rds.lrange("foodlist", 0, rds.llen("foodlist"))

    # Each entry is a JSON object; pull out the shop id and its comment count.
    # NOTE(review): the list-page parser further down in this file writes
    # poiId/title/avgScore/avgPrice/address to db=13 and no "comments" key --
    # confirm where the records read here (db=8, with "comments") come from.
    id_and_count = [
        (record["poiId"], record["comments"])
        for record in map(json.loads, foodlist)
    ]
    food_id_queue = [poi_id for poi_id, _ in id_and_count]
    comment_queue = [count for _, count in id_and_count]

    # Step 2: fetch the comment pages of every shop.
    request_detail(poiId_queue=food_id_queue, comment_queue=comment_queue)

爬取主页面

import requests
import re
import threading
from time import sleep
import redis
from queue import Queue
import json

# Crawler thread: downloads list pages and hands their HTML to the parsers.
class CrawlThread(threading.Thread):
    """Worker thread that downloads numbered list pages.

    Page numbers are taken from ``page_queue``; the HTML of every page that
    downloads successfully is put on ``data_queue``.  The thread ends once
    ``page_queue`` has no more numbers.

    Args:
        start_url: Base URL; page ``n`` is requested as ``<start_url>pn<n>/``.
        page_queue: Queue of page numbers still to be downloaded.
        data_queue: Queue that receives the HTML text of downloaded pages.
        name: Thread name, used in progress messages.
    """

    # Browser-like User-Agent sent with every page request.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    # Seconds to wait for a response before giving up on a page.
    REQUEST_TIMEOUT = 10

    def __init__(self,start_url,page_queue,data_queue,name):
        super().__init__()
        self.start_url = start_url
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.name = name

    def run(self):
        """Download pages until the page queue is exhausted.

        run() cannot return a value to whoever started the thread, so the
        downloaded HTML is passed on through ``data_queue`` instead.
        """
        from queue import Empty

        while True:
            # Take the number without blocking.  Checking empty() first and
            # then calling a blocking get() lets another thread grab the last
            # number in between, leaving this thread blocked forever.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break

            url = self.start_url + "pn" + str(page) + "/"
            print("当前线程为:%s,正在请求页面:%s"%(self.name,url))
            try:
                res = requests.get(url=url,headers=self.HEADERS,timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                # One failed page must not take the whole worker down.
                print("当前线程为:%s,请求页面失败:%s,原因:%s"%(self.name,url,e))
            else:
                self.data_queue.put(res.text)
                print("当前已经入队了%d个页面"%self.data_queue.qsize())
            # Pause between requests.
            sleep(1)

# Parser thread: turns downloaded list pages into shop records in Redis.
class ParseThread(threading.Thread):
    """Worker thread that parses list pages and stores the shops in Redis.

    HTML pages are taken from ``data_queue``.  Each shop found in a page is
    pushed as a JSON object onto the Redis list ``foodlist``.

    Args:
        data_queue: Queue of HTML page sources to parse.
        name: Thread name, used in progress messages.
        page_queue: Queue of page numbers not yet downloaded; it is only
            inspected to decide when there is no more work to wait for.
    """

    # Seconds to wait for a new page before checking whether crawling is over.
    IDLE_TIMEOUT = 10
    # Captures the JSON text between "poiLists": and ,"comHeader".
    POI_LISTS_PATTERN = re.compile(r'"poiLists":(.+),"comHeader"')
    # Fields copied from every shop entry, in the order they are stored.
    ITEM_FIELDS = ("poiId", "title", "avgScore", "avgPrice", "address")

    # Redis client, created on first use and then reused by this thread.
    _rds = None

    def __init__(self,data_queue,name,page_queue):
        super().__init__()
        self.data_queue = data_queue
        self.name = name
        # Keep the queue on the instance so run() does not depend on a
        # module-level variable that happens to have the same name.
        self.page_queue = page_queue

    def run(self):
        """Parse pages until no more pages can be expected.

        The thread waits up to ``IDLE_TIMEOUT`` seconds for a page.  It stops
        only when that wait times out and ``page_queue`` is empty, so pages
        already downloaded are still parsed after the last page number has
        been taken.  A download that takes longer than ``IDLE_TIMEOUT`` after
        the page queue has drained can still be missed.
        """
        from queue import Empty

        while True:
            try:
                # A blocking get with a timeout replaces the empty()/continue
                # spin, which burned CPU and never let idle threads finish.
                html = self.data_queue.get(timeout=self.IDLE_TIMEOUT)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            self.parse(html)

    def _redis_client(self):
        # Connect once per thread instead of once per stored item.
        if self._rds is None:
            self._rds = redis.StrictRedis(host="www.fanjianbo.com",port=6379,db=13)
        return self._rds

    def parse(self,html):
        """Extract the shop list from one page and store every shop in Redis.

        Pages that do not contain the expected payload, and shop entries that
        lack one of ``ITEM_FIELDS``, are reported and skipped.

        Args:
            html: Source text of one list page.
        """
        found = self.POI_LISTS_PATTERN.search(html)
        if found is None:
            print("当前线程为:%s,页面中未找到美食店列表,已跳过"%self.name)
            return
        try:
            foodlist = json.loads(found.group(1))["poiInfos"]
        except (ValueError, KeyError, TypeError):
            print("当前线程为:%s,美食店列表解析失败,已跳过"%self.name)
            return

        for food in foodlist or ():
            try:
                item = {field: food[field] for field in self.ITEM_FIELDS}
            except (KeyError, TypeError):
                print("当前线程为:%s,美食店数据格式异常,已跳过"%self.name)
                continue
            print("当前线程为:%s,正在向redis数据库中存入数据:%s"%(self.name,item['title']))
            self._redis_client().lpush("foodlist",json.dumps(item))


if __name__ == '__main__':
    start_url = "https://bj.meituan.com/meishi/"

    # Page numbers 1..67 that still have to be downloaded.
    # Keep this module-level name as it is: other code in this file may
    # refer to `page_queue` as a global.
    page_queue = Queue()
    for page_no in range(1, 68):
        page_queue.put(page_no)

    # HTML sources that the crawler threads have downloaded.
    data_queue = Queue()

    # Three crawler threads share the 67 pages.
    for crawler_name in ("爬虫1", "爬虫2", "爬虫3"):
        CrawlThread(
            start_url=start_url,
            page_queue=page_queue,
            data_queue=data_queue,
            name=crawler_name,
        ).start()

    # Three parser threads consume the downloaded pages.
    for parser_name in ("解析1", "解析2", "解析3"):
        ParseThread(
            data_queue=data_queue,
            name=parser_name,
            page_queue=page_queue,
        ).start()

你可能感兴趣的:(爬虫与数据分析)