Implementing a simple crawler with requests (random UA, random proxies, a throttle, and content de-duplication)

Crawl the Runoob Python 3 tutorial
(http://www.runoob.com/python3/python3-tutorial.html)
and the related tutorial pages linked from it.

Preparation (all packages used)

import hashlib
import os
import pickle
import queue
import random
import re
import time
import zlib
from datetime import datetime
from os import path
from threading import Thread
from urllib import robotparser
from urllib.parse import urlparse, urlsplit, urljoin

import requests
from bson import Binary
from fake_useragent import UserAgent
from lxml import etree
from pymongo import MongoClient
from retrying import retry
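All of the third-party packages above (requests, fake-useragent, lxml, pymongo, which also provides bson, and retrying) are available from PyPI; the caching and de-duplication code below additionally assumes a MongoDB server listening on localhost:27017.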

Random proxies (thanks to Kuaidaili for providing the free proxies =3=)

class RandomProxy(object):
    def __init__(self):
        self.proxies = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
        }

    # Crawl free proxies from Kuaidaili
    def crawl_proxies(self):
        ua = UserAgent()
        headers = {
            "User-Agent": ua.random,  # 随机user-agent
        }
        base_url = "https://www.kuaidaili.com/free/inha/{}/"
        for i in range(1, 6):
            url = base_url.format(i)
            response = requests.get(url=url, headers=headers)
            tree = etree.HTML(response.text)
            ip_list = tree.xpath('//div[@id="list"]//tbody//tr')
            for proxy in ip_list:
                protocol = proxy.xpath('./td[4]/text()')[0].lower()  # requests expects lower-case proxy keys such as "http"
                ip = proxy.xpath('./td[1]/text()')[0]
                port = proxy.xpath('./td[2]/text()')[0]
                ip_port = ip + ":" + port
                if self.verify_proxies({protocol: ip_port}):
                    self.proxies.append({protocol: ip_port})
        # with open('proxies.json','w') as fp:
        #     fp.write(self.proxies)

    # Check whether a proxy actually works
    def verify_proxies(self, proxy):
        url = "http://www.baidu.com"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
        }
        try:
            response = requests.get(url=url, headers=headers, proxies=proxy, timeout=5)
        except requests.RequestException:
            return False  # a dead proxy raises an exception instead of returning a status code
        return response.status_code == 200

    # Return a random usable proxy, refreshing the list when it runs low
    def get_proxy(self):
        if len(self.proxies) < 2:
            self.crawl_proxies()
        return random.choice(self.proxies)
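
A minimal usage sketch (the target URL is just an example, and it assumes at least one proxy on the free list actually works): get_proxy() returns a dict in the shape requests expects for its proxies argument.

rp = RandomProxy()
proxy = rp.get_proxy()  # e.g. {"http": "1.2.3.4:8080"}
response = requests.get("http://www.runoob.com/python3/python3-tutorial.html",
                        headers=rp.headers, proxies=proxy, timeout=5)
print(response.status_code, proxy)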

Download throttle

class Throttle(object):
    def __init__(self, delay):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.domains = self.db.domain  # collection of per-domain last-download times
        self.delay = delay

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc  # domain part of the URL
        record = self.domains.find_one({"_id": domain_url})  # last download time for this domain, if any
        last_accessed = record['time'] if record else None
        if self.delay > 0 and last_accessed is not None:
            # Subtract the time elapsed since the last download from the delay;
            # if the result is positive, sleep that long, otherwise download right away.
            sleep_interval = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        # Record the current time as this domain's last download time (upsert inserts the record if missing).
        self.domains.update_one({'_id': domain_url}, {'$set': {'time': datetime.now()}}, upsert=True)
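
A quick sketch of how the throttle is meant to be used (the URLs are placeholders on the same domain): calling wait_url() right before each download spaces requests to one domain at least delay seconds apart.

throttle = Throttle(5.0)
for page in ("http://www.runoob.com/python3/python3-tutorial.html",
             "http://www.runoob.com/python3/python3-basic-syntax.html"):  # hypothetical second page
    throttle.wait_url(page)  # sleeps if the last hit on this domain was under 5 seconds ago
    print("downloading", page)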

A helper class with my own utility methods

class Help(object):
    def __init__(self):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.urls = self.db.urls  # collection used for content de-duplication

    # Parse the site's robots.txt and check whether this user agent may fetch the URL
    def get_robots(self, url, ua):
        rp = robotparser.RobotFileParser()
        robots_url = 'http://' + urlsplit(url).netloc + '/robots.txt'
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(ua, url)

    # Save downloaded content to a local file
    def save_url(self, html_content, url_str):
        filename = urlsplit(url_str).path.split("/")[-1]
        if not os.path.exists('./download/'):
            os.makedirs('./download/')
        filepath = path.join("./download/", filename)
        with open(filepath, 'wb') as fp:
            fp.write(html_content)

    # Extract links to related pages from the HTML
    def extractor_url_lists(self, html_content, keyword):
        # Pull the href value out of every <a> tag; re.IGNORECASE makes the match case-insensitive
        url_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        url_list = url_regex.findall(html_content.decode('utf8'))
        filter_urls = [link for link in url_list if re.search(keyword, link)]
        return set(filter_urls)

    # De-duplicate by URL: store the compressed page keyed by its URL in MongoDB
    # and compare MD5 digests of the stored copy and the freshly downloaded content
    def distinct(self, url_str, html_content):
        content = hashlib.md5(str(html_content).encode(encoding='utf8')).hexdigest()
        record = self.urls.find_one({'_id': url_str})
        if record:
            before_temp = pickle.loads(zlib.decompress(record["html_content"]))
            before = hashlib.md5(str(before_temp).encode(encoding='utf8')).hexdigest()
            # True only when the content has changed since the stored copy
            return before != content
        else:
            result = {"html_content": Binary(zlib.compress(pickle.dumps(html_content))), "timestamp": datetime.utcnow()}  # compress the data and attach a timestamp
            # upsert: insert if the key does not exist, otherwise update; $set overwrites the stored fields
            self.urls.update_one({"_id": url_str}, {'$set': result}, upsert=True)
            return True
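
A hypothetical round trip through the helper (the URL and page bytes are made up): the first distinct() call stores the page and returns True, while a second call with identical content returns False, so nothing is written again.

helper = Help()
page = b"<html><body>hello</body></html>"
url = "http://www.runoob.com/python3/example.html"  # placeholder URL
if helper.distinct(url, page):  # first time: stored in MongoDB, returns True
    helper.save_url(page, url)  # written to ./download/example.html
print(helper.distinct(url, page))  # unchanged content, so this prints False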

Define the maximum crawl depth

MAX_DEP = 2

The general-purpose crawler

class CrawlerCommon(Thread):
    def __init__(self, init_url):
        super().__init__()
        self.throttle = Throttle(5.0)  # throttle instance
        self.random_proxy = RandomProxy()  # random-proxy instance
        self.help = Help()  # helper instance
        __ua = UserAgent()  # random User-Agent
        self.headers = {"User-Agent": __ua.random}
        self.seed_url = init_url  # seed URL for the crawl
        self.crawler_queue = queue.Queue()  # a FIFO queue gives BFS; a LIFO queue would give DFS
        self.crawler_queue.put(init_url)  # put the seed URL into the queue
        self.client = MongoClient('localhost', 27017)  # MongoDB connection
        self.visited = {init_url: 0}  # crawl depth of the seed URL is 0

    # Retrying download: the @retry decorator re-runs it up to 3 times on failure
    @retry(stop_max_attempt_number=3)
    def retry_download(self, url_str, data, method, proxies):
        self.throttle.wait_url(url_str)
        if method == "POST":
            result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
        # Assert that the status code is 200; a failed assertion raises AssertionError and triggers a retry
        assert result.status_code == 200
        return result.content

    # The actual download entry point
    def download(self, url_str, data=None, method="GET", proxies={}):
        print("download url is ::::", url_str)
        try:
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:
            print(e)
            result = None
        return result

    # Main crawl loop
    def run(self):
        keyword = input("请输入关键字:")
        # print(self.crawler_queue.get())
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            if self.help.get_robots(url_str, self.headers["User-Agent"]):
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    proxy = self.random_proxy.get_proxy()
                    html_content = self.download(url_str, proxies=proxy)
                    if html_content is not None:
                        if self.help.distinct(url_str, html_content):  # content de-duplication
                            self.help.save_url(html_content, url_str)  # save new or changed content to a file
                    else:
                        continue
                    url_list = self.help.extractor_url_lists(html_content, keyword)  # extract links containing the keyword (here: links mentioning python3)

                    for url in url_list:
                        if "http" not in url:
                            url = urljoin("http://www.runoob.com", url)  # make relative links absolute
                        if url not in self.visited:
                            self.visited[url] = depth + 1
                            self.crawler_queue.put(url)

            else:
                print("robots.txt 禁止下载:", url_str)

Run it

if __name__ == "__main__":
    crawler = CrawlerCommon("http://www.runoob.com/python3/python3-tutorial.html")
    crawler.start()
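
CrawlerCommon subclasses Thread, so start() launches run() in a background thread and returns immediately; adding a join() makes the main thread wait explicitly for the queue to drain before doing anything else. A minimal variant of the entry point under that assumption:

if __name__ == "__main__":
    crawler = CrawlerCommon("http://www.runoob.com/python3/python3-tutorial.html")
    crawler.start()
    crawler.join()  # block until run() has emptied its queue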
