Scraping Zhihu Daily with Scrapy -- pipelines

When an article does not yet exist, save it to D:/知乎日报/latest; when the file already exists, and it is still in latest, move it to past.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import shutil

import pdfkit


class ZhihudailyPipeline(object):
    def process_item(self, item, spider):
        try:
            filename = os.path.basename(item["filename"])
            dirname = os.path.dirname(item["filename"])
            if not self.file_exists(filename, dirname):
                # New article: render the page into latest/ as a PDF.
                print('*' * 20)
                print(item["filename"], "downloading")
                print('*' * 20)
                print('\n')
                pdfkit.from_url(item["url"],
                                os.path.join(dirname, 'latest', filename))
            else:
                print('*' * 20)
                print("file already exists")
                print('*' * 20)
                print('\n')
                # Known article: if it is still in latest/, retire it to past/.
                latest_dir = os.path.join(dirname, 'latest')
                for _root, _dirs, files in os.walk(latest_dir):
                    if filename in files:
                        shutil.move(os.path.join(latest_dir, filename),
                                    os.path.join(dirname, 'past'))
                        print("file moved to past\n")
        except Exception:
            # wkhtmltopdf aborts with "Exit with code 1 due to network error:
            # ContentNotFoundError" when the page's CSS pulls in external
            # resources (fonts, images, iframes, ...) that cannot be fetched.
            # The PDF is usually produced anyway, so the error is ignored.
            pass

        return item

    def file_exists(self, filename, dirname):
        # Walk every subdirectory of dirname (latest/ and past/) for the file.
        for root, dirs, files in os.walk(dirname):
            if filename in files:
                print(filename, "already exists\n")
                return True
        print(filename, "does not exist!\n")
        return False
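
As the boilerplate comment at the top reminds us, the pipeline must also be enabled in settings.py. A minimal sketch, assuming the Scrapy project module is named zhihudaily (the dotted path and the priority 300 are assumptions to adjust for your own project):

# settings.py -- minimal sketch; module path and priority are assumptions
ITEM_PIPELINES = {
    'zhihudaily.pipelines.ZhihudailyPipeline': 300,
}

If the ContentNotFoundError noise is a problem, wkhtmltopdf's --load-error-handling option can be passed through pdfkit, e.g. pdfkit.from_url(item["url"], path, options={'load-error-handling': 'ignore'}), so that missing external resources no longer abort the conversion.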

But this has a BUG: if an article scraped in this run is soon taken off the Zhihu Daily front page, the spider never encounters it again, so the file stays in latest forever.
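
One way to plug that hole: record every filename seen during the current crawl, and when the spider closes, sweep anything left in latest that was not seen into past. A minimal sketch of the idea, added to the same pipeline class (open_spider and close_spider are Scrapy's standard pipeline hooks; the seen set and the hardcoded base directory are assumptions of this sketch):

import os
import shutil

class ZhihudailyPipeline(object):
    def open_spider(self, spider):
        self.seen = set()  # filenames handled in this crawl

    def process_item(self, item, spider):
        self.seen.add(os.path.basename(item["filename"]))
        # ... existing download/move logic from above ...
        return item

    def close_spider(self, spider):
        # Anything still in latest/ that this crawl never touched has been
        # dropped from the front page, so retire it to past/.
        latest_dir = "D:/知乎日报/latest"
        for name in os.listdir(latest_dir):
            if name not in self.seen:
                shutil.move(os.path.join(latest_dir, name), "D:/知乎日报/past/")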

