Scrapy 中 pipeline 的数据去重和更新

class NewsEducationPipeline(object):
    """Scrapy item pipeline that saves ``NewsEducationItem`` rows to MySQL.

    Rows in the ``new_info`` table are de-duplicated by ``url``: when a row
    with the item's URL already exists it is updated in place, otherwise a
    new row is inserted.
    """

    def __init__(self):
        # Connection parameters are read from the project's settings module.
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.connect.close()

    def process_item(self, item, spider):
        """Stamp the item with the crawl time and upsert it into ``new_info``.

        Args:
            item: The scraped item. Only ``NewsEducationItem`` instances are
                written to the database; any other item is passed through.
            spider: The spider that produced the item (unused).

        Returns:
            The item itself, always, so that later pipelines keep receiving
            it. Database errors are logged and rolled back, not raised.

        Raises:
            KeyError: If a ``NewsEducationItem`` lacks one of the stored
                fields.
        """
        # Local import: the file's import block is not visible from here.
        import logging

        logger = logging.getLogger(__name__)

        item["crawled"] = datetime.now()
        if not isinstance(item, NewsEducationItem):
            # Not ours to store, but it must still flow down the pipeline.
            return item

        url = item["url"]
        # Column order shared by the UPDATE and INSERT statements below.
        row = (
            item["new_id"],
            item["title"],
            item["intro"],
            item["img"],
            url,
            item["kl"],
            item["time"],
            item["crawled"],
            item["media"],
            item["source"],
        )

        try:
            self.cursor.execute(
                """select * from new_info where url = %s""", (url,))
            if self.cursor.fetchone():
                self.cursor.execute(
                    """update new_info set new_id = %s,title = %s,intro = %s,img = %s,
                        url = %s,kl = %s,time = %s,crawled =%s,media =%s,source =%s
                        where url  = %s""",
                    row + (url,))
            else:
                self.cursor.execute(
                    """insert into new_info(new_id,title,intro,img,url,kl,time,crawled, media,source)
                      value (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                    row)
            # Commit on both paths; an update must be persisted as well.
            self.connect.commit()
        except Exception:
            # Best effort: keep the crawl running, but record what failed and
            # discard the half-finished transaction.
            logger.exception("错误: failed to save item with url %r", url)
            try:
                self.connect.rollback()
            except Exception:
                logger.exception("rollback failed for url %r", url)
        return item

 

你可能感兴趣的:(网络爬虫:Scrapy框架)