Scrapy结合MySQL,使用ORM:Peewee

Peewee的官方文档见:http://docs.peewee-orm.com/ 。

首先,在items.py里建立Model和MySQL连接。

import scrapy
from peewee import *

# Shared MySQL connection used by every Peewee model in this module.
# NOTE(review): credentials are hard-coded here — move them into Scrapy
# settings or environment variables before deploying.
db = MySQLDatabase("test",host='127.0.0.1',port=3306,user='test', passwd='test', charset='utf8')

class GoodsItem(scrapy.Item):
    """Scrapy item carrying the three scraped string fields.

    Field names mirror the columns of the `Goods` Peewee model so the
    pipeline can map the item onto a database row one-to-one.
    """
    one = scrapy.Field()
    two = scrapy.Field()
    three = scrapy.Field()

class Goods(Model):
    """Peewee model mapped to the `goods` table in MySQL.

    `one` is the primary key, so inserting a duplicate raises MySQL
    error 1062 (duplicate entry) — the pipeline relies on that to skip
    rows it has already stored.
    """
    # `one` is the primary key
    one = CharField(verbose_name="one", max_length=100, primary_key=True, null=False)
    two = CharField(verbose_name="two", max_length=200, null=False)
    # Fixed: the original line carried a stray duplicated ` null=False)`
    # at the end, which is a SyntaxError and made the module unimportable.
    three = CharField(verbose_name="three", max_length=80, null=False)

    class Meta:
        # Bind this model to the module-level MySQL connection above.
        database = db

接下来在spiders目录下新建一个爬虫脚本,写入代码 :

import scrapy
from ..items import GoodsItem
from bs4 import BeautifulSoup
import datetime

class mySpider(scrapy.Spider):
    """Minimal spider: fetch one page and emit a single GoodsItem."""
    name = 'test'

    def start_requests(self):
        url = 'https://xxxx.xxxl.com'
        # Fixed: the original passed `url=ur` — `ur` is an undefined
        # name and raised NameError as soon as the crawl started.
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Placeholder extraction — replace the 'xxx' values with real
        # selectors (response.css / BeautifulSoup) for your target site.
        item = GoodsItem()
        item['one'] = 'xxx'
        item['two'] = 'xxx'
        item['three'] = 'xxx'
        yield item

然后在pipelines.py里执行数据库操作:

from .items import Goods
class MySQLStorePipeline(object):
    """Persist each scraped item as a `Goods` row, skipping duplicates.

    Creates the table lazily on first use; a MySQL 1062 error (duplicate
    entry on the primary key `one`) is treated as an already-stored row.
    """

    def process_item(self, item, spider):
        # Create the table the first time an item comes through.
        # Fixed idiom: `not Goods.table_exists()` instead of `== False`.
        if not Goods.table_exists():
            Goods.create_table()
        try:
            Goods.create(one=item['one'], two=item['two'], three=item['three'])
        except Exception as e:
            # NOTE(review): a narrower `except peewee.IntegrityError` would
            # be preferable; kept broad to preserve the original behavior
            # of printing any other database error instead of crashing.
            if str(e.args[0]) == '1062':
                # MySQL 1062 = duplicate primary key -> row already stored.
                print('重复数据,跳过。')
            else:
                print(e.args[0], e.args[1])

        return item

最后记得在settings.py里启用ITEM_PIPELINES:

ITEM_PIPELINES = {
   'tmproject.pipelines.MySQLStorePipeline': 300,
}

到此大功告成!

你可能感兴趣的:(Scrapy结合MySQL,使用ORM:Peewee)