Python Scrapy: writing data to MySQL (pipeline)

1、items.py

# -*- coding: utf-8 -*-

import scrapy


class LearnscrapyItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()

2、settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'learnscrapy'

SPIDER_MODULES = ['learnscrapy.spiders']
NEWSPIDER_MODULE = 'learnscrapy.spiders'

ROBOTSTXT_OBEY = True
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'learnscrapy.middlewares.USERAGENT': 1,
}

ITEM_PIPELINES = {
    'learnscrapy.pipelines.LearnscrapyPipeline': 300,
}

3、middlewares.py

# -*- coding: utf-8 -*-

# import the random module
import random

# import the proxy middleware base class used for the IP pool
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

# import the user agent middleware base class
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# IP pool middleware
class HTTPPROXY(HttpProxyMiddleware):
    # initialize; note the default must be ip=''
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        item = random.choice(IPPOOL)
        try:
            print("Current proxy IP: " + item["ipaddr"])
            request.meta["proxy"] = "http://" + item["ipaddr"]
        except Exception as e:
            print(e)
            pass

# the IP pool
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    {"ipaddr": "1222.94.128.49:8118"}
]

# user agent middleware
class USERAGENT(UserAgentMiddleware):
    # initialize; note the default must be user_agent=''
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        item = random.choice(UPPOOL)
        try:
            print("Current User-Agent: " + item)
            request.headers.setdefault('User-Agent', item)
        except Exception as e:
            print(e)
            pass

# the user agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]

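The settings.py in step 2 only registers the USERAGENT middleware. If requests should also go through the IP pool, the HTTPPROXY class defined above has to be enabled in DOWNLOADER_MIDDLEWARES as well; one possible registration, following the same pattern as the user agent entry (the priority numbers here are only illustrative), is:

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'learnscrapy.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'learnscrapy.middlewares.USERAGENT': 1,
}
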
4、pipelines.py

# -*- coding: utf-8 -*-

import pymysql


class LearnscrapyPipeline(object):
    def __init__(self):
        # database connection
        self.conn = pymysql.connect(host='192.168.126.181', user='wx', password='wx',
                                    database='test', charset='utf8')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        for j in range(0, len(item["name"])):
            nam = item["name"][j]
            lin = item["link"][j]
            sql = "insert into site(name,link) values(%s,%s)"
            self.cur.execute(sql, (nam, lin))
            self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
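The pipeline assumes a table named site with name and link columns already exists in the test database. A minimal sketch for creating it with pymysql (the id column and the VARCHAR lengths are assumptions; adjust them to your data):

import pymysql

conn = pymysql.connect(host='192.168.126.181', user='wx', password='wx',
                       database='test', charset='utf8')
cur = conn.cursor()
# the pipeline only requires the name and link columns; everything else is optional
cur.execute("""
    CREATE TABLE IF NOT EXISTS site (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        link VARCHAR(500)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()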

5、spiders/test.py

# -*- coding: utf-8 -*-

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from learnscrapy.items import LearnscrapyItem


class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com/']

    rules = (
        Rule(LinkExtractor(allow=('http://news.sohu.com',), allow_domains=('sohu.com',)),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        i = LearnscrapyItem()
        i['name'] = response.xpath('//div[@class="news"]/p/a/text()').extract()
        i['link'] = response.xpath('//div[@class="news"]/p/a/@href').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
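
The two XPath expressions assume the structure of Sohu's news pages at the time of writing; if the layout changes they simply return empty lists. They can be checked interactively with scrapy shell before running the full crawl, for example:

scrapy shell "http://news.sohu.com"
>>> response.xpath('//div[@class="news"]/p/a/text()').extract()
>>> response.xpath('//div[@class="news"]/p/a/@href').extract()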

6、main.py

from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'test'])
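
main.py simply calls Scrapy's command line from Python, so running it from the project root is equivalent to invoking the crawl directly:

python main.py
# or, equivalently
scrapy crawl test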
