Python 3.x - Scrapy - MySQL: Xici Proxy (西刺代理)

Without further ado, let's get straight to the code.

items.py

import scrapy

class DailiItem(scrapy.Item):
    ip_ = scrapy.Field()    # proxy IP address
    port_ = scrapy.Field()  # proxy port
    type_ = scrapy.Field()  # proxy type: HTTP or HTTPS

xici.py
Crawl pages 1-5 of the high-anonymity (高匿) proxy list and extract each proxy's IP, port, and type with CSS selectors or XPath. Each proxy is then validated against Baidu; only proxies that respond successfully are yielded as items for further handling in pipelines.py.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DailiItem
import requests


class XiciSpider(CrawlSpider):
    name = 'xici'
    allowed_domains = ['xicidaili.com']
    start_urls = ['http://www.xicidaili.com/nn/']
    rules = (
        Rule(LinkExtractor(allow=r'^http://www\.xicidaili\.com/nn/[2-5]?$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):

        # css
        # html = response.css('#ip_list tr:not(:first-child)')
        # ip_list = html.css('td:nth-child(2)::text').extract()
        # port_list = html.css('td:nth-child(3)::text').extract()
        # type_list = html.css('td:nth-child(6)::text').extract()

        # xpath
        ip_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[2]/text()').extract()
        port_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[3]/text()').extract()
        type_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[6]/text()').extract()

        for ip_, port_, type_ in zip(ip_list, port_list, type_list):
            # requests expects lowercase scheme keys and a full proxy URL,
            # e.g. {'http': 'http://1.2.3.4:8080'}; the site reports the type
            # in uppercase (HTTP/HTTPS)
            proxies = {type_.lower(): type_.lower() + "://" + ip_ + ":" + port_}
            try:
                # a blocking requests call inside a Scrapy callback stalls the
                # async engine, but is tolerable for a small validation crawl
                if requests.get('http://www.baidu.com', proxies=proxies, timeout=3).status_code == 200:
                    print("***Success: " + type_ + "://" + ip_ + ":" + port_)
                    item = DailiItem()
                    item["ip_"] = ip_
                    item["port_"] = port_
                    item["type_"] = type_
                    yield item
            except requests.RequestException:
                print("***Failure: " + type_ + "://" + ip_ + ":" + port_)

pipelines.py
Store the items that passed validation in the MySQL database.

# -*- coding: utf-8 -*-

import pymysql

class DailiPipeline(object):

    def open_spider(self, spider):
        # open a single connection for the whole crawl instead of one per item
        self.conn = pymysql.connect(host="127.0.0.1", user="root",
                                    passwd="root", db="daili", charset="utf8mb4")

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        # a parameterized query avoids SQL injection from scraped strings;
        # pymysql does not autocommit by default, so commit explicitly
        sql = "insert into proxy1(ip, port_, type) values (%s, %s, %s)"
        with self.conn.cursor() as cursor:
            cursor.execute(sql, (item["ip_"], item["port_"], item["type_"]))
        self.conn.commit()
        return item
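
The pipeline assumes the daili database and a proxy1 table already exist; the post doesn't show the schema, so here is a one-time setup sketch (column names match what the pipeline inserts into; the column types and sizes are assumptions):

import pymysql

# one-time setup; run once before the first crawl
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root")
with conn.cursor() as cursor:
    cursor.execute("create database if not exists daili character set utf8mb4")
    cursor.execute("""
        create table if not exists daili.proxy1 (
            ip    varchar(15) not null,  -- dotted IPv4 address
            port_ varchar(5)  not null,  -- stored as text, matching the item field
            type  varchar(10) not null   -- HTTP or HTTPS
        )
    """)
conn.commit()
conn.close()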

settings.py
Enable USER_AGENT and ITEM_PIPELINES.

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'

ITEM_PIPELINES = {
    'daili.pipelines.DailiPipeline': 300,
}
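
Once the crawl has finished (run scrapy crawl xici from the project root), the stored proxies can be read back and plugged straight into requests. A minimal consumption sketch, assuming the table schema above:

import pymysql
import requests

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="daili")
with conn.cursor() as cursor:
    # pick one stored proxy at random
    cursor.execute("select ip, port_, type from proxy1 order by rand() limit 1")
    ip, port, type_ = cursor.fetchone()
conn.close()

proxies = {type_.lower(): type_.lower() + "://" + ip + ":" + port}
print(requests.get("http://www.baidu.com", proxies=proxies, timeout=3).status_code)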
