Scraping Xici proxy IPs + verifying they work + storing them in MongoDB

Code for the spider file:

import scrapy
import requests  # used to test whether each scraped IP actually works


class XiciSpider(scrapy.Spider):
    name = "xici"
    allowed_domains = ["xicidaili.com"]

    def start_requests(self):
        urls = ["http://www.xicidaili.com/nn/1/",
                "http://www.xicidaili.com/nn/2",
                ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        table = response.xpath("//table[@id='ip_list']")[0]  # locate the table that holds the IP list
        # ".//tr" keeps the search relative to the table; "//tr" would match the whole page.
        # [1:] skips the header row (country, IP address, port, server location,
        # anonymity, type, speed, connect time, uptime, last verified)
        trs = table.xpath(".//tr")[1:]
        for tr in trs:
            pagetest = "http://www.baidu.com.cn/"  # page used to test each proxy
            ip = tr.xpath("td[2]/text()").extract()[0]
            port = tr.xpath("td[3]/text()").extract()[0]
            proxy = "http://" + ip + ":" + port
            proxies = {
                "http": proxy,
            }
            try:
                # named "resp" so the Scrapy response argument is not shadowed
                resp = requests.get(pagetest, timeout=1, proxies=proxies)
                print(resp.status_code)
                if resp.status_code == 200:  # a 200 status code means the proxy works
                    yield {
                        'ip': ip,
                        'port': port,
                    }
            except requests.RequestException:  # catch only request errors, not everything
                print("connect failed!")

Code for items.py:

import scrapy


class ScrapeipItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
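As written, the spider yields plain dicts, so ScrapeipItem is defined but never used. If you prefer to route the data through the Item class, the yield in parse() could look like this (a sketch, assuming the project package is named scrapeIP as in settings.py):

    from scrapeIP.items import ScrapeipItem

    # inside parse(), in place of the plain dict:
    item = ScrapeipItem()
    item['ip'] = ip
    item['port'] = port
    yield item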

Code for pipelines.py (stores the results in MongoDB for later IP rotation):

import pymongo


class ScrapeipPipeline(object):

    def open_spider(self, spider):
        # connect once when the spider starts instead of once per item
        self.client = pymongo.MongoClient(host='127.0.0.1', port=12345)
        self.coll = self.client['ip']['sheet']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # rough de-duplication; the threshold is deliberately loose
        # because proxy IPs are unstable and may be worth re-checking
        if self.coll.count_documents({'ip': item['ip']}) <= 1:
            self.coll.insert_one(dict(item))
        return item
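If strict de-duplication were ever wanted instead of the loose count-based check above, a unique index on ip would let MongoDB reject duplicates itself. A sketch of that variant (UniqueIpPipeline is a hypothetical name, not part of this project):

    import pymongo


    class UniqueIpPipeline(object):  # hypothetical variant of ScrapeipPipeline
        def open_spider(self, spider):
            self.client = pymongo.MongoClient(host='127.0.0.1', port=12345)
            self.coll = self.client['ip']['sheet']
            self.coll.create_index('ip', unique=True)  # reject duplicate IPs at the DB level

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            try:
                self.coll.insert_one(dict(item))
            except pymongo.errors.DuplicateKeyError:
                pass  # this IP is already stored
            return item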

Code for settings.py:

BOT_NAME = 'scrapeIP'

SPIDER_MODULES = ['scrapeIP.spiders']
NEWSPIDER_MODULE = 'scrapeIP.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapeIP (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

COOKIES_ENABLED = False
AJAXCRAWL_ENABLED = True

ITEM_PIPELINES = {
    'scrapeIP.pipelines.ScrapeipPipeline': 400,
}
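Once the pool is populated, another crawl can rotate through it with a small downloader middleware. Below is a minimal sketch; RandomProxyMiddleware and its DOWNLOADER_MIDDLEWARES entry are assumptions, not part of the original project, and it reads the same db/collection the pipeline writes to:

    # middlewares.py (hypothetical)
    import random
    import pymongo


    class RandomProxyMiddleware(object):
        def __init__(self):
            # load the stored pool once at startup
            client = pymongo.MongoClient(host='127.0.0.1', port=12345)
            self.proxies = ["http://%s:%s" % (d['ip'], d['port'])
                            for d in client['ip']['sheet'].find()]
            client.close()

        def process_request(self, request, spider):
            if self.proxies:
                # the built-in HttpProxyMiddleware routes the request through this proxy
                request.meta['proxy'] = random.choice(self.proxies)

    # enabled in settings.py with:
    # DOWNLOADER_MIDDLEWARES = {
    #     'scrapeIP.middlewares.RandomProxyMiddleware': 543,
    # }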
