Code for the spider file:
import scrapy
import requests  # used to test whether a scraped proxy actually works


class XiciSpider(scrapy.Spider):
    name = "xici"
    allowed_domains = ["xicidaili.com"]

    def start_requests(self):
        urls = [
            "http://www.xicidaili.com/nn/1/",
            "http://www.xicidaili.com/nn/2",
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        table = response.xpath("//table[@id='ip_list']")[0]  # locate the table that holds the proxy list
        trs = table.xpath(".//tr")[1:]  # relative ".//tr" stays inside the table; [1:] skips the header row (country, IP address, port, server location, anonymity, type, speed, connect time, uptime, verified time)
        for tr in trs:
            pagetest = "http://www.baidu.com.cn/"  # page used for the test request
            ip = tr.xpath("td[2]/text()").extract()[0]
            port = tr.xpath("td[3]/text()").extract()[0]
            proxy = "http://" + ip + ":" + port
            proxies = {
                "http": proxy,
            }
            try:
                test_response = requests.get(pagetest, timeout=1, proxies=proxies)
                print(test_response.status_code)
                if test_response.status_code == 200:  # a 200 status code means the proxy works
                    yield {
                        'ip': ip,
                        'port': port,
                    }
            except requests.exceptions.RequestException:
                print("connect failed!")
Code for items.py:
import scrapy


class ScrapeipItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
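The spider above yields plain dicts, so ScrapeipItem is never actually used. A small sketch of building the Item instead (the package name scrapeIP is taken from settings.py; the helper name is mine, not part of the project):

from scrapeIP.items import ScrapeipItem  # package name taken from settings.py


def make_item(ip, port):
    # The spider's `yield {...}` could yield this instead, so the fields
    # declared in ScrapeipItem are actually enforced.
    item = ScrapeipItem()
    item["ip"] = ip
    item["port"] = port
    return item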
Code for pipelines.py (stores the results in MongoDB for proxy rotation):
import pymongo


class ScrapeipPipeline(object):
    def process_item(self, item, spider):
        client = pymongo.MongoClient(host='127.0.0.1', port=12345)
        db = client['ip']
        coll = db['sheet']
        # loose de-duplication: proxies are unstable, so up to two copies of the same IP are allowed
        if coll.count_documents({'ip': item['ip']}) <= 1:
            coll.insert_one(dict(item))
        return item
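The stored proxies are meant for rotation, but the rotation code itself is not part of the project above. A minimal downloader-middleware sketch that reads a random IP from the same collection could look like this (the class name and wiring are assumptions):

import random

import pymongo


class RandomProxyMiddleware(object):
    """Sketch of a rotating-proxy downloader middleware; the name and
    settings entry are assumptions, not part of the original project."""

    def __init__(self):
        client = pymongo.MongoClient(host='127.0.0.1', port=12345)
        self.coll = client['ip']['sheet']

    def process_request(self, request, spider):
        docs = list(self.coll.find({}, {'ip': 1, 'port': 1, '_id': 0}))
        if docs:
            doc = random.choice(docs)
            request.meta['proxy'] = "http://%s:%s" % (doc['ip'], doc['port'])

It would be enabled in settings.py with something like DOWNLOADER_MIDDLEWARES = {'scrapeIP.middlewares.RandomProxyMiddleware': 543} (module path assumed), so it runs before Scrapy's built-in HttpProxyMiddleware.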
Code for settings.py:
BOT_NAME = 'scrapeIP'
SPIDER_MODULES = ['scrapeIP.spiders']
NEWSPIDER_MODULE = 'scrapeIP.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapeIP (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
COOKIES_ENABLED = False
AJAXCRAWL_ENABLED = True
ITEM_PIPELINES = {
    'scrapeIP.pipelines.ScrapeipPipeline': 400,
}
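With the project laid out as above, the spider is started with `scrapy crawl xici` from inside the project directory, or, if you prefer, from a plain Python script (a sketch, assuming it is run from the scrapeIP project root so the project settings can be found):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("xici")
process.start()  # blocks until the crawl finishes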