At first I was scraping xicidaili, but I hit it too fast and my IP got banned... so for now I can only crawl from home. Tomorrow I plan to persist the scraped IPs to Redis and automatically test and grade each proxy after it is fetched (rough sketches of both ideas are included after the code below).
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request, Spider
from pyquery import PyQuery

from ..items import ProxyItem


class ProxySpider(Spider):
    name = 'proxy'
    allowed_domains = ['proxy']
    start_urls = ['http://proxy/']
    page = 1
    xicidaili_url = "http://www.xicidaili.com/nn/{page}"
    kuaidaili_url = "https://www.kuaidaili.com/free/inha/{page}/"
    _66daili_url = "http://www.66ip.cn/areaindex_{page}/1.html"
    ip3366_url = "http://www.ip3366.net/?stype=1&page={page}"

    def start_requests(self):
        yield Request(url=self.kuaidaili_url.format(page=self.page), callback=self.kuaidaili_parse)
        yield Request(url=self._66daili_url.format(page=self.page), callback=self._66_daili_parse)
        yield Request(url=self.ip3366_url.format(page=self.page), callback=self.ip3366_parse)
        # yield Request(url=self.xicidaili_url.format(page=1), callback=self.xicidaili_parse)

    def kuaidaili_parse(self, response):
        pq = PyQuery(response.text)
        proxies = pq.find("#list .table-bordered tbody").find("tr")
        for proxy in proxies.items():
            # Create a fresh item per row; reusing one instance across yields
            # lets later rows overwrite items that are still in the pipeline.
            item = ProxyItem()
            ip = proxy.find("td").eq(0).text()
            port = proxy.find("td").eq(1).text()
            item["proxy"] = ip + ":" + port
            print("Got proxy from %s: IP:%s PORT:%s" % ("www.kuaidaili.com", ip, port))
            yield item
        now_page = int(response.url.split("/")[-2])
        next_page = now_page + 1
        if next_page <= 10:
            yield Request(url=self.kuaidaili_url.format(page=str(next_page)), callback=self.kuaidaili_parse, dont_filter=True)

    def _66_daili_parse(self, response):
        pq = PyQuery(response.text)
        proxies = pq.find("#footer table tr:gt(0)")
        for proxy in proxies.items():
            item = ProxyItem()
            ip = proxy.find("td").eq(0).text()
            port = proxy.find("td").eq(1).text()
            item["proxy"] = ip + ":" + port
            print("Got proxy from %s: IP:%s PORT:%s" % ("http://www.66ip.cn", ip, port))
            yield item
        now_page = int(response.url.split("/")[-2].split("_")[1])
        next_page = now_page + 1
        if next_page <= 34:
            yield Request(url=self._66daili_url.format(page=str(next_page)), callback=self._66_daili_parse, dont_filter=True)

    def ip3366_parse(self, response):
        pq = PyQuery(response.text)
        proxies = pq.find("#list table tbody tr:gt(0)")
        for proxy in proxies.items():
            item = ProxyItem()
            ip = proxy.find("td").eq(0).text()
            port = proxy.find("td").eq(1).text()
            item["proxy"] = ip + ":" + port
            print("Got proxy from %s: IP:%s PORT:%s" % ("www.ip3366.net", ip, port))
            yield item
        now_page = int(response.url.split("=")[2])
        next_page = now_page + 1
        if next_page <= 10:
            yield Request(url=self.ip3366_url.format(page=str(next_page)), callback=self.ip3366_parse, dont_filter=True)

    # def xicidaili_parse(self, response):
    #     print(response)
    #     page = int(response.url.split("/")[-1])
    #     pq = PyQuery(response.text)
    #     item = XicidailiItem()
    #     proxies = pq.find("#ip_list").find("tr")
    #     total_page = int(pq.find("#body .pagination a").eq(-2).text())
    #     self.page += 1
    #     for proxy in proxies.items():
    #         ip = proxy.find("td").eq(1).text()
    #         port = proxy.find("td").eq(2).text()
    #         item["proxy_info"] = ip + ":" + port
    #         # print(item["proxy_info"])
    #         yield item
    #     page += 1
    #     if page <= 10:
    #         yield Request(url=self.xicidaili_url.format(page=str(page)), callback=self.xicidaili_parse, dont_filter=True)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Item, Field


class ProxyItem(Item):
    # The spider stores the whole proxy as a single "ip:port" string.
    proxy = Field()
    ip = Field()
    port = Field()
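The Redis persistence mentioned at the top is not written yet (the ITEM_PIPELINES entry in the settings below is still commented out). Below is a minimal sketch of what a `ProxyPool.pipelines.ProxyPipeline` could look like, assuming the `redis` Python client and a local Redis instance; the `REDIS_URL` setting and the `proxies` set key are made-up names, not existing project code.

# pipelines.py -- a minimal sketch, not yet wired into the project
import redis


class ProxyPipeline(object):
    """Persist every scraped proxy into a Redis set."""

    def __init__(self, redis_url):
        self.redis_url = redis_url
        self.client = None

    @classmethod
    def from_crawler(cls, crawler):
        # REDIS_URL is a hypothetical setting; fall back to a local instance.
        return cls(crawler.settings.get("REDIS_URL", "redis://localhost:6379/0"))

    def open_spider(self, spider):
        self.client = redis.StrictRedis.from_url(self.redis_url, decode_responses=True)

    def process_item(self, item, spider):
        # A set deduplicates identical "ip:port" strings automatically.
        self.client.sadd("proxies", item["proxy"])
        return item

Enabling it would only require uncommenting the ITEM_PIPELINES block in settings.py below.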
# -*- coding: utf-8 -*-
# Scrapy settings for ProxyPool project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'ProxyPool'
SPIDER_MODULES = ['ProxyPool.spiders']
NEWSPIDER_MODULE = 'ProxyPool.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ProxyPool (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 4
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'ProxyPool.middlewares.ProxypoolSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'ProxyPool.middlewares.ProxypoolDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# 'ProxyPool.pipelines.ProxyPipeline': 300,
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 30
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
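The "automatically test and grade proxies" part is also still only a plan. Here is one rough idea of how it could work: after a crawl, check every stored proxy against a stable URL and raise or lower its score in a Redis hash. Everything in this sketch (the `proxies` set, the `proxy_scores` hash, the httpbin test URL, and the scoring constants) is an assumption for illustration, not existing project code.

# check_proxies.py -- hypothetical standalone grader, run after a crawl
import redis
import requests

TEST_URL = "http://httpbin.org/ip"   # assumed test endpoint
MAX_SCORE, MIN_SCORE, INITIAL_SCORE = 100, 0, 10


def grade_proxies():
    client = redis.StrictRedis.from_url("redis://localhost:6379/0", decode_responses=True)
    for proxy in client.smembers("proxies"):
        score = int(client.hget("proxy_scores", proxy) or INITIAL_SCORE)
        try:
            resp = requests.get(
                TEST_URL,
                proxies={"http": "http://" + proxy, "https": "http://" + proxy},
                timeout=10,
            )
            ok = resp.status_code == 200
        except requests.RequestException:
            ok = False
        # Reward working proxies, punish dead ones, and drop the hopeless.
        score = min(score + 1, MAX_SCORE) if ok else score - 10
        if score <= MIN_SCORE:
            client.srem("proxies", proxy)
            client.hdel("proxy_scores", proxy)
        else:
            client.hset("proxy_scores", proxy, score)


if __name__ == "__main__":
    grade_proxies()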