Without further ado, let's get straight to work.
items.py
import scrapy

class DailiItem(scrapy.Item):
    # Trailing underscores avoid clashes with reserved words in Python/SQL
    ip_ = scrapy.Field()    # proxy IP address
    port_ = scrapy.Field()  # proxy port
    type_ = scrapy.Field()  # protocol type (HTTP / HTTPS)
xici.py
Crawl pages 1-5 of the high-anonymity proxy list, extracting the IP, port, and protocol type with CSS selectors or XPath. Each proxy is then validated against Baidu; only proxies that pass the check yield an item, which pipelines.py processes further.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DailiItem
import requests

class XiciSpider(CrawlSpider):
    name = 'xici'
    allowed_domains = ['xicidaili.com']
    start_urls = ['http://www.xicidaili.com/nn/']

    # Follow the pagination links for pages 1-5 of the high-anonymity list
    rules = (
        Rule(LinkExtractor(allow=r'^http://www.xicidaili.com/nn/[2-5]?$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # css
        # html = response.css('#ip_list tr:not(:first-child)')
        # ip_list = html.css('td:nth-child(2)::text').extract()
        # port_list = html.css('td:nth-child(3)::text').extract()
        # type_list = html.css('td:nth-child(6)::text').extract()
        # xpath
        ip_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[2]/text()').extract()
        port_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[3]/text()').extract()
        type_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[6]/text()').extract()
        for ip_, port_, type_ in zip(ip_list, port_list, type_list):
            # requests expects {'http': 'http://ip:port'}; the site lists the
            # type in upper case, so normalize it to lower case
            proxy_url = type_.lower() + '://' + ip_ + ':' + port_
            proxies = {type_.lower(): proxy_url}
            try:
                if requests.get('http://www.baidu.com', proxies=proxies, timeout=3).status_code == 200:
                    print('***Success: ' + proxy_url)
                    item = DailiItem()
                    item['ip_'] = ip_
                    item['port_'] = port_
                    item['type_'] = type_
                    yield item
            except requests.RequestException:
                print('***Failure: ' + proxy_url)
pipelines.py
Store the item data returned after validation in the database.
# -*- coding: utf-8 -*-
import pymysql

class DailiPipeline(object):
    def process_item(self, item, spider):
        conn = pymysql.connect(host='127.0.0.1', user='root',
                               passwd='root', db='daili', charset='utf8')
        try:
            with conn.cursor() as cursor:
                # Parameterized query avoids quoting bugs and SQL injection
                sql = 'insert into proxy1(ip, port_, type) values (%s, %s, %s)'
                cursor.execute(sql, (item['ip_'], item['port_'], item['type_']))
            # pymysql does not autocommit by default; commit so rows persist
            conn.commit()
        finally:
            conn.close()
        return item
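The pipeline assumes a local MySQL database named daili containing a proxy1 table. Here is a one-off setup sketch; the column types and sizes are my assumptions, inferred from the insert statement rather than taken from the post:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', charset='utf8')
with conn.cursor() as cursor:
    # Column sizes are assumptions; adjust to your needs
    cursor.execute('create database if not exists daili')
    cursor.execute(
        'create table if not exists daili.proxy1 ('
        'ip varchar(64), port_ varchar(16), type varchar(16))'
    )
conn.commit()
conn.close()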
settings.py
Enable USER_AGENT and ITEM_PIPELINES.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
ITEM_PIPELINES = {
    'daili.pipelines.DailiPipeline': 300,
}
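Two further settings are commonly added for this kind of crawl; they are not part of the original configuration, so treat them as assumptions:

# Not in the original settings: skip robots.txt and slow the crawl down
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1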