Python web scraping with the Scrapy framework: building your own IP proxy pool

1. http://www.xicidaili.com/wt is a free proxy-listing site in China.

2. Use Scrapy to crawl the IP addresses and ports listed on the site and write them to a txt file.

3. Write a script that tests whether each IP and port in the txt file actually works.

4. Write the working IPs and ports to a separate txt file.
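The code below assumes a standard Scrapy project named ip (matching the ip.items import path used in the spider later on), created with Scrapy's usual project generator:

scrapy startproject ip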

————————————————————————
1. Write the Item class
Since we only need the IP address and port, a single field is enough.

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class IpItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class IpInfoItem(scrapy.Item):
    # a single field holding the "ip:port" string for each proxy
    ip = scrapy.Field()

2. Write the spider

# -*- coding: utf-8 -*-
import scrapy
import sys
sys.path.append("D:\\pycodes\\ip")  # make the project package importable when run from elsewhere
from ip.items import IpInfoItem

class IpSpider(scrapy.Spider):
    name = 'Ip'

    # crawl the first 5 listing pages of the site
    start_urls = ['http://www.xicidaili.com/wt/' + str(i) for i in range(1, 6)]

    def parse(self, response):
        for sel in response.xpath('//tr'):
            ip = sel.xpath('.//td[2]/text()').extract_first()
            port = sel.xpath('.//td[3]/text()').extract_first()
            # skip the table header row, whose cells yield no text
            if ip is None or port is None:
                continue
            item = IpInfoItem()  # create a fresh item for each table row
            item['ip'] = ip + ":" + port
            yield item
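The spider is then run from the project directory with the standard Scrapy command, using the name defined above:

scrapy crawl Ip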

3. Write the pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class IpPipeline(object):
    def process_item(self, item, spider):
        return item


class IpInfoPipeline(object):

    def process_item(self, item, spider):
        try:
            # we only need the IP address and port, so write just that value to the txt file
            content = item['ip']
            with open("xinresult.txt", "a") as f:
                f.write(content + "\n")
        except KeyError:
            pass
        return item
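As the generated comment at the top of the file says, the pipeline must be registered in the ITEM_PIPELINES setting before Scrapy will call it. Assuming the project layout above, the entry in settings.py would look like the following (300 is just a conventional priority value):

ITEM_PIPELINES = {
    'ip.pipelines.IpInfoPipeline': 300,
}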

At this point we have scraped 5 pages of IPs from the site; next we need a script to test them:

import requests

alive_ip = []

def test_alive(proxy):
    for proxies_be in proxy:
        # requests expects proxies as a dict mapping scheme to address
        proxies = {"http": proxies_be}

        print("Testing: {}".format(proxies))
        try:
            r = requests.get("http://www.baidu.com", proxies=proxies, timeout=2)
            if r.status_code == 200:
                print("Success, proxy: {}".format(proxies))
                alive_ip.append(proxies_be)
            else:
                print("Failed")
        except requests.exceptions.RequestException:
            print("Failed")


def out_file(alive_ip):
    with open("alive_ip.txt", "w") as f:
        for ip in alive_ip:
            f.write(str(ip) + "\n")
        print("Done writing output")

def test(filename):
    with open(filename, "r") as f:
        lines = f.readlines()
        # strip trailing newlines to get bare "ip:port" strings
        proxys = list(map(lambda x: x.strip(), lines))
        test_alive(proxys)

    out_file(alive_ip)


test("xinresult.txt")
