Python Crawler — Scraping Proxy IPs && Fast Verification

This work is licensed under a Creative Commons Attribution 4.0 International License.

The proxies come from the Xici free proxy list (西刺免费代理IP); the approach follows the post "python爬虫-爬取代理IP并通过多线程快速验证" (scrape proxy IPs, then verify them quickly with multiple threads).

CODE:
Fetching the proxy IPs

import requests
from bs4 import BeautifulSoup

# get proxies: walk pages 1-49 of the Xici list and append every
# ip:port pair to proxies.txt
for page in range(1, 50):
    url = 'http://www.xicidaili.com/nn/%s' % page
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
    headers = {'user-agent': user_agent}

    r = requests.get(url, headers=headers)
    print('opening %s\n' % url)

    soup = BeautifulSoup(r.content, 'lxml')
    # print(soup.prettify())

    # every proxy sits in a row of the table with id 'ip_list';
    # the first row is the header, so skip it
    trs = soup.find('table', id='ip_list').find_all('tr')

    for tr in trs[1:]:
        tds = tr.find_all('td')
        ip = tds[1].text.strip()
        port = tds[2].text.strip()
        with open('proxies.txt', 'a') as f:
            f.write('http://%s:%s\n' % (ip, port))
        print('Adding http://%s:%s to table' % (ip, port))
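
Because proxies.txt is opened in append mode, re-running the scraper piles duplicates into the file. A minimal de-duplication sketch (assuming the proxies.txt produced by the code above; entry order is not preserved):

with open('proxies.txt') as f:
    unique = set(line.strip() for line in f if line.strip())

# rewrite the file with one unique proxy per line
with open('proxies.txt', 'w') as f:
    f.write('\n'.join(unique) + '\n')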

Verification

import requests
import threading

srcfile = open('proxies.txt', 'r')
outfile = open('verified.txt', 'w')

# this endpoint echoes the visitor's IP back, so a successful response
# means the request actually went out through the proxy
url = 'http://pv.sohu.com/cityjson?ie=utf-8'
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
headers = {'user-agent': user_agent}

mutex = threading.Lock()

def verify():
    try:
        # each thread consumes exactly one line; the lock keeps
        # concurrent readline() calls from interleaving
        mutex.acquire()
        proxy = srcfile.readline().strip()
        mutex.release()

        print(proxy)
        proxies = {'http': proxy}

        # a proxy that cannot answer within 5 seconds is discarded
        r = requests.get(url, proxies=proxies, headers=headers, timeout=5)
        print(r.text)

        mutex.acquire()
        outfile.write('%s\n' % proxy)
        mutex.release()

    except requests.RequestException as e:
        print(e)


childthread = []

# one thread per proxy line (49 pages x roughly 100 entries each)
for i in range(4900):
    t = threading.Thread(target=verify)
    childthread.append(t)
    t.start()

for t in childthread:
    t.join()


srcfile.close()
outfile.close()
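
Starting 4,900 threads at once is heavy-handed. As an alternative (not the original post's code), a bounded pool from the standard library does the same verification with a fixed number of workers; this sketch reuses the url and headers above and assumes proxies.txt already exists:

from concurrent.futures import ThreadPoolExecutor

import requests

url = 'http://pv.sohu.com/cityjson?ie=utf-8'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}

def check(proxy):
    # return the proxy if it answers within 5 seconds, else None
    try:
        requests.get(url, proxies={'http': proxy}, headers=headers, timeout=5)
        return proxy
    except requests.RequestException:
        return None

with open('proxies.txt') as f:
    proxies = [line.strip() for line in f if line.strip()]

# 50 workers is an arbitrary choice; tune it to your bandwidth
with ThreadPoolExecutor(max_workers=50) as pool:
    results = pool.map(check, proxies)

with open('verified.txt', 'w') as f:
    for proxy in filter(None, results):
        f.write(proxy + '\n')

Since the pool caps concurrency and map() hands each worker its own proxy, no lock is needed around the file reads and writes.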
