Python网络爬虫实战——实时抓取西刺免费代理IP

本文参考了网上高手的示例程序,利用多线程技术抓取并校验代理,使用的 Python 版本为 2.7。

#-*-coding:utf8-*-

import urllib2
import re
import threading
import time

# Listing pages to scrape: pages 1 through 5 of the proxy site.
targets = [r"http://www.xici.net.co/nn/%d" % page for page in range(1, 6)]

# Module-level accumulators shared across the script:
#   rawProxyList     - scraped proxies awaiting validation; the main script
#                      slices this list to hand work to the checker threads.
#   checkedProxyList - 4-tuples whose last field is the measured response
#                      time, appended for each proxy that passes validation.
rawProxyList = []
checkedProxyList = []

#正则
p = re.compile(r''' 1:
                    checkedProxyList.append((proxy[0],proxy[1],proxy[2],timeused))
                else:
                    continue
            except Exception,e:
                continue

    def run(self):
        """Run this worker's job by delegating to ``self.checkProxy()``.

        NOTE(review): the enclosing class header is not visible in this
        listing; presumably this overrides ``threading.Thread.run`` so that
        ``start()`` executes the check in its own thread -- confirm against
        the full class definition.
        """
        self.checkProxy()

if __name__ == "__main__":
    # The whole pipeline lives inside the guard. Previously only the two
    # list initialisations were guarded, so importing this module started
    # the crawl and then failed with NameError on `getThreads`.

    # Stage 1: one fetcher thread per target page; wait for all of them.
    getThreads = [ProxyGet(url) for url in targets]
    for worker in getThreads:
        worker.start()
    for worker in getThreads:
        worker.join()

    print('.' * 10 + "总共抓取了%s个代理" % len(rawProxyList) + '.' * 10)

    # Stage 2: validate with 20 checker threads. The raw list is cut into
    # 20 consecutive slices of ceil(len / 20) items each; trailing slices
    # may be empty when there are few proxies.
    checkerCount = 20
    chunkSize = (len(rawProxyList) + checkerCount - 1) // checkerCount
    checkThreads = [
        ProxyCheck(rawProxyList[chunkSize * i:chunkSize * (i + 1)])
        for i in range(checkerCount)
    ]
    for worker in checkThreads:
        worker.start()
    for worker in checkThreads:
        worker.join()

    print('.' * 10 + "总共有%s个代理通过校验" % len(checkedProxyList) + '.' * 10)

    # Stage 3: persist the validated proxies, fastest first. Each entry is
    # a 4-tuple whose last field is the measured response time. The `with`
    # block guarantees the file is closed even if a write fails.
    with open("proxy_list.txt", 'w+') as f:
        for proxy in sorted(checkedProxyList, key=lambda entry: entry[3]):
            print("checked proxy is: %s:%s\t%s\t%s" % (proxy[0], proxy[1], proxy[2], proxy[3]))
            f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
[图片:Python网络爬虫实战——实时抓取西刺免费代理IP(第1张图片)]

你可能感兴趣的:(Python)