The script below already contains the trick for getting around the site's anti-scraping measure of rate-limiting by IP. See if you can spot it yourself; the next article will cover it in detail.
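Spoiler for the impatient: the trick is routing requests through the scraped proxies themselves (plus rotating the User-Agent per request). A minimal standalone sketch of the idea, with an illustrative proxy address and timeout that are my own choices, not part of the script:

from urllib import request

def fetch_via_proxy(url, proxy_ip):
    # Route the request through a proxy so the target site sees the proxy's
    # address instead of ours -- the usual way around per-IP rate limits.
    # Register the proxy for both schemes; urllib picks the entry whose key
    # matches the URL's scheme.
    handler = request.ProxyHandler({'http': proxy_ip, 'https': proxy_ip})
    opener = request.build_opener(handler)
    return opener.open(url, timeout=5).read()

# e.g. fetch_via_proxy('http://example.com', '1.2.3.4:8080')  # illustrative proxy

Now the full script: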
# coding=utf-8
# Author = ChristopherLam
# Deadline = 2017-04-19
# qq = 770304694
# csdn = http://blog.csdn.net/christopher_l1n
from urllib import request, error
from bs4 import BeautifulSoup
from random import choice
import time
import threading
import os
import re
class FreeProxyIPSpider:
    def __init__(self):
        self.User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        self.header = {'User-Agent': self.User_Agent}
        self.base_url = 'http://www.kuaidaili.com/free/inha/'
        self.file_path = os.path.join(os.getcwd(), 'FreeProxyIP')
        self.useable_file = os.path.join(os.getcwd(), 'UseableProxyIP')
        # Pool of real browser User-Agent strings; one is picked at random
        # for every request (see RandomUserAgent below).
        self.headers = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36']
        # '.' is escaped so the pattern matches 'baidu.com' literally when
        # checking a proxied response.
        self.regex = re.compile(r'baidu\.com')
    def struct_url(self, page):
        """Build the URL for one page of the proxy-list site."""
        return self.base_url + str(page) + '/'
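    # Usage example (illustrative): struct_url(2) returns
    # 'http://www.kuaidaili.com/free/inha/2/'.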
    def global_spider(self, url=None, proxy_ip=None):
        """General-purpose spider: given a url, scrape one page of the
        proxy-list site; given a proxy_ip, test the proxy by visiting
        Baidu through it."""
        if proxy_ip is None:
            try:
                req = request.Request(url, headers=self.RandomUserAgent())
                res = request.urlopen(req).read()
                soup = BeautifulSoup(res, 'html.parser')
                IPs = soup.find_all(name='td', attrs={'data-title': 'IP'})
                Ports = soup.find_all(name='td', attrs={'data-title': 'PORT'})
            except error.URLError as e:
                print(e)
                IPs = None
                Ports = None
            return IPs, Ports
        else:
            # Register the proxy for both schemes: urllib only routes a
            # request through the entry whose key matches the URL's scheme.
            opener_support = request.ProxyHandler({'http': proxy_ip, 'https': proxy_ip})
            opener = request.build_opener(opener_support)
            request.install_opener(opener)
            try:
                rsp = request.urlopen('https://www.baidu.com', timeout=3)
                string = rsp.read()
            except Exception:
                print('%s connect failed' % proxy_ip)
                return
            # Only count the proxy as usable if the page really came back
            # from baidu.com.
            if self.regex.search(str(string)):
                self.WriteUsableProxyIP(proxy_ip)
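    # For reference, the rows parsed above look roughly like this on
    # kuaidaili's free list (illustrative; the live markup may differ):
    #   <tr>
    #     <td data-title="IP">1.2.3.4</td>
    #     <td data-title="PORT">8080</td>
    #   </tr>
    # find_all() collects the IP and PORT cells in matching order.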
    def WriteUsableProxyIP(self, proxy_ip):
        """Append a working proxy ip to the UseableProxyIP file."""
        with open(self.useable_file, 'a', encoding='utf-8') as file:
            file.write(proxy_ip + '\n')  # the with block closes the file itself
    def RandomUserAgent(self):
        """Pick a random User-Agent header for the next request."""
        return {'User-Agent': choice(self.headers)}
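    # e.g. RandomUserAgent() might return (illustrative):
    # {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}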
    def PackagedSpider(self, page_num):
        """Wrap the scraping of one page of proxy ips."""
        url = self.struct_url(page_num)
        IPs, Ports = self.global_spider(url)
        if IPs is not None:  # IPs and Ports are either both set or both None
            with open(self.file_path, 'a', encoding='utf-8') as file:
                for ip_td, port_td in zip(IPs, Ports):
                    ip = ip_td.get_text(strip=True)
                    port = port_td.get_text(strip=True)
                    file.write(ip + ':' + port + '\n')
                    print(ip)
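    # The FreeProxyIP file ends up holding one candidate per line, e.g.
    # 1.2.3.4:8080 (illustrative address).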
    def __readFreeProxyIPfile(self):
        pass
    def VerifyProxyIP(self):
        """Check each scraped proxy ip; working ones are appended to the
        UseableProxyIP file."""
        with open(self.file_path, 'r', encoding='utf-8') as file:
            while True:
                lines = file.readlines(10000)  # read in ~10 KB batches
                if not lines:
                    break
                for i in lines:
                    self.global_spider(proxy_ip=i.strip())
def main():
    print('Author = ChristopherLam\nDeadline = 2017-04-19\nqq = 770304694\ncsdn = http://blog.csdn.net/christopher_l1n')
    spi = FreeProxyIPSpider()
    '''
    This was meant to scrape the proxy pages with multiple threads, but the
    source site's anti-crawl strategy is per-IP rate limiting, so the threads
    below only get our one IP blocked. Stick to single-threaded scraping, or,
    once you have working proxies, scrape through them with multiple threads
    (see the sketch after the script) -- implement that yourself. Building a
    custom multithreaded proxy-ip scraper is paid work; contact qq 770304694.
    Then again, I probably cannot be bothered either...
    for i in range(1, 3):
        spi_threading = threading.Thread(target=spi.PackagedSpider, args=(i,))
        spi_threading.setDaemon(True)
        spi_threading.start()
    '''
    spi.VerifyProxyIP()

if __name__ == '__main__':
    main()
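As promised in the comment above, here is a minimal sketch of what "scrape with multiple threads through the proxies you verified" could look like. Everything in it is illustrative: the page range, the thread-per-page layout, and the assumption that UseableProxyIP already holds working ip:port lines. It builds a private opener per thread instead of using install_opener, which would swap a process-wide global.

from urllib import request
from random import choice
import threading

def scrape_page_via_proxy(page, proxies):
    # Each thread exits through a randomly chosen proxy, so the per-IP
    # rate limit falls on the proxies rather than on our own address.
    proxy = choice(proxies)
    handler = request.ProxyHandler({'http': proxy, 'https': proxy})
    opener = request.build_opener(handler)  # thread-local opener, no global state
    url = 'http://www.kuaidaili.com/free/inha/%d/' % page
    try:
        html = opener.open(url, timeout=5).read()
        print('page %d fetched via %s (%d bytes)' % (page, proxy, len(html)))
    except Exception as e:
        print('page %d via %s failed: %s' % (page, proxy, e))

def threaded_scrape():
    with open('UseableProxyIP', encoding='utf-8') as f:
        proxies = [line.strip() for line in f if line.strip()]
    threads = [threading.Thread(target=scrape_page_via_proxy, args=(p, proxies))
               for p in range(1, 4)]  # illustrative page range
    for t in threads:
        t.start()
    for t in threads:
        t.join()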