1. Find a free high-anonymity proxy site and parse its page structure to pull out the useful fields.
import queue
import threading
from random import randint
from time import sleep
import requests
from bs4 import BeautifulSoup

def get_html(url, headers):
    """Fetch a page and return its text, or "" on any request error."""
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""
def get_proxy(html):
    """Yield "ip:port" strings parsed from the rows of the proxy table."""
    soup = BeautifulSoup(html, 'html.parser')  # pass the parser explicitly
    proxy_list = soup.find_all('tr')
    for row in proxy_list[1:]:  # skip the table header row
        yield row.select('td:first-child')[0].string + ":" + row.select('td:nth-child(2)')[0].string
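A quick sanity check of the two functions chained together (the URL follows the kuaidaili pattern used in step 4; the exact "ip:port" values depend on the live page):

headers = {'User-Agent': 'Mozilla/5.0'}
html = get_html("https://www.kuaidaili.com/free/inha/1/", headers)
for proxy in get_proxy(html):
    print(proxy)  # one "ip:port" string per table row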
2. Check whether each scraped proxy actually works (here http://icanhazip.com/ is used for validation).
def check_proxy(url, ip):
    """Request `url` through the proxy; return "" if the proxy fails."""
    try:
        requests.adapters.DEFAULT_RETRIES = 3  # retry count for new connections
        r = requests.get(url, proxies={'http': ip}, timeout=7)
        return r.text
    except requests.RequestException:
        return ""
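Used on its own it looks like this (the address below is a placeholder; substitute a proxy yielded by get_proxy):

ip = "1.2.3.4:8080"  # placeholder proxy, not a real address
if check_proxy("http://icanhazip.com/", ip):
    print(ip, "is alive")

icanhazip.com simply echoes the caller's public IP, so a non-empty response also tells you which address target sites will see when you go through that proxy.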
3. Write the valid IPs to a file.
def write_in(path, text):
    """Append one proxy per line."""
    with open(path, 'a') as f:  # the with-block closes the file automatically
        f.write(text + '\n')
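Because the file is opened in append mode ('a'), repeated runs would keep piling up old entries; that is why main() in step 5 truncates the file before crawling.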
4. A producer-consumer model ties the pieces together.
The producer scrapes free IPs and puts them on a queue:
def put_queue(headers):
    """Producer: scrape proxies page by page and put them on the queue."""
    for page in range(1, 500):
        proxy_url = f"https://www.kuaidaili.com/free/inha/{page}/"
        print(proxy_url)
        proxies = get_proxy(get_html(proxy_url, headers))
        sleep(randint(1, 3))  # throttle requests to avoid getting blocked
        for proxy in proxies:
            message.put(proxy)  # blocks when the queue is full
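Note that message is created as queue.Queue(100) in the main block below, so put blocks once 100 unverified proxies are waiting; the bounded queue automatically throttles the producer to the consumers' validation speed.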
The consumer takes IPs off the queue and checks whether they work:
def get_queue(path):
    """Consumer: pull proxies off the queue and keep the working ones."""
    check_url = "http://icanhazip.com/"
    while True:
        proxy = message.get()  # blocks until the producer supplies a proxy
        if check_proxy(check_url, proxy):
            print('valid ip:', proxy)
            write_in(path, proxy)
5. The main function (multithreaded start-up).
def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    path = 'proxies.txt'
    truncatefile(path)  # empty the output file before crawling
    t1 = threading.Thread(target=put_queue, args=(headers,))
    t1.start()
    for _ in range(20):  # 20 consumer threads validate proxies in parallel
        # daemon threads let the process exit once the producer is done
        t2 = threading.Thread(target=get_queue, args=(path,), daemon=True)
        t2.start()

if __name__ == '__main__':
    message = queue.Queue(100)  # bounded queue shared by producer and consumers
    main()
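truncatefile is called in main() but never defined in the original post; a minimal sketch that simply empties the output file:

def truncatefile(path):
    """Clear the output file so each run starts fresh."""
    with open(path, 'w'):
        pass

Define it (or paste this sketch) above main() before running.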