多线程爬取免费代理IP

思路:

1、找到一家免费的高匿代理IP网站,解析网页结构获取有用的内容

def get_html(url, headers):
	"""Fetch *url* and return the decoded body, or "" on any request failure.

	Args:
		url: Page URL to download.
		headers: Dict of HTTP headers (e.g. a User-Agent) passed to requests.

	Returns:
		The response text with encoding guessed from the body, or "" when
		the request fails or the server returns a non-2xx status.
	"""
	try:
		r = requests.get(url, headers=headers, timeout=10)
		r.raise_for_status()
		# Guess the real encoding from the content; free-proxy sites often
		# mis-declare the charset in their response headers.
		r.encoding = r.apparent_encoding
		return r.text
	except requests.RequestException:
		# Narrowed from a bare ``except``: only swallow network/HTTP
		# errors, not programming errors such as NameError.
		return ""


def get_proxy(html):
	"""Yield "ip:port" strings parsed from a free-proxy listing page.

	Args:
		html: Raw page HTML; an empty string yields nothing.

	Yields:
		Proxy address strings such as "1.2.3.4:8080".
	"""
	# Name the parser explicitly: relying on BeautifulSoup's auto-detection
	# emits a warning and may pick different parsers on different machines.
	soup = BeautifulSoup(html, 'html.parser')
	rows = soup.find_all('tr')
	# The first <tr> is the table header, so skip it (the original did this
	# with an obscure ``i += 1`` inside the loop).
	for row in rows[1:]:
		cells = row.select('td')
		if len(cells) < 2:
			# Malformed row — skip instead of crashing the whole crawl.
			continue
		ip = cells[0].string
		port = cells[1].string
		if ip and port:
			# Guard against None .string values (nested tags) which made
			# the original raise TypeError on concatenation.
			yield ip.strip() + ":" + port.strip()

2、检查爬取到的代理IP是否有效。(这里用http://icanhazip.com/进行校验)

def check_proxy(url, ip):
	"""Return the response body if *ip* works as an HTTP proxy, else "".

	Args:
		url: Validation endpoint (here http://icanhazip.com/).
		ip: Proxy address in "host:port" form.

	Returns:
		The body returned through the proxy, or "" on any request failure.
	"""
	try:
		# NOTE(review): DEFAULT_RETRIES is a module-wide global — setting
		# it here affects every requests call in the process, not just
		# this one. Kept for compatibility with the original behavior.
		requests.adapters.DEFAULT_RETRIES = 3
		r = requests.get(url, proxies={'http': ip}, timeout=7)
		return r.text
	except requests.RequestException:
		# Narrowed from a bare ``except``: a dead proxy raises a
		# requests-level error; anything else should surface.
		return ""

3、将有效IP写入文档。

def write_in(path, text):
	"""Append *text* plus a trailing newline to the file at *path*.

	Args:
		path: File to append to (created if it does not exist).
		text: Line to write, without a trailing newline.
	"""
	# ``with`` closes the file automatically; the original's explicit
	# f.close() inside the with-block was redundant. An explicit encoding
	# avoids platform-dependent defaults.
	with open(path, 'a', encoding='utf-8') as f:
		f.write(text + '\n')

4、这里使用生产者消费者模型:

生产者爬取免费IP并放入队列

def put_queue(headers):
	"""Producer: crawl free-proxy pages and push each "ip:port" onto the
	shared ``message`` queue.

	Args:
		headers: HTTP headers forwarded to get_html (User-Agent etc.).
	"""
	for page in range(1, 500):
		proxy_url = f"https://www.kuaidaili.com/free/inha/{page}/"
		print(proxy_url)
		proxies = get_proxy(get_html(proxy_url, headers))
		# Throttle between pages so the site does not ban the crawler.
		sleep(randint(1, 3))
		# Iterate the generator directly instead of the original
		# while/next/StopIteration dance — same traversal, idiomatic form.
		for proxy in proxies:
			message.put(proxy)

消费者从队列中取出免费IP并验证其有效性

def get_queue(path):
	"""Consumer: pull proxies off the shared ``message`` queue forever,
	validate each one against icanhazip, and append working ones to *path*.

	Args:
		path: Output file that valid proxies are appended to.
	"""
	check_url = "http://icanhazip.com/"
	while True:
		candidate = message.get()
		# Guard clause: skip dead proxies, persist the live ones.
		if not check_proxy(check_url, candidate):
			continue
		print('有效ip', candidate)
		write_in(path, candidate)

5、主函数(多线程启动)

def main():
	"""Start one producer and twenty consumer threads.

	The producer crawls proxy pages into the shared ``message`` queue;
	the consumers validate entries and persist working proxies.
	"""
	headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
	path = 'proxies.txt'
	truncatefile(path)  # empty the output file before crawling

	producer = threading.Thread(target=put_queue, args=(headers,))
	producer.start()

	consumers = []
	for _ in range(20):
		# daemon=True fixes a hang in the original: consumers loop forever
		# on message.get(), so non-daemon threads kept the process alive
		# indefinitely after crawling finished.
		t = threading.Thread(target=get_queue, args=(path,), daemon=True)
		consumers.append(t)

	for t in consumers:
		t.start()

	# Block until the producer has crawled every page; daemon consumers
	# are reaped when the process exits afterwards.
	producer.join()


if __name__ == '__main__':
	# Shared producer/consumer queue. maxsize=100 applies back-pressure:
	# the crawler blocks on put() when validation falls behind.
	message = queue.Queue(100)
	main()

 

你可能感兴趣的:(爬虫,python,proxy,queue)