Without further ado, here's the code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/2/26 13:02
# @Author : One Fine
__author__ = "One Fine"
# @Site :
# @File : crawl_xici_ip.py
# @Software: PyCharm
import datetime

import MySQLdb
import requests
from scrapy.selector import Selector
connect = MySQLdb.connect(host='127.0.0.1', user='root963', passwd='963123',
db='spider_test', charset='utf8')
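# NOTE (added sketch): the script assumes a `spider_test`.`ip_proxy_pool` table
# already exists. A schema that fits the INSERT in crawl_ips() below could look
# like the following -- the column names and types are assumptions inferred from
# the code, not the author's original DDL:
#
#   CREATE TABLE `ip_proxy_pool` (
#       country          VARCHAR(32),
#       ip               VARCHAR(32) PRIMARY KEY,
#       port             VARCHAR(8),
#       server_address   VARCHAR(64),
#       anonymity        VARCHAR(16),
#       type             VARCHAR(16),
#       speed            FLOAT,
#       connection_time  FLOAT,
#       survival_time    VARCHAR(16),
#       verify_time      DATETIME
#   );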
class GetIP(object):
    def judge_ip(self, ip=None, port=None):
        # Check whether the proxy ip:port is still usable
proxy = {}
if ip:
proxy = {
                'http': "{0}:{1}".format(ip, port)  # or prefix with http://
}
print(proxy["http"])
test_http_url = "http://www.test.onefine.top/ip"
try:
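            # timeout=(connect timeout, read timeout), in seconds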
response = requests.get(test_http_url, proxies=proxy, timeout=(0.8, 2))
except Exception as e:
print("jugeg_ip exception: ", e)
self.delete_ip(ip)
return False
print(">>>status_code: ", response.status_code)
print(">>>text: ", response.text)
if 200 <= response.status_code < 300:
            print(str(response.status_code) + ", this IP is usable!\n")
return True
else:
            self.delete_ip(ip)  # remove this IP from the database
return False

    def delete_ip(self, ip=None):
        if not ip:  # ip is None
            return True
        # Remove the invalid IP from the database
        delete_sql = "delete from `ip_proxy_pool` where ip='{0}'".format(ip)
        cursor = connect.cursor()
        try:
            cursor.execute(delete_sql)
            connect.commit()
            print("Removed IP: %s." % ip)
            return True
        except Exception as e:
            print("delete_ip exception: ", e)
            print("Failed to remove IP: %s." % ip)
            return False
        finally:
            cursor.close()

    def get_random_ip(self):
        print("Fetching an IP, please wait...")
        random_sql = "select ip, port from ip_proxy_pool order by rand() limit 1;"
        cursor = connect.cursor()
        results = cursor.execute(random_sql)
        if results == 0:
            cursor.close()
            print("Failed to fetch an IP: the database is empty!")
            return False
        rows = cursor.fetchall()
        cursor.close()
        for ip_info in rows:
            ip = ip_info[0]
            port = ip_info[1]
            if self.judge_ip(ip, port):
                print("Usable proxy:", ip + ':' + port)
                return ip, port
            else:
                # judge_ip() has already removed the dead proxy; try another one
                return self.get_random_ip()

def get_ip():
    try:
        ip, port = GetIP().get_random_ip()
        print("Fetched successfully:", ip + ':' + port)
        return ip + ':' + port
    except Exception as e:
        print('exception: ', e)
        print('Failed to fetch a proxy!')
        return
proxy = {
'http': get_ip()
}
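# NOTE (added): on a first run the pool is empty, get_ip() returns None, and the
# crawl below then goes out over a direct connection (requests applies no proxy
# for a None value); later runs can route through a stored proxy.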
def crawl_ips():
    # Crawl domestic high-anonymity proxy IPs from xicidaili
url = 'https://www.xicidaili.com/nn/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/63.0.3239.132 Safari/537.36',
}
res = requests.get(url, headers=headers, proxies=proxy)
    if res.status_code != 200:
print('status_code: ', res.status_code)
return
selector = Selector(text=res.text)
    page_numbers = int(selector.xpath('//a[last()-1]/text()').extract_first())  # total number of pages
for i in range(1, page_numbers+1):
# if i == 1:
# response = res
response = requests.get(url+str(i), headers=headers, proxies=proxy)
selector = Selector(text=response.text)
# print(response.text)
all_trs = selector.xpath('//*[@id="ip_list"]//tr')
ip_list = []
        for tr in all_trs[1:]:  # skip the table header row
try:
                country = tr.xpath('td[1]/img/@alt').extract()[0]  # country
                server_address = tr.xpath('td[4]/a/text()').extract()[0]  # server location
except Exception as e:
print("crawl_ips exception 1: ", e)
# continue
country = server_address = None
            ip = tr.xpath('td[2]/text()').extract()[0]  # IP address
            port = tr.xpath('td[3]/text()').extract()[0]  # port
            anonymity = tr.xpath('td[5]/text()').extract()[0]  # anonymity level
            proxy_type = tr.xpath('td[6]/text()').extract()[0]  # type (HTTP/HTTPS)
            speed = tr.xpath('td[7]/div/@title').extract()[0]  # speed
            connection_time = tr.xpath('td[8]/div/@title').extract()[0]  # connection time
            survival_time = tr.xpath('td[9]/text()').extract()[0]  # survival time
            verify_time = tr.xpath('td[10]/text()').extract()[0]  # verification time
            ip_list.append((country, ip, port, server_address, anonymity, proxy_type,
                            speed, connection_time, survival_time, verify_time))
        # Save the results to the database
        cursor = connect.cursor()
        for ip_info in ip_list:
            speed = float(ip_info[6].split('秒')[0])  # strip the trailing '秒' (seconds) suffix
            connection_time = float(ip_info[7].split('秒')[0])
            verify_time = datetime.datetime.strptime("20" + ip_info[9] + ":00", "%Y-%m-%d %H:%M:%S")
            # Note: string values must be quoted when interpolated into the SQL below
sql = "INSERT INTO `ip_proxy_pool` VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}', {6}, " \
"{7}, '{8}', '{9}');".format(ip_info[0], ip_info[1], ip_info[2], ip_info[3], ip_info[4],
ip_info[5], speed, connection_time, ip_info[8], verify_time)
print('sql', sql)
try:
cursor.execute(sql)
connect.commit()
except Exception as e:
print('insert exception: ', e)
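            # NOTE (added sketch): the string-formatted SQL above (and delete_sql in
            # delete_ip) works for this data, but MySQLdb's parameterized form is the
            # safer, more idiomatic choice if any field could contain quotes -- a
            # minimal variant, not the author's original code:
            #
            #   insert_sql = ("INSERT INTO `ip_proxy_pool` VALUES "
            #                 "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
            #   cursor.execute(insert_sql, (ip_info[0], ip_info[1], ip_info[2],
            #                               ip_info[3], ip_info[4], ip_info[5], speed,
            #                               connection_time, ip_info[8], verify_time))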
if __name__ == '__main__':
crawl_ips()
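
To pull a validated proxy somewhere else (for example, from a Scrapy downloader middleware), the class can be used as below -- a minimal sketch, assuming the pool has already been filled by crawl_ips() and with example.com standing in for the target site:

# sketch: fetch one validated proxy and route a request through it
import requests

from crawl_xici_ip import GetIP

result = GetIP().get_random_ip()  # (ip, port) tuple, or False if the pool is empty
if result:
    ip, port = result
    proxies = {'http': "http://{0}:{1}".format(ip, port)}
    response = requests.get("http://example.com", proxies=proxies, timeout=(0.8, 2))
    print(response.status_code)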