爬取西刺代理
生成请求头
# encoding = utf-8
# A one-element tuple: a bare ("Header") is just a string, which would make
# ``from header import *`` try to export the letters "H", "e", "a", ...
__all__ = ("Header",)

import random


class Header(object):
    """Build forged HTTP request headers with a random browser User-Agent.

    The class is a singleton: every ``Header()`` call returns the same object.
    """

    # The shared instance, created lazily by ``__new__``.  A single leading
    # underscore is used on purpose: a double-underscore name is mangled to
    # ``_Header__instance`` inside the class body, so a lookup by the literal
    # string "__instance" never finds it and the singleton silently breaks.
    _instance = None

    def __init__(self):
        # Pool of User-Agent strings; one is picked at random per request.
        # ``__init__`` runs on every ``Header()`` call, which simply rebuilds
        # the same list on the shared instance.
        self.__user_agent = [
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)",  # IE
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",  # Firefox
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",  # Chrome
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",  # Taobao
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",  # Cheetah
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",  # 360
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",  # Safari
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",  # Sohu
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",  # Maxthon
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",  # UC
        ]

    @property
    def headers(self):
        """Return a forged header dict holding a random ``User-agent``."""
        return {"User-agent": self.user_agent}

    @property
    def user_agent(self):
        """Return one User-Agent string chosen at random from the pool."""
        return random.choice(self.__user_agent)

    def __new__(cls):
        """Create the instance on first use and hand back the same one after."""
        # Look in the class's own __dict__ so a subclass gets its own
        # singleton instead of inheriting the parent's instance.
        instance = cls.__dict__.get("_instance")
        if instance is None:
            instance = super().__new__(cls)
            cls._instance = instance
        return instance
ip 模块类
# Source-encoding note for the proxy-ip model module.
#encoding = utf-8;
# Names exported by a star-import of this module.
__all__ = ("IP_Model", "IP_List");
class IP_Model(object):
    """Hold all scraped fields of a single proxy server entry."""

    # Template used by ``__str__``; one placeholder per printed field.
    _STR_TEMPLATE = (
        "| country: {} |\n"
        "| ip: {} |\n"
        "| port: {} |\n"
        "| address: {} |\n"
        "| http_type: {} |\n"
        "| velocity: {}|\n"
    )

    def __init__(self):
        # Every backing field starts out unset, so reading a property,
        # printing, or serialising a fresh model never raises AttributeError.
        self._country = None
        self._ip = None
        self._port = None
        self._addres = None
        self._http_type = None
        self._velocity = None
        self._anonymous = False

    @property
    def country(self):
        """Country the proxy server is located in (``None`` when unknown)."""
        return self._country

    @country.setter
    def country(self, ip_country):
        self._country = ip_country

    @property
    def ip(self):
        """IP address of the proxy server."""
        return self._ip

    @ip.setter
    def ip(self, new_ip):
        self._ip = new_ip

    @property
    def port(self):
        """Port the proxy server listens on."""
        return self._port

    @port.setter
    def port(self, new_port):
        self._port = new_port

    @property
    def addres(self):
        """Province-level address of the server (``None`` when unknown)."""
        return self._addres

    @addres.setter
    def addres(self, new_addres):
        self._addres = new_addres

    @property
    def http_type(self):
        """Request type served by the proxy."""
        return self._http_type

    @http_type.setter
    def http_type(self, new_type):
        self._http_type = new_type

    @property
    def velocity(self):
        """Speed reported for the server."""
        return self._velocity

    @velocity.setter
    def velocity(self, http_velocity):
        self._velocity = http_velocity

    @property
    def anonymous(self):
        """``True`` when the proxy was labelled "高匿", otherwise ``False``."""
        return self._anonymous

    @anonymous.setter
    def anonymous(self, anonymous_text):
        # Only the exact label "高匿" counts as anonymous.
        self._anonymous = anonymous_text == "高匿"

    def __str__(self):
        """Return the model's fields formatted as a multi-line string."""
        return self._STR_TEMPLATE.format(
            self.country,
            self.ip,
            self.port,
            self.addres,
            self.http_type,
            self.velocity,
        )

    def to_dict(self):
        """Return the model's fields as a plain dict (``anonymous`` excluded)."""
        return {
            "country": self.country,
            "ip": self.ip,
            "port": self.port,
            "addres": self.addres,
            "http_type": self.http_type,
            "velocity": self.velocity,
        }

    def from_dict(self, dict):
        """Fill the model from a mapping shaped like the ``to_dict`` result.

        Missing keys leave the corresponding field as ``None``.

        :param dict: mapping with the keys produced by ``to_dict``
        """
        self.country = dict.get("country")
        self.ip = dict.get("ip")
        # Was ``dict("port")``, which calls the mapping and raises TypeError.
        self.port = dict.get("port")
        self.addres = dict.get("addres")
        self.http_type = dict.get("http_type")
        self.velocity = dict.get("velocity")

    def get_ip_proxies(self):
        """Return a one-entry ``{scheme: "ip:port"}`` proxies dict.

        The key is ``"https"`` only when ``http_type`` equals ``"https"``
        exactly; every other value falls back to ``"http"``.
        """
        scheme = "https" if self.http_type == "https" else "http"
        return {scheme: "{}:{}".format(self.ip, self.port)}
class IP_List(object):
    """Pair of proxy lists, one for http proxies and one for https proxies."""

    def __init__(self, http_list=None, https_list=None):
        """Store the two lists; both default to ``None`` (not yet loaded).

        :param http_list: proxies to use for http requests
        :param https_list: proxies to use for https requests
        """
        self.http_list = http_list
        self.https_list = https_list
保存到csv
# encoding = utf-8
# Helper functions used by simple_proxy to persist scraped proxy data.
import locale
import os

import pandas


def to_pandas_DataFrame(ips_list):
    """Convert an iterable of ip models into a pandas DataFrame.

    :param ips_list: iterable of objects exposing ``to_dict()``
    :return: a ``pandas.DataFrame`` with one row per item
    """
    return pandas.DataFrame([ip_model.to_dict() for ip_model in ips_list])


def to_csv(dicts, path="./ips_info.csv", encoding=None):
    """Append the given ip models to a csv file.

    :param dicts: iterable of objects exposing ``to_dict()``
    :param path: csv file to append to
    :param encoding: text encoding of the file; ``None`` selects the
        platform's preferred encoding
    """
    frame = to_pandas_DataFrame(dicts)
    if frame.empty:
        # Nothing to write; an empty frame would only add a bogus header line.
        return
    if encoding is None:
        # The previous hard-coded "ANSI" codec alias only exists on Windows
        # and raises LookupError elsewhere; ask the platform instead.
        encoding = locale.getpreferredencoding(False)
    # In append mode the column header must be written once only, otherwise
    # every call inserts another header row in the middle of the data.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, mode="a", header=write_header, encoding=encoding)


def read_csv(path, start, step):
    """Read ``step`` ip rows from a csv file, starting at row ``start``.

    NOTE: not implemented yet; the function currently returns ``None``.

    :param path: path of the csv file
    :param start: first row to read
    :param step: number of rows to read per call
    :return: intended to be the matching ip list
    """
    pass
爬取西刺主体代码
# encoding = utf-8
__all__ = ("html_to_dom", "ProxyIPWorm")

import re
import time

import requests
from bs4 import BeautifulSoup

from header import Header
from ip_model import IP_Model, IP_List
import save

# Captures the scheme in front of the first "://".  The previous greedy
# pattern "(.*):.*" ran up to the LAST colon, so "http://host:8080/" yielded
# "http://host" instead of "http".
_SCHEME_PATTERN = re.compile(r"([A-Za-z][A-Za-z0-9+.\-]*)://")


def simple_proxy(read_out):
    """Build a simple proxy-ip pool.

    NOTE: not implemented yet; the function currently returns ``None``.

    :param read_out:
    :return:
    """
    pass


def html_to_dom(url, header, proxies=None, timeout=10):
    """Fetch ``url`` with requests and parse the body with BeautifulSoup.

    :param url: url to request
    :param header: forged request headers
    :param proxies: optional requests-style proxies dict; ``None`` disables it
    :param timeout: seconds to wait before giving up, so a dead server or
        proxy cannot block the caller forever
    :return: the parsed document, or ``None`` when the status is not 200
    :raises requests.RequestException: on connection errors and timeouts
    """
    request_kwargs = {"headers": header, "verify": True, "timeout": timeout}
    if proxies is not None:
        request_kwargs["proxies"] = proxies
    response = requests.get(url, **request_kwargs)
    if response.status_code != 200:
        return None
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")


def proxy(url, ips, log):
    """Request ``url`` through the listed proxies until one succeeds.

    :param url: url to request; its scheme must be http or https
    :param ips: object carrying ``http_list`` and ``https_list``
    :param log: whether to print progress information
    :return: the parsed document from the first working proxy, or ``None``
        when no proxy produced one
    :raises RuntimeError: when ``ips`` is ``None`` or the scheme is unsupported
    """
    if ips is None:
        raise RuntimeError("代理列表为空")

    matched = _SCHEME_PATTERN.match(url)
    scheme = matched.group(1) if matched else None
    if scheme == "http":
        ip_list = ips.http_list
    elif scheme == "https":
        ip_list = ips.https_list
    else:
        raise RuntimeError("不支持此类请求")

    if log:
        print("请求类型{}\n".format(scheme))

    # A list that was never filled in (None) is treated like an empty one.
    for ip in ip_list or ():
        proxies = {scheme: "{}:{}".format(ip.ip, ip.port)}
        if log:
            print(proxies)
            print("当前ip:\n{}\n".format(ip))
        try:
            dom = html_to_dom(url, Header().headers, proxies)
        except requests.RequestException as error:
            # A dead proxy must not abort the search; move on to the next.
            if log:
                print("代理请求失败: {}\n".format(error))
            continue
        if dom is not None:
            return dom
    return None


class ProxyIPWorm(object):
    """Scrape proxy ips from the xicidaili listing pages."""

    def __init__(self):
        self.proxy_ip_html = "https://www.xicidaili.com/nn/"
        # Parsed listing page; ``None`` when the site did not answer 200.
        self.dom_tree = html_to_dom(self.proxy_ip_html, Header().headers)

    @property
    def start_page(self):
        """First page number; always 1."""
        return 1

    @property
    def end_page(self):
        """Total page count, read from the pagination links of ``dom_tree``.

        :raises RuntimeError: when the page was not fetched or has no
            usable pagination block
        """
        if self.dom_tree is None:
            raise RuntimeError("无法获取代理列表首页")
        page_dom = self.dom_tree.select(".pagination a")
        if len(page_dom) < 2:
            raise RuntimeError("无法解析总页数")
        # The second-to-last pagination link carries the last page number.
        self._end_page = page_dom[-2]
        return int(self._end_page.text)

    def page_url(self, type, page):
        """Build the listing url for the given request type and page number.

        :param type: "http" or "https"
        :param page: page number, from 1 up to ``end_page``
        :return: the url of that page
        :raises RuntimeError: when ``page`` is out of range or ``type`` invalid
        """
        if page < 1:
            raise RuntimeError("页数小于1")
        if page > self.end_page:
            raise RuntimeError("页数大于总页数")
        section = self.http_type(type)
        if page == 1:
            return "https://www.xicidaili.com/{}/".format(section)
        return "https://www.xicidaili.com/{}/{}".format(section, page)

    def http_type(self, type):
        """Map "http"/"https" to the site's url section name.

        :param type: "http" or "https"
        :return: "wt" for http, "wn" for https
        :raises RuntimeError: for any other value
        """
        if type == "http":
            return "wt"
        if type == "https":
            return "wn"
        raise RuntimeError("type应该为http或https")

    def get_page_ips(self, type, page):
        """Scrape every proxy listed on one page.

        :param type: "http" or "https"
        :param page: page number to scrape
        :return: list of ``IP_Model`` objects, one per table row
        :raises RuntimeError: when the page could not be fetched
        """
        url = self.page_url(type, page)
        print(url)
        page_dom = html_to_dom(url, Header().headers)
        if page_dom is None:
            raise RuntimeError("页面请求失败: {}".format(url))
        rows = page_dom.select("table tr")
        # rows[0] is the table header.  Rows with fewer than the seven cells
        # read by get_ip_info are skipped instead of raising IndexError.
        return [
            self.get_ip_info(row)
            for row in rows[1:]
            if len(row.select("td")) >= 7
        ]

    def get_ip_info(self, ip_dom):
        """Extract one proxy's details from a table row.

        :param ip_dom: html node of the row; its first seven ``td`` cells
            are read
        :return: an ``IP_Model`` filled from the row
        """
        ip_info = IP_Model()
        cells = ip_dom.select("td")

        flag = cells[0].img
        if flag is not None:
            ip_info.country = str(flag.get("alt"))
        ip_info.ip = cells[1].text
        ip_info.port = cells[2].text

        # Keep only the first whitespace-separated word; an empty cell leaves
        # the address unset instead of raising IndexError.
        address_words = cells[3].text.split()
        if address_words:
            ip_info.addres = address_words[0]

        ip_info.anonymous = cells[4].text
        ip_info.http_type = cells[5].text

        speed_bar = cells[6].div
        ip_info.velocity = speed_bar.get("title") if speed_bar is not None else None
        return ip_info

    def get_pages_ips(self, type, start_page, end_page, save_in=save.to_csv, delay=10):
        """Scrape all pages from ``start_page`` to ``end_page`` inclusive.

        :param type: "http" or "https"
        :param start_page: first page to scrape (at least 1)
        :param end_page: last page to scrape (included)
        :param save_in: callback invoked with each page's list; saves to csv
            by default
        :param delay: seconds to pause between two page requests
        :return: list with the ``IP_Model`` objects of every scraped page
        :raises RuntimeError: when the page range is invalid
        """
        if start_page >= end_page:
            raise RuntimeError("开始页大于等于结束页")
        if start_page < 1:
            raise RuntimeError("开始页小于1")
        if end_page > self.end_page:
            raise RuntimeError("结束页大于总页数")

        collected = []
        # ``end_page + 1`` because the end page itself must be scraped too.
        for page in range(start_page, end_page + 1):
            print("当前页:{}".format(page))
            page_list = self.get_page_ips(type, page)
            save_in(page_list)
            collected.extend(page_list)
            if page != end_page:
                # Pause between requests only; no wait after the last page.
                time.sleep(delay)
        return collected
测试代码
if __name__ == "__main__":
    # Manual smoke test: scrape the first page of http proxies, then fetch a
    # page through them with logging switched on.
    test = ProxyIPWorm()
    http_list = test.get_page_ips("http", 1)

    ips = IP_List()
    # Only the http list is filled in because the target url below is plain
    # http.  The https list is left at its default: the call that would have
    # produced it is disabled, so assigning it raised NameError.
    ips.http_list = http_list

    dom = proxy("http://news.gzcc.cn/html/xiaoyuanxinwen/", ips, True)
    print(dom)