PHP使用封装的curl解决跨域问题,爬虫无视任何限制

一、封装了一个curl_http.php类,代码如下:

<?php
/**
 * Usage example:
 * $curl = new Curl_HTTP_Client();
 * $curl->set_user_agent($useragent);
 * $curl->store_cookies("/tmp/cookies.txt");
 * $post_data = array('login' => 'pera', 'password' => 'joe');
 * $html_data = $curl->send_post_data("http://域名/login.php", $post_data);
 */
class Curl_HTTP_Client
{
	/** @var mixed cURL handle created by init() and reused by every request method */
	public $ch;
	/** @var bool echo diagnostics when true; the constructor overwrites this initial value */
	public $debug = true;
	/** @var mixed declared for an error message; never assigned in the visible code */
	public $error_msg;
	/** @var string declared for an error number; never assigned in the visible code */
	public $error_no = "";
	/**
	 * Build the client: remember the debug flag, then open the cURL session.
	 *
	 * @param bool $debug when true, request methods echo diagnostic text (default false)
	 */
	public function __construct($debug = false)
	{
		$this->debug = $debug;

		// init() creates the handle stored in $this->ch and applies the shared defaults
		$this->init();
	}
	/**
	 * Open the cURL session and apply the options shared by every request.
	 */
	public function init()
	{
		// curl_init() accepts an optional URL; none is given because each request sets its own
		$this->ch = curl_init();

		$defaults = array(
			// report HTTP status codes >= 400 as a cURL error instead of returning the page
			CURLOPT_FAILONERROR => true,
			// redirects are NOT followed: 0 disables Location: handling
			CURLOPT_FOLLOWLOCATION => 0,
			// accept gzip/deflate responses and let cURL decompress them
			CURLOPT_ENCODING => 'gzip, deflate',
			// NOTE(review): peer and host verification are both disabled so pages with invalid
			// certificates can be read; this also removes protection against interception.
			CURLOPT_SSL_VERIFYPEER => false,
			CURLOPT_SSL_VERIFYHOST => false,
		);

		// applied one by one, in the order listed above
		foreach($defaults as $option => $setting)
		{
			curl_setopt($this->ch, $option, $setting);
		}
	}
	//HTTP认证
	/**
	 * Set the user name and password used for HTTP authentication.
	 *
	 * @param string $username
	 * @param string $password
	 */
	public function set_credentials($username,$password)
	{
		// cURL expects the pair in "user:password" form
		$credentials = $username . ":" . $password;
		curl_setopt($this->ch, CURLOPT_USERPWD, $credentials);
	}
	//来源位置
	/**
	 * Set the value sent in the "Referer:" request header.
	 *
	 * @param string $referrer_url
	 */
	public function set_referrer($referrer_url)
	{
		$handle = $this->ch;
		curl_setopt($handle, CURLOPT_REFERER, $referrer_url);
	}
	/**
	 * Set the value sent in the "User-Agent:" request header.
	 *
	 * @param string $useragent
	 */
	public function set_user_agent($useragent)
	{
		$handle = $this->ch;
		curl_setopt($handle, CURLOPT_USERAGENT, $useragent);
	}
	/**
	 * Choose whether response headers are included in the output.
	 *
	 * @param mixed $value passed unchanged to CURLOPT_HEADER; a true value includes the headers
	 */
	public function include_response_headers($value)
	{
		$handle = $this->ch;
		curl_setopt($handle, CURLOPT_HEADER, $value);
	}
	/**
	 * Route requests through an HTTP proxy.
	 *
	 * @param string $proxy proxy address, passed unchanged to CURLOPT_PROXY
	 */
	public function set_proxy($proxy)
	{
		$handle = $this->ch;
		curl_setopt($handle, CURLOPT_PROXY, $proxy);
	}
	/**
	 * Send an application/x-www-form-urlencoded POST request and return the response.
	 *
	 * @param string       $url      target URL
	 * @param array|string $postdata associative array of field => value pairs (every key and
	 *                               value is URL-encoded here, so pass them raw), or a ready-made
	 *                               body string that is sent exactly as given
	 * @param string|null  $ip       optional interface name, IP or host name to bind to
	 * @param int          $timeout  maximum number of seconds the whole transfer may take
	 * @return string|false response data, or false when cURL reports an error
	 */
	public function send_post_data($url, $postdata, $ip=null, $timeout=10)
	{
		curl_setopt($this->ch, CURLOPT_URL, $url);
		// make curl_exec() return the response instead of printing it
		curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);

		// bind to a specific interface/IP when one was supplied
		if($ip)
		{
			if($this->debug)
			{
				echo "Binding to ip $ip\n";
			}
			curl_setopt($this->ch, CURLOPT_INTERFACE, $ip);
		}

		curl_setopt($this->ch, CURLOPT_TIMEOUT, $timeout);
		curl_setopt($this->ch, CURLOPT_POST, true);

		if(is_array($postdata))
		{
			$post_array = array();
			foreach($postdata as $key=>$value)
			{
				// Encode both sides: an unescaped '&', '=', '+', space or non-ASCII byte in a
				// field would otherwise split or corrupt the request body.
				$post_array[] = urlencode((string) $key) . "=" . urlencode((string) $value);
			}

			$post_string = implode("&", $post_array);

			if($this->debug)
			{
				echo "Url: $url\nPost String: $post_string\n";
			}
		}
		else
		{
			// caller supplied a finished body; send it untouched
			$post_string = $postdata;
		}

		// a string body (rather than an array) keeps the request urlencoded, not multipart
		curl_setopt($this->ch, CURLOPT_POSTFIELDS, $post_string);

		$result = curl_exec($this->ch);

		if(curl_errno($this->ch))
		{
			if($this->debug)
			{
				echo "Error Occured in Curl\n";
				echo "Error number: " .curl_errno($this->ch) ."\n";
				echo "Error message: " .curl_error($this->ch)."\n";
			}

			return false;
		}

		return $result;
	}
	/**
	 * Fetch a URL with an HTTP GET request and return the response.
	 *
	 * @param string      $url     URL to fetch
	 * @param string|null $ip      optional interface name, IP or host name to bind to
	 * @param int         $timeout maximum number of seconds the whole transfer may take
	 * @return string|false response data, or false when cURL reports an error
	 */
	public function fetch_url($url, $ip=null, $timeout=20)
	{
		$handle = $this->ch;

		curl_setopt($handle, CURLOPT_URL, $url);
		// switch the method back to GET in case an earlier request on this handle used POST
		curl_setopt($handle, CURLOPT_HTTPGET, true);
		// make curl_exec() return the response instead of printing it
		curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);

		// bind to a specific interface/IP when one was supplied
		if($ip)
		{
			if($this->debug)
			{
				echo "Binding to ip $ip\n";
			}
			curl_setopt($handle, CURLOPT_INTERFACE, $ip);
		}

		curl_setopt($handle, CURLOPT_TIMEOUT, $timeout);

		$response = curl_exec($handle);

		// success path first: a zero error number means the transfer completed
		if(!curl_errno($handle))
		{
			return $response;
		}

		if($this->debug)
		{
			echo "Error Occured in Curl\n";
			echo "Error number: " .curl_errno($handle) ."\n";
			echo "Error message: " .curl_error($handle)."\n";
		}

		return false;
	}

	/**
	 * Fetch data from target URL
	 * and store it directly to file	 	 
	 * @param string url	 
	 * @param resource value stream resource(ie. fopen)
	 * @param string ip address to bind (default null)
	 * @param int timeout in sec for complete curl operation (default 5)
	 * @return boolean true on success false othervise
	 * @access public
	 */
	public function fetch_into_file($url, $fp, $ip=null, $timeout=5)
	{
		$handle = $this->ch;

		curl_setopt($handle, CURLOPT_URL, $url);
		// switch the method back to GET in case an earlier request on this handle used POST
		curl_setopt($handle, CURLOPT_HTTPGET, true);
		// write the response into the caller's stream resource
		curl_setopt($handle, CURLOPT_FILE, $fp);

		// bind to a specific interface/IP when one was supplied
		if($ip)
		{
			if($this->debug)
			{
				echo "Binding to ip $ip\n";
			}
			curl_setopt($handle, CURLOPT_INTERFACE, $ip);
		}

		curl_setopt($handle, CURLOPT_TIMEOUT, $timeout);

		// the data goes to $fp, so the return value of curl_exec() is not needed
		curl_exec($handle);

		// success path first: a zero error number means the transfer completed
		if(!curl_errno($handle))
		{
			return true;
		}

		if($this->debug)
		{
			echo "Error Occured in Curl\n";
			echo "Error number: " .curl_errno($handle) ."\n";
			echo "Error message: " .curl_error($handle)."\n";
		}

		return false;
	}

	/**
	 * Send multipart post data to the target URL	 
	 * return data returned from url or false if error occured
	 * (contribution by vule nikolic, [email protected])
	 * @param string url
	 * @param array assoc post data array ie. $foo['post_var_name'] = $value
	 * @param array assoc $file_field_array, contains file_field name = value - path pairs
	 * @param string ip address to bind (default null)
	 * @param int timeout in sec for complete curl operation (default 30 sec)
	 * @return string data
	 * @access public
	 */
	public function send_multipart_post_data($url, $postdata, $file_field_array=array(), $ip=null, $timeout=30)
	{
		// Field data must be an array; reject anything else before the shared handle is modified.
		if(!is_array($postdata))
		{
			return false;
		}

		curl_setopt($this->ch, CURLOPT_URL, $url);
		// make curl_exec() return the response instead of printing it
		curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);

		// bind to a specific interface/IP when one was supplied
		if($ip)
		{
			if($this->debug)
			{
				echo "Binding to ip $ip\n";
			}
			curl_setopt($this->ch, CURLOPT_INTERFACE, $ip);
		}

		curl_setopt($this->ch, CURLOPT_TIMEOUT, $timeout);
		curl_setopt($this->ch, CURLOPT_POST, true);

		// send an empty Expect header so cURL does not add "Expect: 100-continue"
		$headers = array("Expect: ");
		curl_setopt($this->ch, CURLOPT_HTTPHEADER, $headers);

		// the urlencoded form of the fields is only ever shown as a diagnostic
		if($this->debug)
		{
			$post_string_array = array();
			foreach($postdata as $key=>$value)
			{
				$post_string_array[] = urlencode((string) $key)."=".urlencode((string) $value);
			}
			$post_string = implode("&",$post_string_array);

			echo "Post String: $post_string\n";
		}

		// turn each "field name => path" pair into an upload entry
		$upload_fields = array();
		if(!empty($file_field_array) && is_array($file_field_array))
		{
			$on_windows = strpos(PHP_OS, "WIN") !== false;
			foreach($file_field_array as $var_name => $var_value)
			{
				if($on_windows) $var_value = str_replace("/", "\\", $var_value); // win hack

				if(class_exists('CURLFile'))
				{
					// The "@path" prefix is deprecated since PHP 5.5 and is sent as a plain
					// string on PHP 7+, so use CURLFile wherever it exists.
					$upload_fields[$var_name] = new CURLFile($var_value);
				}
				else
				{
					// legacy PHP: "@" marks the value as a file path to upload
					$upload_fields[$var_name] = "@".$var_value;
				}
			}
		}

		// An array body makes cURL encode the request as multipart/form-data.
		$result_post = array_merge($postdata, $upload_fields);
		curl_setopt($this->ch, CURLOPT_POSTFIELDS, $result_post);

		$result = curl_exec($this->ch);

		if(curl_errno($this->ch))
		{
			if($this->debug)
			{
				echo "Error Occured in Curl\n";
				echo "Error: " .curl_errno($this->ch) ."\n";
				echo "Message: " .curl_error($this->ch)."\n";
			}

			return false;
		}

		return $result;
	}

	/**
	 * Set file location where cookie data will be stored and send on each new request
	 * @param string absolute path to cookie file (must be in writable dir)
	 * @access public
	 */
	public function store_cookies($cookie_file)
	{
		$handle = $this->ch;

		// file the cookies are written to when the handle is closed
		curl_setopt($handle, CURLOPT_COOKIEJAR, $cookie_file);
		// file the cookies are read from; Netscape format or plain HTTP-style header lines
		curl_setopt($handle, CURLOPT_COOKIEFILE, $cookie_file);
	}

	/**
	 * Set custom cookie
	 * @param string cookie
	 * @access public
	 */
	public function set_cookie($cookie)
	{
		// contents of the "Cookie:" header sent with the request
		$handle = $this->ch;
		curl_setopt($handle, CURLOPT_COOKIE, $cookie);
	}

	/**
	 * Get last URL info 
	 * usefull when original url was redirected to other location	
	 * @access public
	 * @return string url
	 */
	public function get_effective_url()
	{
		// last effective URL recorded on the handle
		$effective_url = curl_getinfo($this->ch, CURLINFO_EFFECTIVE_URL);

		return $effective_url;
	}

	/**
	 * Get http response code	 
	 * @access public
	 * @return int
	 */
	public function get_http_response_code()
	{
		// last HTTP status code received on the handle
		$status_code = curl_getinfo($this->ch, CURLINFO_HTTP_CODE);

		return $status_code;
	}

	/**
	 * Return last error message and error number
	 * @return string error msg
	 * @access public
	 */
	public function get_error_msg()
	{
		// Build the message so there is something to return; the previous code returned an
		// undefined variable and therefore always produced null.
		$err  = "Error: " .curl_errno($this->ch) ."\n";
		$err .= "Message3: " .curl_error($this->ch)."\n";

		// Still printed, with unchanged text, because printing was the only observable
		// behaviour existing callers could have relied on.
		echo $err;

		return $err;
	}
	//高亮HTML $line_number 显示行号
	function highlightHtml($code,$line_number=false)
	{
		//$code=$this->results;
		$code = htmlspecialchars($code);
		//$code = str_replace(" "," ",$code); //替换空格替换为 
		//$code = nl2br($code); //将回车替换为
//htm标签 //$code = preg_replace_callback('/<([a-zA-Z0-9]+)(.*?)(\/?>)/',array('WebLoad','__pv'), $code); $code = preg_replace('/(<[a-zA-Z0-9]+)/', '$1', $code); $code = preg_replace('/(<\/[a-zA-Z0-9]+>)/', '$1', $code); $code = preg_replace('/(\/>)/', '${1}', $code); //$code = preg_replace('/(<\/?[a-zA-Z]+ .*?>)/','${1}', $code); $code = preg_replace('/<!DOCTYPE\s+.+?>/','${0}',$code); //注释 $code = str_replace('<!--', '<!--', $code); $code = str_replace('-->', '-->', $code); //--------------以下是专为visualTpl模板文件而定制的高亮代码-------- //block : begin|end $code = preg_replace('/(<!--\s*)(begin|end)(\s+)([a-z_\x7f-\xfe]+)/i','${1}${2}${3}${4}', $code); $code = preg_replace('/(\$[a-z0-9_]+)\s*=\s*(per|on)\(([0-9]+),(\'.*?\'),(\'.*?\')\)/i','${1}=${2}(${3},${4},${5})', $code); //vip : vip|endvip $code = preg_replace('/<!--\s*vip/i', '${0}', $code); $code = preg_replace('/<!--\s*endvip\s*--><\/em><\/font>/i', '${0}', $code); //ssi : #include $code = preg_replace('/<!--\s*#include\s+file.+?-->/i','${0}',$code); //无格式变量 $code = preg_replace('/(\{\$[a-zA-Z0-9_\x7f-\xfe]+\})/','${1}', $code); //格式变量 $code = preg_replace('/(\{\$[a-zA-Z0-9_\x7f-\xfe]+;)([a-zA-Z]+)=\'([^\']+?)\'\}/','${1}${2}=\'${3}\'}', $code); //--------------定制代码结束-------------- //$code = preg_replace_callback('/>[^<]+?'.$code.'
'; }else{ $code = '
  1. ' . str_replace("\n",'
  2. ',$code) . '
'; return $code; } } function close() { //关闭一个curl会话 curl_close($this->ch); } } ?>

二、调用示例:

<?php
// NOTE(review): the first lines of this example were lost when the article was extracted.
// The include and the instantiation are reconstructed; adjust the path to wherever
// curl_http.php actually lives.
require_once 'curl_http.php';

// Create the client (pass true to echo debug output).
$curl = new Curl_HTTP_Client();

// Request headers: empty Referer and a desktop browser User-Agent.
$curl->set_referrer("");
$curl->set_user_agent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101");

// 3. URL to crawl
$url = "https://www.zy13.net/";

// fetch_url() returns the response data, or false on a cURL error
$html = $curl->fetch_url($url);
echo $html;
die;
?>

 

你可能感兴趣的:(PHP)