部分源码来自:http://www.361way.com/php-curl-url/2779.html
<?php
// URL of the page whose links should be checked.
$viewUrl = "";
// Base URL used to complete relative links (e.g. "http://www.example.com").
// When left empty, $viewUrl is used as the base instead.
$host = "";

/**
 * Extract the href value of every <a> tag in an HTML document.
 *
 * Quoted href values are returned first, followed by unquoted ones (the
 * order the original implementation produced).
 *
 * @param mixed $document HTML source; anything that is not a non-empty
 *                        string yields an empty list
 * @return array list of raw (unexpanded) href values
 */
function _striplinks($document)
{
    if (!is_string($document) || $document === '') {
        return [];
    }

    // Group 2 captures a quoted href value, group 3 an unquoted one.
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1) (.*?)\1 | ([^\s>]+))~isx';
    if (!preg_match_all($pattern, $document, $links)) {
        return [];
    }

    $match = [];
    foreach ([2, 3] as $group) {
        foreach ($links[$group] as $val) {
            // Compare against '' rather than using empty(), so that a link
            // of "0" is not silently dropped.
            if ($val !== '') {
                $match[] = $val;
            }
        }
    }

    return $match;
}

/**
 * Expand each link into a fully qualified URL.
 *
 * Links that already carry a scheme (http:, https:, mailto:, ...) are kept,
 * protocol-relative links ("//host/x") receive the base scheme, root-relative
 * links ("/x") are attached to scheme://host[:port] of the base, and all other
 * links are attached to the directory of the base document. Afterwards "/./"
 * and "/segment/../" sequences are collapsed in a single pass.
 *
 * The last path segment of $URI is treated as a file name (and removed) only
 * when it has the form "name.ext"; otherwise it is treated as a directory.
 *
 * @param string|array $links the link, or array of links, to qualify
 * @param string       $URI   the full URI to take the base from
 * @return string|array the expanded link(s); returned unchanged when $URI
 *                      has no scheme or host to resolve against
 */
function _expandlinks($links, $URI)
{
    $parts = parse_url((string) $URI);
    if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
        return $links;
    }

    $scheme = $parts['scheme'];
    $root = $scheme . '://' . $parts['host'];
    if (isset($parts['port'])) {
        $root .= ':' . $parts['port'];
    }

    // Work on the path component only, so that a bare host such as
    // "example.com" can never be mistaken for a "name.ext" file name.
    $dir = isset($parts['path']) ? $parts['path'] : '';
    $dir = preg_replace('~/[^/.]+\.[^/.]+$~', '', $dir);
    $dir = preg_replace('~/$~', '', $dir);
    $base = $root . $dir;

    $expand = function ($link) use ($scheme, $root, $base) {
        $link = (string) $link;

        if (strncmp($link, '//', 2) === 0) {
            // Protocol-relative link: inherit the scheme of the base.
            $link = $scheme . ':' . $link;
        } elseif (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $link) !== 1) {
            if ($link !== '' && $link[0] === '/') {
                $link = $root . $link;
            } else {
                $link = $base . '/' . $link;
            }
        }

        $link = preg_replace('~/\./~', '/', $link);

        return preg_replace('~/[^/]+/\.\./~', '/', $link);
    };

    // array_map() keeps the keys of a single input array.
    return is_array($links) ? array_map($expand, $links) : $expand($links);
}

/**
 * Probe a URL and print whether it is reachable.
 *
 * Prints "Not Found" when the final response status is 404 (any HTTP
 * version), "Unreachable" when no HTTP status line could be obtained, and
 * "ok" for every other status.
 *
 * @param string     $url the URL to probe
 * @param int|string $key label printed in front of the result
 * @return void
 */
function _checkOk($url, $key)
{
    // get_headers() raises a warning and returns false on failure; the
    // warning is suppressed because the failure is reported below.
    $headers = @get_headers($url);

    $status = null;
    if (is_array($headers)) {
        foreach ($headers as $line) {
            // With redirects there are several status lines; the last one wins.
            if (is_string($line) && preg_match('~^HTTP/\S+\s+(\d{3})~i', $line, $m) === 1) {
                $status = (int) $m[1];
            }
        }
    }

    echo "\n" . $key;
    if ($status === 404) {
        echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Not Found </div>';
        echo "\n";
    } elseif ($status === null) {
        echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Unreachable </div>';
        echo "\n";
    } else {
        echo "---- ok \n";
    }
}

/**
 * Fetch a page with cURL, extract its links and check every one of them.
 *
 * @param string $viewUrl URL of the page to scan; nothing is fetched when empty
 * @param string $host    base URL for completing relative links; falls back
 *                        to $viewUrl when empty
 * @return void
 */
function _runLinkCheck($viewUrl, $host)
{
    if (!is_string($viewUrl) || $viewUrl === '') {
        echo "Please set \$viewUrl to the page that should be checked.\n";
        return;
    }

    $ch = curl_init();
    if ($ch === false) {
        echo "cURL could not be initialised.\n";
        return;
    }

    curl_setopt($ch, CURLOPT_URL, $viewUrl);
    // Return the response instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $html = curl_exec($ch);
    $error = ($html === false) ? curl_error($ch) : '';
    if (PHP_VERSION_ID < 80000) {
        // Since PHP 8.0 the handle is an object and curl_close() has no effect.
        curl_close($ch);
    }

    if ($html === false) {
        echo "cURL \nError: " . $error;
        return;
    }

    $base = (is_string($host) && $host !== '') ? $host : $viewUrl;

    $linkresult = [];
    foreach (_striplinks($html) as $k => $v) {
        $linkresult[$k] = _expandlinks($v, $base);
    }
    $linkresult = array_unique($linkresult);

    echo '<pre>';
    foreach ($linkresult as $key => $value) {
        // Links come from a remote page, so escape them before printing.
        echo $key . "--" . htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        if (preg_match('~^https?://~i', (string) $value) === 1) {
            _checkOk($value, $key);
        } else {
            // mailto:, javascript:, tel: ... cannot be probed over HTTP.
            echo "\n" . $key;
            echo "---- skipped (not an HTTP link) \n";
        }
    }
    echo "\n ==============RUN OVER=============== \n";
}

/*
 * Fetch the page with cURL and check all of its links.
 */
_runLinkCheck($viewUrl, $host);