2019独角兽企业重金招聘Python工程师标准>>>
部分源码来自:http://www.361way.com/php-curl-url/2779.html
]+))'isx", $document, $links);
// catenate the non-empty matches from the conditional subpattern
while (list($key, $val) = each($links[2])) {
if (!empty($val))
$match[] = $val;
} while (list($key, $val) = each($links[3])) {
if (!empty($val))
$match[] = $val;
}
// return the links
return $match;
}
/*===================================================================*\
Function: _expandlinks
Purpose: expand each link into a fully qualified URL
Input: $links the link (string) or links (array of strings) to qualify
$URI the full URI of the page the links were found on; its
scheme, host and directory are used as the base
Output: $expandedLinks the expanded link(s), same shape as $links
Notes: - links starting with http://, https:// or mailto: are treated
as already absolute and never get the base prepended
- a trailing "name.ext" segment of $URI is treated as the page
file and dropped; a last segment without a dot is kept as a
directory
- "/./" and "/segment/../" are collapsed by one regex pass
each, so deeply nested "../" chains are only partly resolved
\*===================================================================*/
function _expandlinks($links,$URI){
    $URI_PARTS = parse_url($URI);
    // parse_url() may return false or omit "host"; fall back to "" instead
    // of raising an undefined-index notice.
    $host = (is_array($URI_PARTS) && isset($URI_PARTS["host"])) ? $URI_PARTS["host"] : "";

    // Everything before the first "?" (the query string is irrelevant here).
    $base = preg_match("/^[^\?]+/", $URI, $found) ? $found[0] : "";

    // Split "scheme://authority" from the path so the file-name stripping
    // below can only touch the path. Run over the whole URI it would turn
    // "http://example.com" into "http:" because "/example.com" looks like
    // a "name.ext" segment.
    $authority = "";
    $path = $base;
    if (preg_match('|^([a-z][a-z0-9+.\-]*://[^/]*)(.*)$|is', $base, $pieces)) {
        $authority = $pieces[1];
        $path = $pieces[2];
    }
    // Drop the page file ("/name.ext") to obtain the containing directory.
    $path = preg_replace("|/[^\/\.]+\.[^\/\.]+$|", "", $path);
    // Drop one trailing slash; the replacements below add their own.
    $match = preg_replace("|/$|", "", $authority.$path);

    $match_part = parse_url($match);
    $root_scheme = (is_array($match_part) && isset($match_part["scheme"])) ? $match_part["scheme"] : "";
    $root_host = (is_array($match_part) && isset($match_part["host"])) ? $match_part["host"] : "";
    $match_root = $root_scheme."://".$root_host;

    // The rules are applied in order to every link:
    //  1. strip "http://<host>" when followed by "/" (the lookahead keeps
    //     "http://example.com.cn/" or "http://example.com:8080/" intact
    //     when the host is "example.com")
    //  2. root-relative link ("/x")   -> prefix with scheme://host
    //  3. any other non-absolute link -> prefix with the base directory
    //  4. collapse "/./"
    //  5. collapse "/segment/../"
    $search = array(
        "|^http://".preg_quote($host, "|")."(?=/)|i",
        "|^(\/)|i",
        "|^(?!https?://)(?!mailto:)|i",
        "|/\./|",
        "|/[^\/]+/\.\./|"
    );
    $replace = array(
        "",
        $match_root."/",
        $match."/",
        "/",
        "/"
    );

    $expandedLinks = preg_replace($search, $replace, $links);
    return $expandedLinks;
}
/**
 * Request the headers of $url and print whether the link is alive.
 *
 * Prints "\n<key>--- Not Found \n" when the final response status is 404
 * or when no HTTP response could be obtained at all, and
 * "\n<key>---- ok \n" otherwise.
 *
 * @param string $url fully qualified URL to probe
 * @param mixed  $key label echoed in front of the verdict (the caller
 *                    passes the link's array key)
 * @return void
 */
function _checkOk($url, $key){
    // get_headers() returns false when the request fails; "@" keeps its
    // warning out of the report and the failure is handled below.
    $headers = @get_headers($url);

    // get_headers() follows redirects and returns the header lines of every
    // hop, so the last "HTTP/x.y NNN" line carries the final status.
    // Matching on the code alone also covers HTTP/1.1 and HTTP/2 replies,
    // not just the literal "HTTP/1.0 404 Not Found".
    $status = 0;
    if (is_array($headers)) {
        foreach ($headers as $line) {
            if (is_string($line) && preg_match('#^HTTP/\S+\s+(\d{3})#i', $line, $m)) {
                $status = (int) $m[1];
            }
        }
    }

    // $status stays 0 when no status line was seen (unreachable host,
    // invalid URL); such a link is reported as not found rather than ok.
    if ($status === 0 || $status === 404) {
        echo "\n".$key;
        echo '--- Not Found ';
        echo "\n";
    } else {
        echo "\n".$key;
        echo "---- ok \n";
    }
}
/*
 * Fetch the page at $url with cURL, extract all of its links, expand them
 * to absolute URLs and report for each one whether it is reachable.
 */
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
// Include the response headers in the returned string.
curl_setopt($ch, CURLOPT_HEADER, 1);
// The body is what the links are extracted from, so CURLOPT_NOBODY stays off.
// curl_setopt($ch, CURLOPT_NOBODY, 1);
// Return the response instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$html = curl_exec($ch);
$info = curl_getinfo($ch);
if ($html === false) {
    echo "cURL Error: " . curl_error($ch);
}
curl_close($ch);

// A failed transfer yields false, not markup; there is nothing to parse.
$linkarr = ($html === false) ? array() : _striplinks($html);

// Start from an empty list so array_unique() and the report loop below
// always receive an array, even when no links were found.
$linkresult = array();
// NOTE(review): $host is not defined in the visible part of this file, and
// _expandlinks() expects the full page URI as its base. Confirm whether
// $url was intended here.
if (is_array($linkarr)) {
    foreach ($linkarr as $k => $v) {
        $linkresult[$k] = _expandlinks($v, $host);
    }
}
// Check every distinct URL only once (keys of first occurrences are kept).
$linkresult = array_unique($linkresult);
echo '';
foreach ($linkresult as $key => $value){
    echo $key."--".$value;
    _checkOk($value,$key);
}
echo "\n ==============RUN OVER=============== \n";
// Disabled debug dump of all links. It was split over three lines with only
// the first one commented out, which left the rest as a parse error.
//printf("此页面的所有链接为: %s \n", var_export($linkresult , true));