正则提取 URL

/**
 * Fetch a page and return the absolute URLs of the anchors (<a href="...">) found in it.
 *
 * Each href is resolved against the page address:
 *  - fragment-only references ("#", "#top") point back at the page itself and are dropped;
 *  - "javascript:" and "mailto:" pseudo-links are dropped;
 *  - any other href that already carries a scheme (http, https, ftp, ...) is kept untouched;
 *  - protocol-relative ("//host/x"), root-relative ("/x"), query-only ("?a=1") and
 *    document-relative ("x/y.html") references are prefixed with the matching part of
 *    the page address.
 *
 * The original letter case of every href is preserved, because URL paths are
 * case-sensitive. Dot segments ("./", "../") are not normalised.
 *
 * @param string $url Address of the page to scan; surrounding whitespace is ignored.
 *
 * @return array Sorted, de-duplicated, zero-indexed list of link URLs. Empty when the
 *               page cannot be fetched or contains no usable links.
 */
function getPageLink($url)
{
    // Fetching from a slow host may take longer than the default execution limit.
    set_time_limit(0);

    $url = trim($url);
    $html = file_get_contents($url);
    if ($html === false) {
        return array();
    }

    // Capture the href value of every <a> tag, whether quoted or not. The look-behind
    // keeps attributes such as "data-href" from being mistaken for "href".
    $pattern = '/<a\s[^>]*?(?<![\w-])href\s*=\s*["\']?([^"\'>\s]+)/i';
    if (!preg_match_all($pattern, $html, $matches)) {
        return array();
    }

    $base = parse_url($url);
    if (!is_array($base)) {
        $base = array();
    }

    // scheme://host[:port] of the page; left empty when the address has no host part.
    $scheme = isset($base['scheme']) ? $base['scheme'] : 'http';
    $origin = '';
    if (isset($base['host'])) {
        $origin = $scheme . '://' . $base['host'];
        if (isset($base['port'])) {
            $origin .= ':' . $base['port'];
        }
    }

    // Directory that document-relative links are resolved against, without trailing slash.
    $basePath = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';
    if (substr($basePath, -1) === '/') {
        // A path ending in "/" already names the directory itself.
        $dir = rtrim($basePath, '/');
    } else {
        // dirname() yields back-slashes on Windows, and "/" or "." for top-level files.
        $dir = str_replace('\\', '/', dirname($basePath));
        if ($dir === '/' || $dir === '.') {
            $dir = '';
        }
    }

    $links = array();
    foreach ($matches[1] as $href) {
        if ($href[0] === '#') {
            continue;
        }

        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $schemeMatch)) {
            $hrefScheme = strtolower($schemeMatch[1]);
            if ($hrefScheme === 'javascript' || $hrefScheme === 'mailto') {
                continue;
            }
            // Already absolute: keep exactly as written.
            $links[] = $href;
            continue;
        }

        if (strncmp($href, '//', 2) === 0) {
            // Protocol-relative: inherits only the scheme of the page.
            $links[] = $scheme . ':' . $href;
        } elseif ($href[0] === '/') {
            $links[] = $origin . $href;
        } elseif ($href[0] === '?') {
            // Query-only: same document, different query string.
            $links[] = $origin . $basePath . $href;
        } else {
            $links[] = $origin . $dir . '/' . $href;
        }
    }

    // De-duplicate after resolution so different spellings of one target collapse.
    $links = array_unique($links);
    sort($links);

    return $links;
}
        print_r(getPageLink(' http://www.hao123.com'));
?>

转载于:https://my.oschina.net/u/817486/blog/82471

你可能感兴趣的:(正则提取url)