# Download a web page and extract its basic metadata and plain-text body.
#
# $url    : address of the page to fetch
# $encode : encoding that the extracted text fields are converted to (default UTF-8)
#
# Returns an array with the keys content_type, charset, title, description,
# keywords, body and httpcode.
#   - On a cURL transport error, httpcode is set to 505 and every other field
#     keeps its empty default.
#   - When the response is not "200 text/html", only httpcode and content_type
#     are filled in.
function htmload($url, $encode = 'UTF-8')
{
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): TLS verification is switched off, presumably so that pages
    // with broken certificates can still be fetched - confirm this is intended.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_URL, $url);

    $store = curl_exec($ch);

    // Read everything we need from the handle first, so that it can be closed
    // exactly once no matter which return path is taken below.
    $curlError   = curl_error($ch);
    $httpCode    = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    if ($store === false || $curlError !== '') {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }

    $pageinfo['httpcode']     = $httpCode;
    $pageinfo['content_type'] = $contentType;

    if (intval($httpCode) !== 200 || !preg_match('@text/html@i', $contentType)) {
        return $pageinfo;
    }

    // Take the charset from the Content-Type header when it is present.
    if (preg_match('#charset=["\']?([^\s;"\']+)#i', $contentType, $matches)) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Drop markup that would pollute the extracted text:
    // scripts, <link> tags, comments, style sheets and &nbsp; entities.
    $cleaned = preg_replace(
        array(
            '/<script\b[^>]*>.*?<\/script>/si',
            '/<link\s+[^>]+>/si',
            '/<!--.*?-->/s',
            '/<style\b[^>]*>.*?<\/style>/si',
            '/&nbsp;/i',
        ),
        '',
        (string) $store
    );
    // preg_replace() yields null on a PCRE failure (e.g. backtrack limit);
    // keep the raw page in that case instead of losing it.
    $store = ($cleaned === null) ? (string) $store : $cleaned;

    // Fall back to the charset declared in a <meta> tag.
    if ($pageinfo['charset'] === ''
        && preg_match('@<meta[^>]+charset=["\']?([\w\-]+)@i', $store, $matches)
    ) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // NOTE(review): "/" is removed from description and keywords, as in the
    // original code; the reason is not visible here - confirm it is wanted.
    if (preg_match('/<meta\s+name="description"\s+content="(.*?)"\s?\/?>/i', $store, $matches)) {
        $pageinfo['description'] = get_encoding(str_replace("/", '', trim($matches[1])), $encode);
    }

    if (preg_match('/<meta\s+name="keywords"\s+content="(.*?)"\s?\/?>/i', $store, $matches)) {
        // Converted like the other text fields so the whole result shares one encoding.
        $pageinfo['keywords'] = get_encoding(str_replace("/", '', trim($matches[1])), $encode);
    }

    if (preg_match('/<title\b[^>]*>(.*?)<\/title>/si', $store, $matches)) {
        $pageinfo['title'] = get_encoding(trim($matches[1]), $encode);
    }

    if (preg_match('/<body\b[^>]*>(.*?)<\/body>/si', $store, $matches)) {
        // NOTE(review): addslashes() runs on the bytes before conversion, as in
        // the original; callers appear to expect an escaped body - confirm.
        $pageinfo['body'] = get_encoding(addslashes(clhtml($matches[1])), $encode);
    }

    return $pageinfo;
}

# Reduce an HTML fragment to bare text: removes all whitespace, &nbsp;,
# <script> blocks, every remaining tag and all HTML entities.
#
# $document : HTML fragment
#
# Returns the cleaned string ('' when nothing is left).
function clhtml($document)
{
    $document = trim((string) preg_replace('/\s|&nbsp;/i', '', (string) $document));
    if ($document === '') {
        return $document;
    }

    $search = array(
        '~<script[^>]*?>.*?</script>~si', // JavaScript blocks including their code
        '~<[/!]*?[^<>]*?>~s',             // any remaining tag, closing tag or declaration
        '~&#?\w+;~',                      // named and numeric entities
    );

    $cleaned = preg_replace($search, '', $document);

    // On a PCRE failure fall back to plain tag stripping rather than returning null.
    return ($cleaned === null) ? strip_tags($document) : $cleaned;
}

# Convert a string to the target encoding, detecting the source encoding
# from a fixed candidate list.
#
# $data : text to convert
# $to   : target encoding name
#
# Returns the converted string; the input is returned unchanged when it is
# empty or when none of the candidate encodings matches.
function get_encoding($data, $to)
{
    $data = (string) $data;
    if ($data === '') {
        return $data;
    }

    $encode_arr = array('UTF-8', 'ASCII', 'GBK', 'GB2312', 'BIG5', 'JIS', 'eucjp-win', 'sjis-win', 'EUC-JP');

    // Strict mode: only accept an encoding in which the whole string is valid.
    $encoded = mb_detect_encoding($data, $encode_arr, true);
    if ($encoded === false) {
        return $data;
    }

    return mb_convert_encoding($data, $to, $encoded);
}
代码参考了网络上的一部分实现,但网上的版本基本上都有 BUG,所以我做了一些修改和优化。先留着,说不定将来某一天我自己做搜索引擎的时候能用上,哈哈。