PHP 数据采集

<?php


  /**
   * Download a URL over a raw socket using a hand-built HTTP GET request.
   *
   * The URL may start with "http://" or "https://"; a bare "host/path" is
   * treated as plain HTTP. An optional ":port" after the host is honoured.
   * "https" connections go through the "ssl://" socket transport, which needs
   * the openssl extension; without it the connection fails like any other.
   *
   * Return value:
   *  - ''                 if the URL has no host, contains CR/LF, or the
   *                       connection/request fails;
   *  - the response body  for a 200 response (chunked transfer coding is
   *                       decoded first);
   *  - the body of the redirect target for 301/302/303/307/308 responses that
   *    carry a Location header, while redirects remain;
   *  - the raw response (status line and headers included) for anything else.
   *
   * @param string $url          Address to fetch.
   * @param int    $maxRedirects How many redirects may still be followed.
   * @return string
   */
  function get_page_content($url, $maxRedirects = 5){
      $url = trim((string) $url);

      // A CR or LF inside the URL would let the caller inject extra request
      // headers through the request line below, so refuse such input.
      if (preg_match('/[\r\n]/', $url)) {
          return '';
      }

      $scheme = 'http';
      if (preg_match('/^(https?):\/\//i', $url, $schemeMatch)) {
          $scheme = strtolower($schemeMatch[1]);
          $url = substr($url, strlen($schemeMatch[0]));
      }

      // Everything before the first "/" is host[:port]; the rest is the path.
      $segments = explode('/', $url);
      $authority = array_shift($segments);
      $path = '/' . implode('/', $segments);

      $hostParts = explode(':', $authority);
      $host = $hostParts[0];
      if ($host === '') {
          return '';
      }
      $defaultPort = ($scheme === 'https') ? 443 : 80;
      $port = (isset($hostParts[1]) && $hostParts[1] !== '') ? (int) $hostParts[1] : $defaultPort;
      $hostHeader = ($port === $defaultPort) ? $host : $host . ':' . $port;

      // fsockopen() takes $errno/$errstr by reference on its own; the old
      // call-time "&$var" syntax is a compile error since PHP 5.4.
      // "@" only hides the connect warning; failure is handled right below.
      $errno = 0;
      $errstr = '';
      $target = ($scheme === 'https') ? 'ssl://' . $host : $host;
      $fp = @fsockopen($target, $port, $errno, $errstr, 30);
      if ($fp === false) {
          return '';
      }
      stream_set_timeout($fp, 30);

      $request = "GET $path HTTP/1.1\r\n"
          . "Host: $hostHeader\r\n"
          . "Accept: */*\r\n"
          . "Referer:$url\r\n"
          . "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)\r\n"
          . "Connection: Close\r\n\r\n";

      if (fwrite($fp, $request) === false) {
          fclose($fp);
          return '';
      }

      // Read until the server closes the connection. A "while ($s = fread())"
      // loop would stop early on a chunk that happens to be the string "0".
      $response = stream_get_contents($fp);
      fclose($fp);
      if ($response === false) {
          return '';
      }

      $parts = explode("\r\n\r\n", $response, 2);
      $headers = $parts[0];
      $body = isset($parts[1]) ? $parts[1] : '';

      if (!preg_match('/^HTTP\/\d\.\d (\d{3})/i', $headers, $statusMatch)) {
          return $response;
      }
      $status = (int) $statusMatch[1];

      // Redirects: only look for Location inside the header section, and trim
      // it (the header value normally starts with a space).
      if (in_array($status, array(301, 302, 303, 307, 308), true)
          && $maxRedirects > 0
          && preg_match('/^Location:(.*)$/im', $headers, $locationMatch)
      ) {
          $location = trim($locationMatch[1]);
          if ($location !== '') {
              $next = _gpc_resolve_redirect($location, $scheme, $authority, $path);
              return get_page_content($next, $maxRedirects - 1);
          }
      }

      if ($status === 200) {
          // An HTTP/1.1 request may be answered with chunked framing.
          if (preg_match('/^Transfer-Encoding:[ \t]*chunked/im', $headers)) {
              $body = _gpc_decode_chunked($body);
          }
          return $body;
      }

      return $response;
  }

  /**
   * Turn a redirect Location value into an absolute URL for get_page_content().
   *
   * Handles absolute URLs, scheme-relative ("//host/x"), host-relative ("/x")
   * and path-relative ("x") references. $path always starts with "/".
   */
  function _gpc_resolve_redirect($location, $scheme, $authority, $path){
      if (preg_match('/^https?:\/\//i', $location)) {
          return $location;
      }
      if (substr($location, 0, 2) === '//') {
          return $scheme . ':' . $location;
      }

      $base = $scheme . '://' . $authority;
      if (substr($location, 0, 1) === '/') {
          return $base . $location;
      }

      // Path-relative: resolve against the directory of the current path,
      // ignoring any query string.
      $pathParts = explode('?', $path, 2);
      $directory = substr($pathParts[0], 0, strrpos($pathParts[0], '/') + 1);
      return $base . $directory . $location;
  }

  /**
   * Decode a body sent with "Transfer-Encoding: chunked".
   *
   * Each chunk is "<hex size>[;extensions]\r\n<data>\r\n"; a size of 0 ends
   * the body. If the framing is not valid the input is returned unchanged.
   */
  function _gpc_decode_chunked($body){
      $decoded = '';
      $offset = 0;
      $length = strlen($body);

      while ($offset < $length) {
          $lineEnd = strpos($body, "\r\n", $offset);
          if ($lineEnd === false) {
              break;
          }

          $sizeLine = substr($body, $offset, $lineEnd - $offset);
          $sizeParts = explode(';', $sizeLine, 2);
          $sizeHex = trim($sizeParts[0]);
          if (!preg_match('/^[0-9a-f]+$/i', $sizeHex)) {
              return $body;
          }

          $size = (int) hexdec($sizeHex);
          if ($size === 0) {
              break;
          }

          $decoded .= substr($body, $lineEnd + 2, $size);
          // Skip the chunk data plus its trailing CRLF.
          $offset = $lineEnd + 2 + $size + 2;
      }

      return $decoded;
  }

/**
 * Scrape one page of the csto.com member listing and collect e-mail addresses.
 *
 * For each member entry on the listing page, the member's profile page is
 * fetched and the first e-mail address found inside its "contbox" section is
 * recorded. Entries whose name, profile section or address cannot be found
 * are skipped.
 *
 * @param int|string $page Listing page, appended to the "pg:" URL segment.
 * @return array List of array("name" => ..., "email" => ...) entries.
 */
function go($page){
    // The page value may come straight from the request, so encode it before
    // it becomes part of the request line (blocks CR/LF and "/" injection).
    $pageSegment = is_scalar($page) ? rawurlencode((string) $page) : '';
    $listing = get_page_content("http://www.csto.com/member/pg:" . $pageSegment);

    $members = array();
    if (!preg_match_all('/<span class="authentication fr">(.*?)<span class="status">/s', $listing, $entries)) {
        return $members;
    }

    foreach ($entries[1] as $entryHtml) {
        // The first link in the entry carries the member name.
        if (!preg_match('/ <a href=".*?">(.*?)<\/a>/s', $entryHtml, $nameMatch)) {
            continue;
        }
        $name = trim($nameMatch[1]);
        if ($name === '') {
            continue;
        }

        // rawurlencode(): the name is a path segment, where a space must be
        // sent as %20 rather than "+".
        $profile = get_page_content("http://www.csto.com/u/" . rawurlencode($name) . "/profile");

        if (!preg_match('/<div class="contbox">(.*?)<div class="talent_right">/s', $profile, $boxMatch)) {
            continue;
        }
        if (!preg_match('/[\w.%-]+@[\w.-]+\.[a-z]{2,4}/', $boxMatch[1], $emailMatch)) {
            continue;
        }

        $members[] = array("name" => $name, "email" => $emailMatch[0]);
    }

    return $members;
}
// Script entry point: print a leading comma followed by the JSON-encoded
// result of scraping the requested listing page.
// NOTE(review): the leading "," presumably lets a client join several
// responses into one list; confirm against the consumer before changing it.
// Fall back to page 1 when "page" is absent or not a scalar (e.g. ?page[]=1).
$page = (isset($_GET['page']) && is_scalar($_GET['page'])) ? $_GET['page'] : 1;
echo ",";
echo json_encode(go($page));
?>

 

你可能感兴趣的:(PHP)