PHP 正则抓取页面函数整理

整理了一些抓取页面的函数，方便以后使用。

 

 



/**
 * Fetch a page with cURL and return the response body.
 *
 * @param string $url Address of the page to download.
 * @return string|bool The response body, or false when curl_exec() fails.
 */
function getcontents($url) {
    $connectTimeout = 5; // seconds allowed for establishing the connection

    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_RETURNTRANSFER => 1, // hand the body back instead of printing it
        CURLOPT_CONNECTTIMEOUT => $connectTimeout,
        CURLOPT_URL            => $url,
    ));

    return curl_exec($handle);
}



/**
 * Convert the HTML of a table into a two-dimensional array of cell texts.
 *
 * Opening <table>/<tr>/<td> tags are dropped, closing </tr> and </td> tags
 * become row and cell separators, every remaining tag is stripped, and all
 * space characters are removed from the text.
 *
 * @param string $table HTML fragment containing the table markup.
 * @return array List of rows, each a list of cell strings; an empty array
 *               when the input contains no closed row.
 */
function get_td_array($table) {
    // Drop the opening tags together with their attributes.
    $table = preg_replace(
        array("'<table[^>]*?>'si", "'<tr[^>]*?>'si", "'<td[^>]*?>'si"),
        "",
        $table
    );

    // Turn closing tags into separators. Matching is case-insensitive so that
    // </TR> and </TD> are treated the same way as the opening tags above.
    $table = str_ireplace("</tr>", "{tr}", $table);
    $table = str_ireplace("</td>", "{td}", $table);

    // Strip every remaining HTML tag.
    $table = preg_replace("'<[\/\!]*?[^<>]*?>'si", "", $table);

    // Remove each line break that is followed by whitespace (along with that
    // whitespace), then remove all space characters.
    $table = preg_replace("'([\r\n])[\s]+'", "", $table);
    $table = str_replace(" ", "", $table);

    // Whatever follows the last {tr} is not part of a closed row: discard it.
    $rows = explode('{tr}', $table);
    array_pop($rows);

    // Initialised up front so input without any row yields an empty array
    // instead of an undefined variable.
    $td_array = array();
    foreach ($rows as $tr) {
        // Same idea per row: the piece after the last {td} is not a cell.
        $cells = explode('{td}', $tr);
        array_pop($cells);
        $td_array[] = $cells;
    }

    return $td_array;
}





/**
 * Submit form data to a URL with an HTTP POST request via cURL and return
 * the response body (used to simulate a form submission when scraping).
 *
 * @param string $url      Address to post to.
 * @param array  $postData Map of field name => scalar field value.
 * @return string|bool The response body, or false when $url is empty or
 *                     curl_exec() fails.
 */
function curlPost($url,$postData=array()) {
    if (empty($url)) {
        return false;
    }

    // Build the urlencoded body. Field names are encoded as well as values so
    // that a name containing "&", "=" or a space cannot corrupt the body, and
    // implode() avoids having to trim a trailing "&" from a possibly empty string.
    $pairs = array();
    foreach ($postData as $k => $v) {
        $pairs[] = urlencode((string) $k) . '=' . urlencode($v);
    }
    $body = implode('&', $pairs);

    $timeout = 5; // seconds allowed for establishing the connection

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0); // keep response headers out of the result
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);

    return curl_exec($ch);
}



 



// Form fields to submit. The keys '$total', '$pgsz' and '$pg' are single-quoted,
// so they are literal field names rather than interpolated variables.
// NOTE(review): $totalPage, $prepage, $page and $url are not defined in this
// listing — presumably set by the surrounding script; confirm.
$postData = array();
$postData['region_fullname'] = iconv('GBK', 'UTF-8', '黄山'); // convert the region name from GBK to UTF-8
$postData['$total'] = $totalPage;
$postData['$pgsz'] = $prepage;
$postData['$pg'] = $page;

// Post the form and keep the response body.
$contents = curlPost($url, $postData);

  

 

你可能感兴趣的:(PHP)