php采集远程文章简单类

<?php

/**
 * Minimal remote-article scraper: downloads a page, cuts out the region
 * between two marker strings and extracts the links found inside it.
 *
 * @author Milkcy
 * @copyright            (C) 2012-2015 TCCMS.COM
 * @lastmodify             2012-07-10 14:00
 */
class gather {

    /** @var string Not read or written by any method of this class. */
    public $pagestring = '';

    /** @var mixed Copy of the global $db taken at construction time. */
    private $db;

    /**
     * Stores whatever the global $db currently holds (null when it is unset).
     */
    public function __construct() {
        global $db;
        $this->db = $db;
    }

    /**
     * Downloads a URL and returns its body with surrounding whitespace removed.
     *
     * Uses cURL (following redirects) when the extension is loaded and
     * falls back to file_get_contents() otherwise.
     *
     * @param string $url Address to fetch; surrounding whitespace is ignored.
     * @return string Trimmed response body, or '' when the URL is empty or
     *                the download fails.
     */
    public function geturlfile($url) {
        $url = trim($url);
        if ($url === '') {
            // file_get_contents('') throws on PHP 8; there is nothing to fetch anyway.
            return '';
        }

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both transports return false on failure; report that as an empty body.
        return is_string($content) ? trim($content) : '';
    }

    /**
     * Extracts every <a> link whose text contains no nested tags.
     *
     * @param string $code HTML fragment to scan.
     * @return array array('name' => link texts, 'url' => href values),
     *               both lists in document order and index-aligned.
     */
    public function get_all_url($code) {
        // href may be double-quoted, single-quoted or unquoted; the link text
        // must not contain '>' (i.e. no nested markup).
        $pattern = '/<a.+?href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/is';
        preg_match_all($pattern, $code, $arr);

        return array(
            'name' => isset($arr[2]) ? $arr[2] : array(),
            'url'  => isset($arr[1]) ? $arr[1] : array(),
        );
    }

    /**
     * Returns the text that lies between two marker strings.
     *
     * Only the text between the first and second occurrence of $start is
     * considered; it is then cut at the first occurrence of $end.
     *
     * @param string $str   Text to search.
     * @param string $start Opening marker (trimmed before use).
     * @param string $end   Closing marker (trimmed before use).
     * @return string $str unchanged when either marker is empty, '' when
     *                $start does not occur, otherwise the enclosed text.
     */
    public function get_sub_content($str, $start, $end) {
        $start = trim($start);
        $end = trim($end);
        if ($start == '' || $end == '') {
            return $str;
        }

        $pieces = explode($start, $str);
        if (!isset($pieces[1])) {
            // Start marker not found: nothing to extract.
            return '';
        }

        $pieces = explode($end, $pieces[1]);
        return $pieces[0];
    }

    /**
     * Debug helper: echoes var_dump($var) wrapped in a styled <div><pre> box.
     *
     * The dump is not HTML-escaped, so markup inside $var is rendered by the browser.
     *
     * @param mixed $var Value to dump.
     * @return void
     */
    public function vd($var) {
        echo '<div style="border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;">' . "\r\n";
        echo '<pre style="font-family:Arial,Vrinda;font-size:14px;">' . "\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }

}



?>



<?php

// Demo: scrape the first article linked from a NetEase news list page and dump its body.

// Normalise Windows backslashes so ROOT_PATH always uses forward slashes.
define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
require_once ROOT_PATH."/gather.class.php";

// Remote fetches can be slow; lift the execution time limit.
set_time_limit(0);
header("Content-type: text/html; charset=gb2312");

// Target list page.
$url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';

// Create the scraper.
$gather = new gather();

// Download the list page HTML.
$html = $gather->geturlfile($url);

// Markers delimiting the article list inside the page.
$start = '<div class="bd clearfix">';
$end = '<div class="pages-1 mt25">';

// Collect the article URLs and titles found between the markers.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);

// Uncomment to inspect the collected links.
//$gather->vd($newsAry);

// Stop early when the download failed or the page layout changed,
// instead of reading an undefined index and fetching an empty URL.
if (empty($newsAry['url'])) {
    exit('No article links found in the list page.');
}
$tarGetUrl = $newsAry['url'][0];

// Download the first article's HTML.
$html = $gather->geturlfile($tarGetUrl);

// Markers delimiting the article body inside the page.
$start = '<div id="endText">';
$end = '<span class="cDGray right" style="white-space:nowrap;">';

// Cut the article body out of the page.
$code = $gather->get_sub_content($html, $start, $end);

// Embedded ad iframe and trailing site icon to strip from the body.
$killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
$killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
$code = str_replace($killHtml, "", $code);
$code = str_replace($killHtml2, "", $code);

// Show the cleaned article body.
$gather->vd($code);

?>

//该片段来自于http://outofmemory.cn

 

 

PHP 文章采集正则代码

<?php

/**
 * Downloads a page with cURL and returns its body.
 *
 * Redirects are followed and the connection attempt is limited to 10 seconds.
 *
 * @param string $url Address to fetch.
 * @return string Response body with surrounding whitespace trimmed.
 */
function getwebcontent($url){
    $connectTimeout = 10;

    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => $connectTimeout,
        CURLOPT_FOLLOWLOCATION => 1,
    ));

    $body = curl_exec($handle);
    curl_close($handle);

    return trim($body);
}





// Fetch the list page and collect each article's title and URL.
$string = getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2');

// Initialise every bucket up front so the loops below are safe when nothing matches.
$article = array('title' => array(), 'link' => array(), 'content' => array());

// Each <li> link yields the article path (group 1) and its title (group 2).
// The groups are bounded ([^"]* and lazy .*?) so that extra attributes or
// several links on one line cannot be swallowed into a single match.
preg_match_all('/<li><a href="\/learn\/article\/([^"]*)"[^>]*>(.*?)<\/a>/', $string, $out, PREG_SET_ORDER);
foreach ($out as $match) {
    $article['title'][] = $match[2];
    $article['link'][] = "http://www.***.com/learn/article/" . $match[1];
}

// Download every article and keep the pagenum_0 <div> as its content.
foreach ($article['link'] as $key => $link) {
    $content_html = getwebcontent($link);
    if (preg_match('/<div id=pagenum_0(.*)>[\s\S]*?<\/div>/', $content_html, $matches)) {
        $article['content'][$key] = $matches[0];
    } else {
        // Download failed or the page has no such block: store empty content.
        $article['content'][$key] = '';
    }
}

// Titles become file names. Scraped text is untrusted, so replace characters
// that are path separators or invalid in file names while the text is still
// UTF-8 (in GBK the same byte values can be part of a multi-byte character).
// Then convert to GBK: per the original author, the files could not be saved
// without this conversion.
foreach ($article['title'] as $key => $value) {
    $safe = trim(preg_replace('/[\\\\\/:*?"<>|\x00-\x1F]+/', '_', $value));
    $converted = ($safe === '') ? false : iconv('utf-8', 'gbk', $safe);
    // Fall back to a generated name when the title is empty or cannot be converted.
    $article['title'][$key] = ($converted === false || $converted === '')
        ? 'article_' . $key
        : $converted;
}

// Write each article to "<title>.txt" in the current directory.
foreach ($article['title'] as $i => $title) {
    file_put_contents($title . '.txt', $article['content'][$i]);
}

?> 

 

你可能感兴趣的:(PHP)