php实现的简单的csdn博客文章抓取

 

纯粹是php、regular expression(正则表达式)练手之用,高手请绕道走

效果图:

php实现的简单的csdn博客文章抓取_第1张图片

怎一个丑字了得!

代码:

/////list.php

<!-- list.php: asks for a CSDN blog name and submits it via GET to get_csdnblog.php -->
<html>
<head>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8">
    <title>for list csdn blog entry</title>
</head>
<!-- The opening body tag was missing although the closing one was present. -->
<body>
    <h1 align="center"> CSDN BLOG ENTRY LIST </h1>
    <form method="GET" action="get_csdnblog.php">
        <br /> <br /> <br />
        <table cellspacing="0" cellpadding="0" width="400" align="center">
            <tr>
                <td align="center">
                    <!-- "blogname" is the parameter get_csdnblog.php reads -->
                    <input maxlength="256" size="50" name="blogname" type="text" value="yysdsyl">
                    <input type="submit" value=" List ">
                </td>
            </tr>
        </table>
    </form>
</body>
</html>

////////get_csdnblog.php

<?php
/**
 * get_csdnblog.php - list the entries of a CSDN blog, one archive month per page.
 *
 * Request parameters:
 *   blogname (POST or GET) - blog account name; must match [A-Za-z0-9_-]+,
 *                            the same character set the archive-link pattern
 *                            below accepts.
 *   page     (GET)         - 1-based index into the list of monthly archives;
 *                            defaults to 1, out-of-range values fall back to 1.
 *
 * Flow: download http://blog.csdn.net/<blogname>, extract the <title> and the
 * monthly archive links with regular expressions, print the month navigation,
 * then download the selected archive page and print its entries as a table.
 */

set_time_limit(0);
header("Content-Type:text/html; charset=utf-8");

/**
 * Escape a value for use in HTML text or in a quoted attribute.
 *
 * Existing entities are left alone (double_encode = false) because the
 * scraped fragments are already HTML-encoded.
 *
 * @param string $text
 * @return string
 */
function html_escape($text)
{
    return htmlspecialchars($text, ENT_QUOTES, 'UTF-8', false);
}

/**
 * Download the body of a URL.
 *
 * Uses cURL when the extension is loaded, otherwise file_get_contents().
 * Terminates the script with an error message when nothing could be fetched
 * (false or empty content).
 *
 * @param string $url
 * @return string
 */
function get_url_content($url)
{
    if (extension_loaded('curl')) {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $content = curl_exec($ch);
        curl_close($ch);
    } else {
        $content = file_get_contents($url);
    }

    if (!$content) {
        die('get ' . html_escape($url) . ' content error.');
    }

    return $content;
}

////// csdn blog
// POST takes precedence over GET; a missing parameter no longer raises a notice.
$blogname = '';
if (!empty($_POST['blogname'])) {
    $blogname = $_POST['blogname'];
} elseif (!empty($_GET['blogname'])) {
    $blogname = $_GET['blogname'];
}

// The name is interpolated into the fetch URL and into the navigation links,
// so reject anything outside the character set used by the patterns below.
if (!is_string($blogname) || !preg_match('/\A[A-Za-z0-9_-]+\z/', $blogname)) {
    die("invalid blog name");
}

$blog_url = "http://blog.csdn.net/$blogname";
$blog_content = get_url_content($blog_url);

////// blog title
$reg_pattern = '#<title>\s*(.*?)\s*</title>#s';
if (preg_match($reg_pattern, $blog_content, $matches)) {
    $blog_title = $matches[1];
    echo '<h1>CSDN BLOG : <br /><a href="' . html_escape($blog_url) . '" target="_blank">'
        . html_escape($blog_title) . '</a></h1>';
}

///// blog archive by month
// Group 1: archive path (/name/archive/YYYY/MM.aspx); group 2: label such as "2009年05月(3)".
$reg_pattern = '#<li><a\s+href="(/[a-zA-Z0-9_-]+/archive/\d+/\d+\.aspx)">(\d+年\d+月\(\d+\))</a></li>#';
if (preg_match_all($reg_pattern, $blog_content, $matches)) {
    $blog_archive = $matches[1];
    $blog_yy_mm = $matches[2];
} else {
    die("get blog archive error");
}

//// pages
$count = count($blog_archive);
$page = empty($_GET['page']) ? 1 : intval($_GET['page']);
if ($page < 1 || $page > $count) {
    // Guard against indexing outside the archive list.
    $page = 1;
}

$php_self = $_SERVER['PHP_SELF'] ? $_SERVER['PHP_SELF'] : $_SERVER['SCRIPT_NAME'];
$php_self .= '?blogname=' . urlencode($blogname) . '&page=';

// NOTE(review): the separators in the published listing appear as plain
// whitespace; they are assumed to have been &nbsp; entities - confirm.
$separator = '&nbsp;&nbsp;&nbsp;&nbsp;';
foreach ($blog_yy_mm as $index => $page_title) {
    $perpage = $index + 1;
    if ($perpage == $page) {
        // The current month is shown as plain text, not as a link.
        echo html_escape($page_title) . $separator;
    } else {
        echo '<a href="' . html_escape($php_self . $perpage) . '">'
            . html_escape($page_title) . '</a>' . $separator;
    }
}
echo "<br /><br /><br />";

//// list blog entries
$archive_url = 'http://blog.csdn.net' . $blog_archive[$page - 1];
$archive_content = get_url_content($archive_url);

// Flatten the page to one line, then start a new line before every
// "阅读全文" marker. "." does not match a newline, so each ".+" in the entry
// pattern below cannot run across one of these markers.
$order = array("\r\n", "\n", "\r");
$replace = '';
$archive_content = str_replace($order, $replace, $archive_content);
$archive_content = str_replace("阅读全文", "\n阅读全文", $archive_content);

//// list blog entry
// Group 1: entry path; group 2: entry title; group 3: "发表于 @ <timestamp>".
// NOTE(review): the listing shows a literal ">" after 阅读全文; the page may
// have used "&gt;" instead, so both spellings are accepted - confirm.
$reg_pattern = '#<div\s+class="user_article">.+<a\s+href="(/[a-zA-Z0-9_-]+/archive/\d\d\d\d/\d\d/\d\d/\d+\.aspx)">(.+)</a></h1>'
    . '.+\s+阅读全文(?:>|&gt;)</a></code>.+(\s*发表于 @ \d\d\d\d年\d\d月\d\d日 \d\d:\d\d:\d\d) #';
if (preg_match_all($reg_pattern, $archive_content, $matches)) {
    $count = count($matches[0]);
    echo "<table>";
    for ($i = 0; $i < $count; $i++) {
        $entry_url = 'http://blog.csdn.net' . $matches[1][$i];
        // Titles come from a remote page: drop any markup and escape the rest.
        $entry_title = html_escape(strip_tags($matches[2][$i]));
        $entry_date = html_escape($matches[3][$i]);
        echo '<tr><td><a href="' . html_escape($entry_url) . '" target="_blank">'
            . $entry_title . '</a></td><td>' . $entry_date . '</td></tr>';
    }
    echo "</table>";
} else {
    die("list entry error");
}
?>

你可能感兴趣的:(php实现的简单的csdn博客文章抓取)