fsockopen采集新浪 http://roll.news.sina.com.cn/news/gnxw/zs-pl/index_1.shtml 新闻列表以及对应文章页

题目:采集地址为 http://roll.news.sina.com.cn/news/gnxw/zs-pl/index_1.shtml ,分析出列表中每篇文章页面的标题、内容、作者,然后插入到表 news 中。

<?php
/**
 * Scrapes the Sina rolling-news list page at
 * http://roll.news.sina.com.cn/news/gnxw/zs-pl/index_1.shtml, follows every
 * article link found in the list, extracts the article body and the editor
 * line, and inserts one row per article through $db->insert().
 *
 * Fixes compared with the previous revision:
 *  - HTTP requests use real CRLF line endings and send all headers before the
 *    terminating blank line, with a single "Connection: Close" header.
 *  - The regular expressions are valid PHP string literals again.
 *  - The response is read with stream_get_contents() under a stream timeout,
 *    so a stalled socket can no longer cause an endless feof()/fgets() loop.
 *  - Response headers are stripped and chunked bodies are decoded before
 *    matching, so framing bytes cannot split a match.
 *  - Each article is fetched from the host named in its own link instead of
 *    always from news.sina.com.cn.
 *  - A page that does not match the article pattern yields empty content and
 *    author values instead of undefined-offset notices.
 */
set_time_limit(0);

include("config.inc.php");
include("class/db.cls.php");
include("class/tools.cls.php");
include("class/tpl.cls.php");

/**
 * Decodes an HTTP/1.1 "chunked" message body.
 *
 * @param string $body Raw body: hex size line, CRLF, data, CRLF, repeated.
 * @return string The concatenated chunk data; decoding stops at the first
 *                zero-size chunk or at the first malformed size line.
 */
function sinaDecodeChunked($body)
{
    $decoded = '';
    $offset = 0;
    $length = strlen($body);

    while ($offset < $length) {
        $lineEnd = strpos($body, "\r\n", $offset);
        if ($lineEnd === false) {
            break;
        }

        // A chunk-size line may carry extensions after ';' - ignore them.
        $sizeParts = explode(';', substr($body, $offset, $lineEnd - $offset), 2);
        $size = hexdec(trim($sizeParts[0]));
        if ($size <= 0) {
            break;
        }

        $decoded .= substr($body, $lineEnd + 2, $size);
        // Skip the chunk data plus the CRLF that terminates it.
        $offset = $lineEnd + 2 + $size + 2;
    }

    return $decoded;
}

/**
 * Performs a plain HTTP GET on port 80 and returns the response body.
 *
 * On connection failure the error number and message are echoed, as the
 * script has always done.
 *
 * @param string $host    Host name to connect to and send in the Host header.
 * @param string $path    Absolute request path, starting with "/".
 * @param int    $timeout Connect and read timeout in seconds.
 * @return string|false Response body without headers, or false on failure.
 */
function sinaHttpGet($host, $path, $timeout = 20)
{
    $fp = fsockopen($host, 80, $errno, $errstr, $timeout);
    if (!$fp) {
        echo "$errno : $errstr";
        return false;
    }
    stream_set_timeout($fp, $timeout);

    // "Connection: Close" makes the server end the stream after the response,
    // which is what lets us read until EOF below.
    $request  = "GET $path HTTP/1.1\r\n";
    $request .= "Host: $host\r\n";
    $request .= "Connection: Close\r\n\r\n";
    fwrite($fp, $request);

    $response = stream_get_contents($fp);
    fclose($fp);
    if ($response === false) {
        return false;
    }

    $headerEnd = strpos($response, "\r\n\r\n");
    if ($headerEnd === false) {
        return false;
    }
    $headers = substr($response, 0, $headerEnd);
    $body = (string) substr($response, $headerEnd + 4);

    if (preg_match('/^Transfer-Encoding:\s*chunked/im', $headers)) {
        $body = sinaDecodeChunked($body);
    }

    return $body;
}

$db = new Mysql();
$db->open();

$listHost = "roll.news.sina.com.cn";
$listPath = "/news/gnxw/zs-pl/index_1.shtml";

$listHtml = sinaHttpGet($listHost, $listPath);
if ($listHtml !== false) {
    // Capture groups: 1 = article URL without the "http://" prefix,
    // 2 = title, 3 = the text between the parentheses in the <span>.
    preg_match_all(
        '|<li><a href="http://(.*)" target="_blank">(.*)</a><span>\((.*)\)</span></li>|U',
        $listHtml,
        $listMatches
    );

    $urlArray   = $listMatches[1];
    $titleArray = $listMatches[2];
    $timeArray  = $listMatches[3];
    $total      = count($urlArray);

    for ($i = 0; $i < $total; $i++) {
        // Split "host/path" at the first slash so that links pointing at
        // other hosts are requested from the right server.
        $newsUrl  = $urlArray[$i];
        $slashPos = strpos($newsUrl, '/');
        if ($slashPos === false) {
            $newsHost = $newsUrl;
            $newsPath = '/';
        } else {
            $newsHost = substr($newsUrl, 0, $slashPos);
            $newsPath = substr($newsUrl, $slashPos);
        }

        $articleHtml = sinaHttpGet($newsHost, $newsPath);
        if ($articleHtml === false) {
            // The fetch failed (a connection error has already been echoed);
            // skip this article.
            continue;
        }

        // Group 1 = article body HTML, group 2 = the editor line including
        // its leading label, which is what has always been stored as author.
        $articleMatch = array();
        preg_match(
            '|<div class="blkContainerSblkCon" id="artibody">(.*)<div style="text-align: right;padding-right:10px;">(编辑:(\S*))|is',
            $articleHtml,
            $articleMatch
        );
        $content = isset($articleMatch[1]) ? $articleMatch[1] : '';
        $author  = isset($articleMatch[2]) ? $articleMatch[2] : '';

        $insertArray = array(
            'url'         => htmlspecialchars($urlArray[$i], ENT_QUOTES),
            'title'       => htmlspecialchars($titleArray[$i], ENT_QUOTES),
            'content'     => htmlspecialchars($content, ENT_QUOTES),
            'author'      => htmlspecialchars($author, ENT_QUOTES),
            'create_time' => htmlspecialchars($timeArray[$i], ENT_QUOTES),
        );

        // NOTE(review): $tb_news is not defined in this script; presumably it
        // comes from config.inc.php - confirm.
        $db->insert($tb_news, $insertArray);
    }
}
?>

你可能感兴趣的:(fsockopen采集新浪 http://roll.news.sina.com.cn/news/gnxw/zs-pl/index_1.shtml 新闻列表以及对应文章页)