Scraping KuGou Featured Playlists with PHPCrawler

一、Introduction and Installation of PHPCrawler

First, what exactly is "crawling"?
Crawling is what a web crawler, commonly known as a web spider, does. Crawlers are a key component of search engines: they fetch and download web pages and other information from the Internet according to certain rules and algorithms. A typical crawler starts from a start URL, crawls it according to some strategy, adds any newly discovered URLs to a crawl queue, and then begins a new round of crawling, repeating the process until the crawl is finished.
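To make the queue-driven idea concrete, here is a minimal sketch of such a crawl loop. It is an illustration only, not part of PHPCrawler; the helpers fetch_page() and extract_links() are hypothetical placeholders for "download a page" and "parse out its links".

<?php
// Minimal breadth-first crawl loop (illustration only).
$queue   = array("http://www.example.com/");   // start URL
$visited = array();

while (!empty($queue)) {
    $url = array_shift($queue);                // take the next URL from the queue
    if (isset($visited[$url])) continue;       // skip URLs already crawled
    $visited[$url] = true;

    $html = fetch_page($url);                  // hypothetical: download the page
    foreach (extract_links($html) as $link) {  // hypothetical: parse out new URLs
        if (!isset($visited[$link])) {
            $queue[] = $link;                  // enqueue for a later round
        }
    }
}
?>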
PHPCrawler is an open-source crawler framework whose source code is hosted on SourceForge; download it from the project page there, choosing the release that matches the PHP version installed on your machine. After downloading, extract the archive into your web server's document root, then copy example.php and rename the copy; the crawler script below is written in that renamed file.
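As a quick check that the library is in place, a short script like the following should run without errors. This is only a sketch: the directory name "PHPCrawl" is an assumption, and the include path must match wherever you actually extracted the archive (the class file ships as libs/PHPCrawler.class.php inside it).

<?php
// Installation smoke test: include the main class and create an instance.
// Adjust the path to match your actual extraction directory.
include("PHPCrawl/libs/PHPCrawler.class.php");

$crawler = new PHPCrawler();
echo "PHPCrawler class loaded successfully\n";
?>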

二、Complete Source Code

").
    if (PHP_SAPI == "cli") $lb = "\n";
    else $lb = "
"; // Print the URL and the HTTP-status-Code echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb; // Print the refering URL echo "Referer-page: ".$DocInfo->referer_url.$lb; // Print if the content of the document was be recieved or not if ($DocInfo->received == true) echo "Content received: ".$DocInfo->bytes_received." bytes".$lb; else echo "Content not received".$lb; // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example //echo $DocInfo->source; //echo $lb; $url=$DocInfo->url; $pat="/http:\/\/www\.kugou\.com\/yy\/special\/single\/\d+\.html/"; if(preg_match($pat,$url)>0){ $this->parseSonglistDetail($DocInfo); } flush(); } public function parseSonglistDetail($DocInfo){ $songlistArr=array(); $songlistArr['raw_url']=$DocInfo->url; $content=$DocInfo->content; //名称 $matches=array(); $pat="/名称:<\/span>([^(/"; $res=preg_match($pat, $content,$matches); if($res>0){ $songlistArr['title']=$matches[1]; }else{ $songlistArr['title']=""; print "error:get title fail
"; } //创建人 $matches=array(); $pat="/创建人:<\/span>([^(/"; $res=preg_match($pat, $content,$matches); if($res>0){ $songlistArr['creator']=$matches[1]; }else{ $songlistArr['creator']=""; print "error:get creator fail
"; } //创建时间 $matches=array(); $pat="/更新时间:<\/span>([^(/"; $res=preg_match($pat, $content,$matches); if($res>0){ $songlistArr['create_date']=$matches[1]; }else{ $songlistArr['create_date']=""; print "error:get create_date fail
"; } //简介 $matches=array(); $pat="/简介:<\/span>([^(<\/p)]*)<\/p>/"; $res=preg_match($pat, $content,$matches); if($res>0){ $songlistArr['info']=$matches[1]; }else{ $songlistArr['info']=""; print "error:get info fail
"; } //歌曲 $matches=array(); $pat="/0){ $songlistArr['songs']=array(); for($i=0;$i$song_title)); } }else{ $songlistArr['song']=""; print "error:get song fail
"; } echo "
";
        print_r($songlistArr);
        echo "
"; $this->saveSonglist($songlistArr); } public function saveSonglist($songlistArr){ //连接数据库 $conn=mysql_connect("localhost","root","root"); mysql_select_db("songlist",$conn); mysql_query("set names utf8"); $songlist=array(); $songlist['title']=mysql_escape_string($songlistArr['title']); $songlist['create_time']=mysql_escape_string($songlistArr['create_date']); $songlist['creator']=mysql_escape_string($songlistArr['creator']); $songlist['raw_url']=mysql_escape_string($songlistArr['raw_url']); $songlist['info']=mysql_escape_string($songlistArr['info']); $sql="insert into songlist set". "title=''".$songlist['title']."'". ",creat_time=''".$songlist['create_time']."'". ",creator=''".$songlist['creator']."'". ",raw_url=''".$songlist['raw_url']."'". ",info=''".$songlist['info']."';"; mysql_query($sql,$conn); $songlist_id=mysql_insert_id(); foreach($songlistArr['songs'] as $song){ $title=mysql_escape_string($song['title']); $sql="insert into song set title='".$title."'" .",songlist_id=".$songlist_id.";"; mysql_query($sql); } mysql_close($conn); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. //创建一个爬虫 $crawler = new MyCrawler(); //设置一个开始的连接 // URL to crawl $start_url="www.kugou.com/yy/special/index/1-0-2.html"; $crawler->setURL($start_url); //设置内容的类型 // Only receive content of files with content-type "text/html" $crawler->addContentTypeReceiveRule("#text/html#"); //忽略图片,设置那些连接不需要下载 //每一个精选集的连接 $crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/single/\d+\.html# i");//i 忽略大小写 //精选集页面的链接 下一页 $crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/index/\d+-0-2.html# i"); // Ignore links to pictures, dont even request pictures $crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i"); // Store and send cookie-data like a browser does $crawler->enableCookieHandling(true); // Set the traffic-limit to 1 MB (in bytes, // for testing we dont want to "suck" the whole site) //数据内容的容量,多少m,0是无限的 $crawler->setTrafficLimit(1000 * 1024); // Thats enough, now here we go $crawler->go(); // At the end, after the process is finished, we print a short // report (see method getProcessReport() for more information) $report = $crawler->getProcessReport(); if (PHP_SAPI == "cli") $lb = "\n"; else $lb = "
"; echo "Summary:".$lb; echo "Links followed: ".$report->links_followed.$lb; echo "Documents received: ".$report->files_received.$lb; echo "Bytes received: ".$report->bytes_received." bytes".$lb; echo "Process runtime: ".$report->process_runtime." sec".$lb; ?>

