Implementing the Analysis Features of a PHP Content Collector


a) URL address analysis
Based on the rule-definition fields stored in the database, the URL setting takes one of three forms: a single page address, a multi-page address list, or a page-range address (from page X to page Y).

// Flow control: branch on the URL form defined in the rule record.
// Field names such as $row->url_type, $row->url and $row->url_end stand in
// for the Chinese column names used in the original rule table.
switch ($row->url_type)
{
    case 1:
        // Single-page setting: one fixed URL
        $url[0] = $row->url;
        break;
    case 2:
        // Multi-page setting: the rule stores one URL per line
        $manypage = explode("\n", $row->url);
        $url = $manypage;
        break;
    case 3:
        // Page-range setting ("page X to page Y"): the [分页] marker in the
        // rule URL shows where the running page number is inserted
        $rowurl = explode('[分页]', $row->url);
        for ($a = 0; $a <= $row->url_end; $a++)
        {
            $url[$a] = $rowurl[0] . $a . $rowurl[1];
            if ($a == 0)
            {
                // The first entry uses the configured start page number
                $url[$a] = $rowurl[0] . $row->url_start . $rowurl[1];
            }
        }
        break;
}
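
As a concrete illustration of the page-range case (the URL below is hypothetical, not taken from the original rules):

// Hypothetical rule value, for illustration only
// $row->url = "http://example.com/news/list_[分页].html";
// explode('[分页]', $row->url) yields:
//   $rowurl[0] = "http://example.com/news/list_"
//   $rowurl[1] = ".html"
// so each iteration builds "http://example.com/news/list_" . $a . ".html",
// and the first array slot uses $row->url_start instead of 0.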


b) Content-page link analysis

// Load the list-page URL
$handles = @file_get_contents($url);
// Match the link rule (the regex stored with the collection rule, shown here as $link_rule)
preg_match_all("/" . $link_rule . "/is", $handles, $matches);
// The extracted content-page link addresses are written into the `links` table:
INSERT INTO `links` (`title`, `url`, `rules`, `date`)
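
A minimal sketch tying these fragments together, assuming a PDO connection, that the rule regex captures the href as group 1 and the title as group 2, and that `rules` stores the rule id; none of these names come from the original code, they only make the fragment above runnable end to end:

// Sketch only: fetch a list page, match the link rule, store the links
function collectLinks(PDO $pdo, $listUrl, $link_rule, $rule_id)
{
    $handles = @file_get_contents($listUrl);
    if ($handles === false) {
        return 0;
    }
    // Assumed capture groups: 1 = href, 2 = title
    preg_match_all("/" . $link_rule . "/is", $handles, $matches);

    $stmt = $pdo->prepare(
        "INSERT INTO `links` (`title`, `url`, `rules`, `date`) VALUES (?, ?, ?, NOW())"
    );
    foreach ($matches[1] as $i => $href) {
        $title = isset($matches[2][$i]) ? $matches[2][$i] : '';
        $stmt->execute(array($title, $href, $rule_id));
    }
    return count($matches[1]);
}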

c) Delay function (cURL fetch with a timeout)

// Reference: http://cn.php.net/manual/zh/ref.curl.php
$ch = curl_init();
$timeout = 10; // set to zero for no timeout
curl_setopt ($ch, CURLOPT_URL, $url);
curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt ($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
$handles = curl_exec($ch);
curl_close($ch);
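
Sections d) and e) below repeat this fallback inline, so it can be factored into a small helper. The function name fetchPage and its parameters are assumptions made for this sketch; the original code simply inlines the same calls:

// Sketch only: try file_get_contents first, fall back to cURL with a connect
// timeout when the first attempt returns nothing
function fetchPage($url, $useCurl = true, $timeout = 10)
{
    $buffer = @file_get_contents($url);
    if (empty($buffer) && $useCurl && function_exists('curl_init')) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        $buffer = curl_exec($ch);
        curl_close($ch);
    }
    return $buffer;
}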

d) Pagination analysis: all pages listed at once

// Match the pagination block (the page-link area) with the pagination rule
preg_match("/" . $page_rule . "/is", $buffer, $regs2);
// Extract every URL inside the matched block (href or value attributes); group 3 holds the address
preg_match_all('/<[^<>]*(href|value)=("|\')?([^\'"<>]*)("|\')?[^<>]*>/is', $regs2[1], $regs3);
for ($i = 0; $i < count($regs3[3]); $i++)
{
    // Complete relative links with the host (see g) below)
    $gethttp = $string->gethttp($regs3[3][$i]);
    $buffer2 = @file_get_contents($gethttp);
    // Fallback fetch with timeout (see c) above)
    if (empty($buffer2)) {
        if ($phpcurl_init == "yes") {
            $ch = curl_init();
            $timeout = 10; // set to zero for no timeout
            curl_setopt($ch, CURLOPT_URL, $gethttp);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
            $buffer2 = curl_exec($ch);
            curl_close($ch);
        }
    }
    // Extract the body of this page with the content rule and append it
    preg_match("/" . $body_rule . "/is", $buffer2, $regss);
    $cont .= $regss[1];
}
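
For context, the pagination and content rules are plain regular-expression fragments stored with the collection rule. The patterns below are hypothetical examples, not the ones used by the original scraper:

// Hypothetical rule fragments, for illustration only
$page_rule = '<div class="pages">(.*?)<\/div>';   // captures the pager block as group 1
$body_rule = '<div class="content">(.*?)<\/div>'; // captures the article body as group 1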

e) Pagination analysis: previous/next page form

// Match the "next page" link with the pagination rule (the link is captured as group 1)
if (preg_match("/" . $page_rule . "/is", $buffer, $ljregs))
{
    while ($ljregs[1] != "")
    {
        // Complete the relative link and fetch the next page
        $ljregs[1] = $string->gethttp($ljregs[1]);
        $buffer = @file_get_contents($ljregs[1]);
        // Fallback fetch with timeout (see c) above)
        if (empty($buffer)) {
            if ($phpcurl_init == "yes") {
                $ch = curl_init();
                $timeout = 10; // set to zero for no timeout
                curl_setopt($ch, CURLOPT_URL, $ljregs[1]);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
                $buffer = curl_exec($ch);
                curl_close($ch);
            }
        }
        // Extract the body of this page and append it
        preg_match("/" . $body_rule . "/is", $buffer, $regs);
        $cont .= $regs[1];
        // Re-match the pagination rule on the new page; stop when no further
        // "next page" link is found, otherwise the loop would never terminate
        if (!preg_match("/" . $page_rule . "/is", $buffer, $ljregs)) {
            break;
        }
    }
}
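
In this form the pagination rule is expected to capture the href of the "next page" link as group 1. A hypothetical example of such a rule (not taken from the original post):

// Hypothetical next-page rule: matches <a href="...">下一页</a> and captures the href
$page_rule = '<a href="([^"]+)"[^>]*>下一页<\/a>';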

f) Cookie handling

$string->fileFsock($url, $cookie); // fetch a page, sending the cookie data
// Collect via fsockopen so that a Cookie header can be sent with the request
function fileFsock($url, $cookie)
{
    set_time_limit(0);
    $cookie = $this->getCookie($cookie);
    // Split the URL into scheme, host and path
    preg_match("/^(http:\/\/)([^\/]+)(.*)/i", $url, $matches);
    $fp = fsockopen($matches[2], 80, $errno, $errstr, 30);
    if (!$fp) {
        echo "$errstr ($errno)\n";
    } else {
        $out  = "POST $matches[3] HTTP/1.1\r\n";
        $out .= "Host: $matches[2]\r\n";
        $out .= "Cookie: " . $cookie . "\r\n";
        $out .= "Content-Length: 0\r\n";      // no request body is sent
        $out .= "Connection: Close\r\n\r\n";
        fputs($fp, $out);
        // Read the response before closing the socket
        $response = '';
        while (!feof($fp)) {
            $response .= fgets($fp, 1024);
        }
        fclose($fp);
        return $response;
    }
}
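
A usage sketch, assuming getCookie() turns the stored cookie record into a "name=value; name2=value2" header string (the URL and cookie values below are made up):

// Hypothetical call: fetch a page that requires a logged-in session cookie
$html = $string->fileFsock(
    "http://example.com/member/list.php",
    "PHPSESSID=abc123; user=demo"   // raw cookie data passed on to getCookie()
);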

g) Other helper functions: prepend the host to URLs that lack one

function gethttp($url)
{
    $url = trim($url);
    if (!preg_match('/http:\/\//i', $url, $out))
    {
        // URL starting with "../": prepend the base stored in $this->outurl[3]
        if (preg_match('/^(\.{2})\/(.+)?/i', $url, $out))
        {
            return $this->outurl[3] . $out[2];
        }

        // URL starting with "./": drop the "./" and prepend $this->outurl[1]
        if (preg_match('/^(\.\/)(.+)?/i', $url, $out))
        {
            return $this->outurl[1] . $out[2];
        }

        // URL starting with "/": prepend $this->outurl[2]
        if (preg_match('/^\/(.+)/i', $url, $out))
        {
            return $this->outurl[2] . $out[1];
        }

        // Plain relative URL: prepend $this->outurl[1]
        return $this->outurl[1] . $url;

    } else {
        return $url; // already has http://, nothing to do
    }
}
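
gethttp() relies on $this->outurl being prepared in advance, which the original post does not show. A plausible sketch, assuming outurl[1] is the list page's directory, outurl[2] the site root and outurl[3] the parent directory (these index meanings are an assumption, not documented in the article):

// Sketch only: derive the three base URLs used by gethttp() from the list-page URL
function setOuturl($listUrl)
{
    $parts  = parse_url($listUrl);
    $base   = $parts['scheme'] . '://' . $parts['host'];
    $path   = isset($parts['path']) ? $parts['path'] : '/';
    $dir    = substr($path, 0, strrpos($path, '/') + 1);   // directory of the list page
    $parent = preg_replace('/[^\/]+\/$/', '', $dir);        // one directory up ("/" stays "/")

    $this->outurl[1] = $base . $dir;     // base for "./path" and bare relative links
    $this->outurl[2] = $base . '/';      // base for "/path" links (site root)
    $this->outurl[3] = $base . $parent;  // base for "../path" links
}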
