网页抓取及下载

downAndroidApk.php

<?php
/*
命令行
d:
cd ApacheServer\php
php.exe D:\ApacheServer\web\crawl\downAndroidApk.php --appidFile=D:\ApacheServer\web\crawl\youxi.txt --newDir=D:\ApacheServer\web\crawl\requestNewDir

*/
// 判断必须在php-cli模式下运行,即命令行执行
if (strtolower(php_sapi_name()) != 'cli') {
    echo " error : this script must run by php-cli mode.";
    exit;
}

echo "\n\n\n =================== begin download =================== \n";
/*
 * 分析参数
 */
//创建变量
$new_dir = $old_dir = $appid_file = '';
$sleep_time = 0;
//strpos() 函数返回字符串在另一个字符串中第一次出现的位置。如果没有找到该字符串,则返回 false。
//$argv存储命令行 php.exe 命令后面每一个用空格分割的字符串组成的数组,索引从0起
//DIRECTORY_SEPARATOR 根据不同的系统返回不同的路径分隔符。windows  "\" 或 "/",linux "/"

foreach ($argv as $k=>$v) {                                         //根据命令行给定参数,设定相应变量值
    if (!$new_dir && strpos($v, '-newDir=')===1) {                    // 设置新保存的APP目录
        $new_dir = substr($v, 9);
        if (substr($new_dir, -1)==DIRECTORY_SEPARATOR) {
            $new_dir = substr($new_dir, 0, -1);
        }
    } elseif (!$old_dir && strpos($v, '-oldDir=')===1) {            // 设置旧的APP存放目录
        $old_dir = substr($v, 9);
        if (substr($old_dir, -1)==DIRECTORY_SEPARATOR) {
            $old_dir = substr($new_dir, 0, -1);
        }
    } elseif (!$appid_file && strpos($v, '-appidFile=')===1) {        // 设置读取ID列表参数
        $appid_file = substr($v, 12);
    }elseif (!$sleep_time && strpos($v, '-sleep=')===1) {            // 设置暂停时间
        $sleep_time = intval(substr($v, 8));
        $sleep_time<=0 ? $sleep_time=3 : 1;
    }
    
}

/*
 * 检测参数
 */
// 判断存储appid列表的文件是否存在
if (!$appid_file || !file_exists($appid_file)) {
    echo 'error : --appidFile error. app id file is not exists!'."\n";
    exit;
}
// 判断保存下载APP的目录是否创建
if (!$new_dir || !file_exists($new_dir)) {
    echo 'error: --newDir error. new app downloaded directory is not exists, please create that.'."\n";
    exit;
}
// 判断以前保持下载目录的目录是否存在
if (!$old_dir || !file_exists($old_dir)) {
    echo '-----warong: --oldDir warong. old app downloaded directory is not exists, can not be compared.'."\n";
    sleep(3);
}



// 引入页面抓取类
require('Snoopy.class.php');
// 引入下载类
require('dedehttpdown.class.php');



// 分析app列表文件
/*
存储appid的youxi.txt文件的存储格式为
appid1
appid2
appid3
$app_id_array通过file()函数获取的是每行字符串组成的数组,索引从0开始
*/
$app_id_array = file($appid_file);
//打开日志文件
$log = '';
$fp = fopen($new_dir.DIRECTORY_SEPARATOR.'log.txt', 'a');
foreach ($app_id_array as $k=>$appid) {
    if (empty($appid)) {
        $log = '-----error : appid "'.$v.'" is can not get info'."\n";
        continue;
    }
    echo '--begin appid: '.$appid."\n";//打印显示开始下载的appid
    $log = getAppInfo($appid);
    echo $log[0]."\n";

    // 打印log
    fwrite($fp, date('Y-m-d H:i:s', time()).' '.$log[0]."\n");
    // 如果下载失败,打印失败log
    if (isset($log[1]) && !empty($log[1])) {
        fwrite($fp, date('Y-m-d H:i:s', time()).' ----------'.$log[1]."----------\n");
    }
    echo '--end appid: '.$appid."\n";

    if ($sleep_time) {
        sleep($sleep_time);
    }
}
fclose($fp);

echo "\n =================== end download =================== \n\n\n";



/*
 * 获取某应用信息
 */
function getAppInfo($app_id) {
    global $new_dir;
    //创建抓取类对象
    $snoopy = new Snoopy();
    //0对应正常下载信息,1对应失败错误信息
    $log = array(0=>'', 1=>'');
    echo "1 get $app_id info start\n";
    //初始化图片地址,应用名称,版本号
    $bigicon_url = $title = $app_version ='';
    //设置要抓取的网页路径
    $url = 'http://www.wandoujia.com/apps/'.$app_id;
    $snoopy->fetch($url); //将路径赋给抓取对象
    $content = $snoopy->results; //获取所有内容
    //通过正则查找将内容中回车换成空格,回车符两边的 //必须加,因为正则内容必须写在//中
    $content = preg_replace("/\n/", ' ', $content);
    //将内容中换行换成空格
    $content = preg_replace("/\r/", ' ', $content);

    // 获取图片地址 bigicon_url
    //$imgs为搜索结果。 $imgs[0]将包含完整模式匹配到的文本, $imgs[1] 将包含第一个捕获子组(正则里第一个括号匹配到的内容)匹配到的文本,以此类推。
    $i = preg_match('/class="app-icon".*?rc="(.*?)"/', $content, $imgs);
    $bigicon_url = $imgs[1];

    // 获取应用名称 title
    if ($i) {
        $i = preg_match('/class="app-name".*?itemprop="name">(.*?)</', $content, $title);
        $title = trim($title[1]);
    }
    
    // 获取版本号 app_version
    if ($i) {
        $i = preg_match('/<dt>版本<\/dt>.*?<dd>(.*?)</', $content, $app_version);
        $app_version = trim($app_version[1]);
    }
    

    // 获取APK包大小
    if ($i) {
        $i = preg_match('/<dt>大小<\/dt>.*?<dd>(.*?)</', $content, $apk_size);
        $apk_size = trim($apk_size[1]);
    }
    //如果获取app信息成功则进行下载
    if ($bigicon_url && $title && $app_version) {
        $app_dir = $new_dir.DIRECTORY_SEPARATOR.$app_id;
        if (!file_exists($app_dir)) {
            mkdir($app_dir);
            chmod($app_dir, 0777);
        }

        $log[0] = $app_id.'|'.$title.'|'.$app_version.'|';

        // 开始下载图片和APK包
        echo "2 get $app_id info finished\n";
        echo "3 get $app_id image start\n";
        $img_size = download($app_id, 'img', $bigicon_url);        // 下载图片
        echo "4 get $app_id image finished\n";
        echo "5 get $app_id app start. size: $apk_size\n";
        $apk_size = str_ireplace('M', '', $apk_size);
        $app_size = download($app_id, 'app', '', $apk_size);    // 下载APP
        echo "6 get $app_id app finished\n";
        //如果返回的文件大小为空则下载失败
        if ($app_size=='B') {
            $log[1] = '-----'. $app_id . ' downloaded apk failure.-----';
            echo $log[1]."\n";
        }
        $log[0] .= $app_size;
        //记录日志文件
        file_put_contents($app_dir.DIRECTORY_SEPARATOR.'apkinfo.txt', $log[0]);
        return $log;
    } else {
        $log[0] = $app_id.'|';
        $log[1] = '-----'.$app_id .' get html failed. -----';
        echo $log[1]."\n";
        return $log;
    }
}


/* 
 * 下载图片和应用
 *        $type = 'img/app' 图片或应用. 
 *        $img_url图片地址, 仅仅type为img时需要
 *        $apk_size在正常下载不好用时需要
 */
function download($app_id, $type='img', $img_url='', $apk_size=0) {
    global $new_dir;
    //创建下载对象
    $httpdown = new DedeHttpDown();
    //下载APP
    if ($type == 'app') {
        // 组建下载地址
        $app_url = 'http://apps.wandoujia.com/apps/'.$app_id.'/download?pos=www/detail';
        echo $app_url."\n";
        // 设置文件保存完整路径
        $app_file = $new_dir . DIRECTORY_SEPARATOR . $app_id . DIRECTORY_SEPARATOR . $app_id.'.apk';
        $file_have = 0;
        //有时候会下载不成功,重试两次,不成功则跳过
        for ($i=1; $i<=2; $i++) {
            echo 'downloaded times: '.$i."\n";
            //设置下载路径
            $httpdown->OpenUrl($app_url);
            //设置保存路径,到保存文件的扩展名,完整路径
            $httpdown->SaveToBin($app_file);
            $httpdown->Close();
            if (file_exists($app_file)) {
                $file_have = 1;
                break;
            }
            //等待三秒,防止下载太快网站封IP
            sleep(3);
        }
        // 如果下载失败,使用snoopy尝试一次
        if (!file_exists($app_file)) {
            echo 'downloaded times: 3'."\n";
            //获取php.ini的内存限制大小
            $memory_limit = ini_get('memory_limit');
            if ($apk_size && $memory_limit-2>$apk_size) {
                $snoopy = new Snoopy();
                $snoopy->fetch($app_url);
                $content = $snoopy->results;//获取内容
                file_put_contents($app_file, $content);
            }
        }
        if (file_exists($app_file)) {
            $file_have = 1;
        }
        unset($httpdown);
        if ($file_have) {
            //获取下载后文件的大小
            return filesize($app_file).'B';
        } else {
            return 'B';
        }
    //否则下载图片
    } elseif ($type == 'img') {
        // 设置文件保存完整路径
        $bigicon = $new_dir . DIRECTORY_SEPARATOR . $app_id . DIRECTORY_SEPARATOR . 'bigicon.png';
        $httpdown->OpenUrl($img_url);
        $httpdown->SaveToBin($bigicon);
        $httpdown->Close();
        unset($httpdown);
        if (file_exists($bigicon)) {
            return filesize($bigicon).'B';
        } else {
            return 'B';
        }
    }
}
?>

Snoopy.class.php

<?php

/*************************************************

Snoopy - the PHP net client
Author: Monte Ohrt <[email protected]>
Copyright (c): 1999-2008 New Digital Group, all rights reserved
Version: 1.2.4

 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

You may contact the author of Snoopy by e-mail at:
[email protected]

The latest version of Snoopy can be obtained from:
http://snoopy.sourceforge.net/

*************************************************/

class Snoopy
{
    /**** Public variables ****/
    
    /* user definable vars */

    var $host            =    "www.php.net";        // host name we are connecting to
    var $port            =    80;                    // port we are connecting to
    var $proxy_host        =    "";                    // proxy host to use
    var $proxy_port        =    "";                    // proxy port to use
    var $proxy_user        =    "";                    // proxy user to use
    var $proxy_pass        =    "";                    // proxy password to use
    
    var $agent            =    "Snoopy v1.2.4";    // agent we masquerade as
    var    $referer        =    "";                    // referer info to pass
    var $cookies        =    array();            // array of cookies to pass
                                                // $cookies["username"]="joe";
    var    $rawheaders        =    array();            // array of raw headers to send
                                                // $rawheaders["Content-type"]="text/html";

    var $maxredirs        =    5;                    // http redirection depth maximum. 0 = disallow
    var $lastredirectaddr    =    "";                // contains address of last redirected address
    var    $offsiteok        =    true;                // allows redirection off-site
    var $maxframes        =    0;                    // frame content depth maximum. 0 = disallow
    var $expandlinks    =    true;                // expand links to fully qualified URLs.
                                                // this only applies to fetchlinks()
                                                // submitlinks(), and submittext()
    var $passcookies    =    true;                // pass set cookies back through redirects
                                                // NOTE: this currently does not respect
                                                // dates, domains or paths.
    
    var    $user            =    "";                    // user for http authentication
    var    $pass            =    "";                    // password for http authentication
    
    // http accept types
    var $accept            =    "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    
    var $results        =    "";                    // where the content is put
        
    var $error            =    "";                    // error messages sent here
    var    $response_code    =    "";                    // response code returned from server
    var    $headers        =    array();            // headers returned from server sent here
    var    $maxlength        =    500000;                // max return data length (body)
    var $read_timeout    =    0;                    // timeout on read operations, in seconds
                                                // supported only since PHP 4 Beta 4
                                                // set to 0 to disallow timeouts
    var $timed_out        =    false;                // if a read operation timed out
    var    $status            =    0;                    // http request status

    var $temp_dir        =    "/tmp";                // temporary directory that the webserver
                                                // has permission to write to.
                                                // under Windows, this should be C:\temp

    var    $curl_path        =    "/usr/local/bin/curl";
                                                // Snoopy will use cURL for fetching
                                                // SSL content if a full system path to
                                                // the cURL binary is supplied here.
                                                // set to false if you do not have
                                                // cURL installed. See http://curl.haxx.se
                                                // for details on installing cURL.
                                                // Snoopy does *not* use the cURL
                                                // library functions built into php,
                                                // as these functions are not stable
                                                // as of this Snoopy release.
    
    /**** Private variables ****/    
    
    var    $_maxlinelen    =    4096;                // max line length (headers)
    
    var $_httpmethod    =    "GET";                // default http request method
    var $_httpversion    =    "HTTP/1.0";            // default http request version
    var $_submit_method    =    "POST";                // default submit method
    var $_submit_type    =    "application/x-www-form-urlencoded";    // default submit type
    var $_mime_boundary    =   "";                    // MIME boundary for multipart/form-data submit type
    var $_redirectaddr    =    false;                // will be set if page fetched is a redirect
    var $_redirectdepth    =    0;                    // increments on an http redirect
    var $_frameurls        =     array();            // frame src urls
    var $_framedepth    =    0;                    // increments on frame depth
    
    var $_isproxy        =    false;                // set if using a proxy server
    var $_fp_timeout    =    30;                    // timeout for socket connection

/*======================================================================*\
    Function:    fetch
    Purpose:    fetch the contents of a web page
                (and possibly other protocols in the
                future like ftp, nntp, gopher, etc.)
    Input:        $URI    the location of the page to fetch
    Output:        $this->results    the output text from the fetch
\*======================================================================*/

    function fetch($URI)
    {
    
        //preg_match("|^([^:]+)://([^:/]+)(:[\d]+)*(.*)|",$URI,$URI_PARTS);
        $URI_PARTS = parse_url($URI);
        if (!empty($URI_PARTS["user"]))
            $this->user = $URI_PARTS["user"];
        if (!empty($URI_PARTS["pass"]))
            $this->pass = $URI_PARTS["pass"];
        if (empty($URI_PARTS["query"]))
            $URI_PARTS["query"] = '';
        if (empty($URI_PARTS["path"]))
            $URI_PARTS["path"] = '';
                
        switch(strtolower($URI_PARTS["scheme"]))
        {
            case "http":
                $this->host = $URI_PARTS["host"];
                if(!empty($URI_PARTS["port"]))
                    $this->port = $URI_PARTS["port"];
                if($this->_connect($fp))
                {
                    if($this->_isproxy)
                    {
                        // using proxy, send entire URI
                        $this->_httprequest($URI,$fp,$URI,$this->_httpmethod);
                    }
                    else
                    {
                        $path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : "");
                        // no proxy, send only the path
                        $this->_httprequest($path, $fp, $URI, $this->_httpmethod);
                    }
                    
                    $this->_disconnect($fp);

                    if($this->_redirectaddr)
                    {
                        /* url was redirected, check if we've hit the max depth */
                        if($this->maxredirs > $this->_redirectdepth)
                        {
                            // only follow redirect if it's on this site, or offsiteok is true
                            if(preg_match("|^http://".preg_quote($this->host)."|i",$this->_redirectaddr) || $this->offsiteok)
                            {
                                /* follow the redirect */
                                $this->_redirectdepth++;
                                $this->lastredirectaddr=$this->_redirectaddr;
                                $this->fetch($this->_redirectaddr);
                            }
                        }
                    }

                    if($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
                    {
                        $frameurls = $this->_frameurls;
                        $this->_frameurls = array();
                        
                        while(list(,$frameurl) = each($frameurls))
                        {
                            if($this->_framedepth < $this->maxframes)
                            {
                                $this->fetch($frameurl);
                                $this->_framedepth++;
                            }
                            else
                                break;
                        }
                    }                    
                }
                else
                {
                    return false;
                }
                return true;                    
                break;
            case "https":
                if(!$this->curl_path)
                    return false;
                if(function_exists("is_executable"))
                    if (!is_executable($this->curl_path))
                        return false;
                $this->host = $URI_PARTS["host"];
                if(!empty($URI_PARTS["port"]))
                    $this->port = $URI_PARTS["port"];
                if($this->_isproxy)
                {
                    // using proxy, send entire URI
                    $this->_httpsrequest($URI,$URI,$this->_httpmethod);
                }
                else
                {
                    $path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : "");
                    // no proxy, send only the path
                    $this->_httpsrequest($path, $URI, $this->_httpmethod);
                }

                if($this->_redirectaddr)
                {
                    /* url was redirected, check if we've hit the max depth */
                    if($this->maxredirs > $this->_redirectdepth)
                    {
                        // only follow redirect if it's on this site, or offsiteok is true
                        if(preg_match("|^http://".preg_quote($this->host)."|i",$this->_redirectaddr) || $this->offsiteok)
                        {
                            /* follow the redirect */
                            $this->_redirectdepth++;
                            $this->lastredirectaddr=$this->_redirectaddr;
                            $this->fetch($this->_redirectaddr);
                        }
                    }
                }

                if($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
                {
                    $frameurls = $this->_frameurls;
                    $this->_frameurls = array();

                    while(list(,$frameurl) = each($frameurls))
                    {
                        if($this->_framedepth < $this->maxframes)
                        {
                            $this->fetch($frameurl);
                            $this->_framedepth++;
                        }
                        else
                            break;
                    }
                }                    
                return true;                    
                break;
            default:
                // not a valid protocol
                $this->error    =    'Invalid protocol "'.$URI_PARTS["scheme"].'"\n';
                return false;
                break;
        }        
        return true;
    }

/*======================================================================*\
    Function:    submit
    Purpose:    submit an http form
    Input:        $URI    the location to post the data
                $formvars    the formvars to use.
                    format: $formvars["var"] = "val";
                $formfiles  an array of files to submit
                    format: $formfiles["var"] = "/dir/filename.ext";
    Output:        $this->results    the text output from the post
\*======================================================================*/

    function submit($URI, $formvars="", $formfiles="")
    {
        unset($postdata);
        
        $postdata = $this->_prepare_post_body($formvars, $formfiles);
            
        $URI_PARTS = parse_url($URI);
        if (!empty($URI_PARTS["user"]))
            $this->user = $URI_PARTS["user"];
        if (!empty($URI_PARTS["pass"]))
            $this->pass = $URI_PARTS["pass"];
        if (empty($URI_PARTS["query"]))
            $URI_PARTS["query"] = '';
        if (empty($URI_PARTS["path"]))
            $URI_PARTS["path"] = '';

        switch(strtolower($URI_PARTS["scheme"]))
        {
            case "http":
                $this->host = $URI_PARTS["host"];
                if(!empty($URI_PARTS["port"]))
                    $this->port = $URI_PARTS["port"];
                if($this->_connect($fp))
                {
                    if($this->_isproxy)
                    {
                        // using proxy, send entire URI
                        $this->_httprequest($URI,$fp,$URI,$this->_submit_method,$this->_submit_type,$postdata);
                    }
                    else
                    {
                        $path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : "");
                        // no proxy, send only the path
                        $this->_httprequest($path, $fp, $URI, $this->_submit_method, $this->_submit_type, $postdata);
                    }
                    
                    $this->_disconnect($fp);

                    if($this->_redirectaddr)
                    {
                        /* url was redirected, check if we've hit the max depth */
                        if($this->maxredirs > $this->_redirectdepth)
                        {                        
                            if(!preg_match("|^".$URI_PARTS["scheme"]."://|", $this->_redirectaddr))
                                $this->_redirectaddr = $this->_expandlinks($this->_redirectaddr,$URI_PARTS["scheme"]."://".$URI_PARTS["host"]);                        
                            
                            // only follow redirect if it's on this site, or offsiteok is true
                            if(preg_match("|^http://".preg_quote($this->host)."|i",$this->_redirectaddr) || $this->offsiteok)
                            {
                                /* follow the redirect */
                                $this->_redirectdepth++;
                                $this->lastredirectaddr=$this->_redirectaddr;
                                if( strpos( $this->_redirectaddr, "?" ) > 0 )
                                    $this->fetch($this->_redirectaddr); // the redirect has changed the request method from post to get
                                else
                                    $this->submit($this->_redirectaddr,$formvars, $formfiles);
                            }
                        }
                    }

                    if($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
                    {
                        $frameurls = $this->_frameurls;
                        $this->_frameurls = array();
                        
                        while(list(,$frameurl) = each($frameurls))
                        {                                                        
                            if($this->_framedepth < $this->maxframes)
                            {
                                $this->fetch($frameurl);
                                $this->_framedepth++;
                            }
                            else
                                break;
                        }
                    }                    
                    
                }
                else
                {
                    return false;
                }
                return true;                    
                break;
            case "https":
                if(!$this->curl_path)
                    return false;
                if(function_exists("is_executable"))
                    if (!is_executable($this->curl_path))
                        return false;
                $this->host = $URI_PARTS["host"];
                if(!empty($URI_PARTS["port"]))
                    $this->port = $URI_PARTS["port"];
                if($this->_isproxy)
                {
                    // using proxy, send entire URI
                    $this->_httpsrequest($URI, $URI, $this->_submit_method, $this->_submit_type, $postdata);
                }
                else
                {
                    $path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : "");
                    // no proxy, send only the path
                    $this->_httpsrequest($path, $URI, $this->_submit_method, $this->_submit_type, $postdata);
                }

                if($this->_redirectaddr)
                {
                    /* url was redirected, check if we've hit the max depth */
                    if($this->maxredirs > $this->_redirectdepth)
                    {                        
                        if(!preg_match("|^".$URI_PARTS["scheme"]."://|", $this->_redirectaddr))
                            $this->_redirectaddr = $this->_expandlinks($this->_redirectaddr,$URI_PARTS["scheme"]."://".$URI_PARTS["host"]);                        

                        // only follow redirect if it's on this site, or offsiteok is true
                        if(preg_match("|^http://".preg_quote($this->host)."|i",$this->_redirectaddr) || $this->offsiteok)
                        {
                            /* follow the redirect */
                            $this->_redirectdepth++;
                            $this->lastredirectaddr=$this->_redirectaddr;
                            if( strpos( $this->_redirectaddr, "?" ) > 0 )
                                $this->fetch($this->_redirectaddr); // the redirect has changed the request method from post to get
                            else
                                $this->submit($this->_redirectaddr,$formvars, $formfiles);
                        }
                    }
                }

                if($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
                {
                    $frameurls = $this->_frameurls;
                    $this->_frameurls = array();

                    while(list(,$frameurl) = each($frameurls))
                    {                                                        
                        if($this->_framedepth < $this->maxframes)
                        {
                            $this->fetch($frameurl);
                            $this->_framedepth++;
                        }
                        else
                            break;
                    }
                }                    
                return true;                    
                break;
                
            default:
                // not a valid protocol
                $this->error    =    'Invalid protocol "'.$URI_PARTS["scheme"].'"\n';
                return false;
                break;
        }        
        return true;
    }

/*======================================================================*\
    Function:    fetchlinks
    Purpose:    fetch the links from a web page
    Input:        $URI    where you are fetching from
    Output:        $this->results    an array of the URLs
\*======================================================================*/

    function fetchlinks($URI)
    {
        if ($this->fetch($URI))
        {            
            if($this->lastredirectaddr)
                $URI = $this->lastredirectaddr;
            if(is_array($this->results))
            {
                for($x=0;$x<count($this->results);$x++)
                    $this->results[$x] = $this->_striplinks($this->results[$x]);
            }
            else
                $this->results = $this->_striplinks($this->results);

            if($this->expandlinks)
                $this->results = $this->_expandlinks($this->results, $URI);
            return true;
        }
        else
            return false;
    }

/*======================================================================*\
    Function:    fetchform
    Purpose:    fetch the form elements from a web page
    Input:        $URI    where you are fetching from
    Output:        $this->results    the resulting html form
\*======================================================================*/

    function fetchform($URI)
    {
        
        if ($this->fetch($URI))
        {            

            if(is_array($this->results))
            {
                for($x=0;$x<count($this->results);$x++)
                    $this->results[$x] = $this->_stripform($this->results[$x]);
            }
            else
                $this->results = $this->_stripform($this->results);
            
            return true;
        }
        else
            return false;
    }
    
    
/*======================================================================*\
    Function:    fetchtext
    Purpose:    fetch the text from a web page, stripping the links
    Input:        $URI    where you are fetching from
    Output:        $this->results    the text from the web page
\*======================================================================*/

    function fetchtext($URI)
    {
        if($this->fetch($URI))
        {            
            if(is_array($this->results))
            {
                for($x=0;$x<count($this->results);$x++)
                    $this->results[$x] = $this->_striptext($this->results[$x]);
            }
            else
                $this->results = $this->_striptext($this->results);
            return true;
        }
        else
            return false;
    }

/*======================================================================*\
    Function:    submitlinks
    Purpose:    grab links from a form submission
    Input:        $URI    where you are submitting from
    Output:        $this->results    an array of the links from the post
\*======================================================================*/

    function submitlinks($URI, $formvars="", $formfiles="")
    {
        if($this->submit($URI,$formvars, $formfiles))
        {            
            if($this->lastredirectaddr)
                $URI = $this->lastredirectaddr;
            if(is_array($this->results))
            {
                for($x=0;$x<count($this->results);$x++)
                {
                    $this->results[$x] = $this->_striplinks($this->results[$x]);
                    if($this->expandlinks)
                        $this->results[$x] = $this->_expandlinks($this->results[$x],$URI);
                }
            }
            else
            {
                $this->results = $this->_striplinks($this->results);
                if($this->expandlinks)
                    $this->results = $this->_expandlinks($this->results,$URI);
            }
            return true;
        }
        else
            return false;
    }

/*======================================================================*\
    Function:    submittext
    Purpose:    grab text from a form submission
    Input:        $URI    where you are submitting from
    Output:        $this->results    the text from the web page
\*======================================================================*/

    function submittext($URI, $formvars = "", $formfiles = "")
    {
        if($this->submit($URI,$formvars, $formfiles))
        {            
            if($this->lastredirectaddr)
                $URI = $this->lastredirectaddr;
            if(is_array($this->results))
            {
                for($x=0;$x<count($this->results);$x++)
                {
                    $this->results[$x] = $this->_striptext($this->results[$x]);
                    if($this->expandlinks)
                        $this->results[$x] = $this->_expandlinks($this->results[$x],$URI);
                }
            }
            else
            {
                $this->results = $this->_striptext($this->results);
                if($this->expandlinks)
                    $this->results = $this->_expandlinks($this->results,$URI);
            }
            return true;
        }
        else
            return false;
    }

    

/*======================================================================*\
    Function:    set_submit_multipart
    Purpose:    Set the form submission content type to
                multipart/form-data
\*======================================================================*/
    function set_submit_multipart()
    {
        $this->_submit_type = "multipart/form-data";
    }

    
/*======================================================================*\
    Function:    set_submit_normal
    Purpose:    Set the form submission content type to
                application/x-www-form-urlencoded
\*======================================================================*/
    function set_submit_normal()
    {
        $this->_submit_type = "application/x-www-form-urlencoded";
    }

    
    

/*======================================================================*\
    Private functions
\*======================================================================*/
    
    
/*======================================================================*\
    Function:    _striplinks
    Purpose:    strip the hyperlinks from an html document
    Input:        $document    document to strip.
    Output:        $match        an array of the links
\*======================================================================*/

    function _striplinks($document)
    {    
        preg_match_all("'<\s*a\s.*?href\s*=\s*            # find <a href=
                        ([\"\'])?                    # find single or double quote
                        (?(1) (.*?)\\1 | ([^\s\>]+))        # if quote found, match up to next matching
                                                    # quote, otherwise match up to next space
                        'isx",$document,$links);
                        

        // catenate the non-empty matches from the conditional subpattern

        while(list($key,$val) = each($links[2]))
        {
            if(!empty($val))
                $match[] = $val;
        }                
        
        while(list($key,$val) = each($links[3]))
        {
            if(!empty($val))
                $match[] = $val;
        }        
        
        // return the links
        return $match;
    }

/*======================================================================*\
    Function:    _stripform
    Purpose:    strip the form elements from an html document
    Input:        $document    document to strip.
    Output:        $match        an array of the links
\*======================================================================*/

    function _stripform($document)
    {    
        preg_match_all("'<\/?(FORM|INPUT|SELECT|TEXTAREA|(OPTION))[^<>]*>(?(2)(.*(?=<\/?(option|select)[^<>]*>[\r\n]*)|(?=[\r\n]*))|(?=[\r\n]*))'Usi",$document,$elements);
        
        // catenate the matches
        $match = implode("\r\n",$elements[0]);
                
        // return the links
        return $match;
    }

    
    
/*======================================================================*\
    Function:    _striptext
    Purpose:    strip the text from an html document
    Input:        $document    document to strip.
    Output:        $text        the resulting text
\*======================================================================*/

    function _striptext($document)
    {
        
        // I didn't use preg eval (//e) since that is only available in PHP 4.0.
        // so, list your entities one by one here. I included some of the
        // more common ones.
                                
        $search = array("'<script[^>]*?>.*?</script>'si",    // strip out javascript
                        "'<[\/\!]*?[^<>]*?>'si",            // strip out html tags
                        "'([\r\n])[\s]+'",                    // strip out white space
                        "'&(quot|#34|#034|#x22);'i",        // replace html entities
                        "'&(amp|#38|#038|#x26);'i",            // added hexadecimal values
                        "'&(lt|#60|#060|#x3c);'i",
                        "'&(gt|#62|#062|#x3e);'i",
                        "'&(nbsp|#160|#xa0);'i",
                        "'&(iexcl|#161);'i",
                        "'&(cent|#162);'i",
                        "'&(pound|#163);'i",
                        "'&(copy|#169);'i",
                        "'&(reg|#174);'i",
                        "'&(deg|#176);'i",
                        "'&(#39|#039|#x27);'",
                        "'&(euro|#8364);'i",                // europe
                        "'&a(uml|UML);'",                    // german
                        "'&o(uml|UML);'",
                        "'&u(uml|UML);'",
                        "'&A(uml|UML);'",
                        "'&O(uml|UML);'",
                        "'&U(uml|UML);'",
                        "'&szlig;'i",
                        );
        $replace = array(    "",
                            "",
                            "\\1",
                            "\"",
                            "&",
                            "<",
                            ">",
                            " ",
                            chr(161),
                            chr(162),
                            chr(163),
                            chr(169),
                            chr(174),
                            chr(176),
                            chr(39),
                            chr(128),
                            "�",
                            "�",
                            "�",
                            "�",
                            "�",
                            "�",
                            "�",
                        );
                    
        $text = preg_replace($search,$replace,$document);
                                
        return $text;
    }

/*======================================================================*\
    Function:    _expandlinks
    Purpose:    expand each link into a fully qualified URL
    Input:        $links            the links to qualify
                $URI            the full URI to get the base from
    Output:        $expandedLinks    the expanded links
\*======================================================================*/

    function _expandlinks($links,$URI)
    {
        
        preg_match("/^[^\?]+/",$URI,$match);

        $match = preg_replace("|/[^\/\.]+\.[^\/\.]+$|","",$match[0]);
        $match = preg_replace("|/$|","",$match);
        $match_part = parse_url($match);
        $match_root =
        $match_part["scheme"]."://".$match_part["host"];
                
        $search = array(     "|^http://".preg_quote($this->host)."|i",
                            "|^(\/)|i",
                            "|^(?!http://)(?!mailto:)|i",
                            "|/\./|",
                            "|/[^\/]+/\.\./|"
                        );
                        
        $replace = array(    "",
                            $match_root."/",
                            $match."/",
                            "/",
                            "/"
                        );            
                
        $expandedLinks = preg_replace($search,$replace,$links);

        return $expandedLinks;
    }

/*======================================================================*\
    Function:    _httprequest
    Purpose:    go get the http data from the server
    Input:        $url        the url to fetch
                $fp            the current open file pointer
                $URI        the full URI
                $body        body contents to send if any (POST)
    Output:        
\*======================================================================*/
    
    function _httprequest($url,$fp,$URI,$http_method,$content_type="",$body="")
    {
        $cookie_headers = '';
        if($this->passcookies && $this->_redirectaddr)
            $this->setcookies();
            
        $URI_PARTS = parse_url($URI);
        if(empty($url))
            $url = "/";
        $headers = $http_method." ".$url." ".$this->_httpversion."\r\n";        
        if(!empty($this->agent))
            $headers .= "User-Agent: ".$this->agent."\r\n";
        if(!empty($this->host) && !isset($this->rawheaders['Host'])) {
            $headers .= "Host: ".$this->host;
            if(!empty($this->port))
                $headers .= ":".$this->port;
            $headers .= "\r\n";
        }
        if(!empty($this->accept))
            $headers .= "Accept: ".$this->accept."\r\n";
        if(!empty($this->referer))
            $headers .= "Referer: ".$this->referer."\r\n";
        if(!empty($this->cookies))
        {            
            if(!is_array($this->cookies))
                $this->cookies = (array)$this->cookies;
    
            reset($this->cookies);
            if ( count($this->cookies) > 0 ) {
                $cookie_headers .= 'Cookie: ';
                foreach ( $this->cookies as $cookieKey => $cookieVal ) {
                $cookie_headers .= $cookieKey."=".urlencode($cookieVal)."; ";
                }
                $headers .= substr($cookie_headers,0,-2) . "\r\n";
            } 
        }
        if(!empty($this->rawheaders))
        {
            if(!is_array($this->rawheaders))
                $this->rawheaders = (array)$this->rawheaders;
            while(list($headerKey,$headerVal) = each($this->rawheaders))
                $headers .= $headerKey.": ".$headerVal."\r\n";
        }
        if(!empty($content_type)) {
            $headers .= "Content-type: $content_type";
            if ($content_type == "multipart/form-data")
                $headers .= "; boundary=".$this->_mime_boundary;
            $headers .= "\r\n";
        }
        if(!empty($body))    
            $headers .= "Content-length: ".strlen($body)."\r\n";
        if(!empty($this->user) || !empty($this->pass))    
            $headers .= "Authorization: Basic ".base64_encode($this->user.":".$this->pass)."\r\n";
        
        //add proxy auth headers
        if(!empty($this->proxy_user))    
            $headers .= 'Proxy-Authorization: ' . 'Basic ' . base64_encode($this->proxy_user . ':' . $this->proxy_pass)."\r\n";


        $headers .= "\r\n";
        
        // set the read timeout if needed
        if ($this->read_timeout > 0)
            socket_set_timeout($fp, $this->read_timeout);
        $this->timed_out = false;
        
        fwrite($fp,$headers.$body,strlen($headers.$body));
        
        $this->_redirectaddr = false;
        unset($this->headers);
                        
        while($currentHeader = fgets($fp,$this->_maxlinelen))
        {
            if ($this->read_timeout > 0 && $this->_check_timeout($fp))
            {
                $this->status=-100;
                return false;
            }
                
            if($currentHeader == "\r\n")
                break;
                        
            // if a header begins with Location: or URI:, set the redirect
            if(preg_match("/^(Location:|URI:)/i",$currentHeader))
            {
                // get URL portion of the redirect
                preg_match("/^(Location:|URI:)[ ]+(.*)/i",chop($currentHeader),$matches);
                // look for :// in the Location header to see if hostname is included
                if(!preg_match("|\:\/\/|",$matches[2]))
                {
                    // no host in the path, so prepend
                    $this->_redirectaddr = $URI_PARTS["scheme"]."://".$this->host.":".$this->port;
                    // eliminate double slash
                    if(!preg_match("|^/|",$matches[2]))
                            $this->_redirectaddr .= "/".$matches[2];
                    else
                            $this->_redirectaddr .= $matches[2];
                }
                else
                    $this->_redirectaddr = $matches[2];
            }
        
            if(preg_match("|^HTTP/|",$currentHeader))
            {
                if(preg_match("|^HTTP/[^\s]*\s(.*?)\s|",$currentHeader, $status))
                {
                    $this->status= $status[1];
                }                
                $this->response_code = $currentHeader;
            }
                
            $this->headers[] = $currentHeader;
        }

        $results = '';
        do {
            $_data = fread($fp, $this->maxlength);
            if (strlen($_data) == 0) {
                break;
            }
            $results .= $_data;
        } while(true);

        if ($this->read_timeout > 0 && $this->_check_timeout($fp))
        {
            $this->status=-100;
            return false;
        }
        
        // check if there is a a redirect meta tag
        
        if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]*URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))

        {
            $this->_redirectaddr = $this->_expandlinks($match[1],$URI);    
        }

        // have we hit our frame depth and is there frame src to fetch?
        if(($this->_framedepth < $this->maxframes) && preg_match_all("'<frame\s+.*src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
        {
            $this->results[] = $results;
            for($x=0; $x<count($match[1]); $x++)
                $this->_frameurls[] = $this->_expandlinks($match[1][$x],$URI_PARTS["scheme"]."://".$this->host);
        }
        // have we already fetched framed content?
        elseif(is_array($this->results))
            $this->results[] = $results;
        // no framed content
        else
            $this->results = $results;
        
        return true;
    }

/*======================================================================*\
    Function:    _httpsrequest
    Purpose:    go get the https data from the server using curl
    Input:        $url        the url to fetch
                $URI        the full URI
                $body        body contents to send if any (POST)
    Output:        
\*======================================================================*/
    
    function _httpsrequest($url,$URI,$http_method,$content_type="",$body="")
    {  
        if($this->passcookies && $this->_redirectaddr)
            $this->setcookies();

        $headers = array();        
                    
        $URI_PARTS = parse_url($URI);
        if(empty($url))
            $url = "/";
        // GET ... header not needed for curl
        //$headers[] = $http_method." ".$url." ".$this->_httpversion;        
        if(!empty($this->agent))
            $headers[] = "User-Agent: ".$this->agent;
        if(!empty($this->host))
            if(!empty($this->port))
                $headers[] = "Host: ".$this->host.":".$this->port;
            else
                $headers[] = "Host: ".$this->host;
        if(!empty($this->accept))
            $headers[] = "Accept: ".$this->accept;
        if(!empty($this->referer))
            $headers[] = "Referer: ".$this->referer;
        if(!empty($this->cookies))
        {            
            if(!is_array($this->cookies))
                $this->cookies = (array)$this->cookies;
    
            reset($this->cookies);
            if ( count($this->cookies) > 0 ) {
                $cookie_str = 'Cookie: ';
                foreach ( $this->cookies as $cookieKey => $cookieVal ) {
                $cookie_str .= $cookieKey."=".urlencode($cookieVal)."; ";
                }
                $headers[] = substr($cookie_str,0,-2);
            }
        }
        if(!empty($this->rawheaders))
        {
            if(!is_array($this->rawheaders))
                $this->rawheaders = (array)$this->rawheaders;
            while(list($headerKey,$headerVal) = each($this->rawheaders))
                $headers[] = $headerKey.": ".$headerVal;
        }
        if(!empty($content_type)) {
            if ($content_type == "multipart/form-data")
                $headers[] = "Content-type: $content_type; boundary=".$this->_mime_boundary;
            else
                $headers[] = "Content-type: $content_type";
        }
        if(!empty($body))    
            $headers[] = "Content-length: ".strlen($body);
        if(!empty($this->user) || !empty($this->pass))    
            $headers[] = "Authorization: BASIC ".base64_encode($this->user.":".$this->pass);
            
        for($curr_header = 0; $curr_header < count($headers); $curr_header++) {
            $safer_header = strtr( $headers[$curr_header], "\"", " " );
            $cmdline_params .= " -H \"".$safer_header."\"";
        }
        
        if(!empty($body))
            $cmdline_params .= " -d \"$body\"";
        
        if($this->read_timeout > 0)
            $cmdline_params .= " -m ".$this->read_timeout;
        
        $headerfile = tempnam($temp_dir, "sno");

        exec($this->curl_path." -k -D \"$headerfile\"".$cmdline_params." \"".escapeshellcmd($URI)."\"",$results,$return);
        
        if($return)
        {
            $this->error = "Error: cURL could not retrieve the document, error $return.";
            return false;
        }
            
            
        $results = implode("\r\n",$results);
        
        $result_headers = file("$headerfile");
                        
        $this->_redirectaddr = false;
        unset($this->headers);
                        
        for($currentHeader = 0; $currentHeader < count($result_headers); $currentHeader++)
        {
            
            // if a header begins with Location: or URI:, set the redirect
            if(preg_match("/^(Location: |URI: )/i",$result_headers[$currentHeader]))
            {
                // get URL portion of the redirect
                preg_match("/^(Location: |URI:)\s+(.*)/",chop($result_headers[$currentHeader]),$matches);
                // look for :// in the Location header to see if hostname is included
                if(!preg_match("|\:\/\/|",$matches[2]))
                {
                    // no host in the path, so prepend
                    $this->_redirectaddr = $URI_PARTS["scheme"]."://".$this->host.":".$this->port;
                    // eliminate double slash
                    if(!preg_match("|^/|",$matches[2]))
                            $this->_redirectaddr .= "/".$matches[2];
                    else
                            $this->_redirectaddr .= $matches[2];
                }
                else
                    $this->_redirectaddr = $matches[2];
            }
        
            if(preg_match("|^HTTP/|",$result_headers[$currentHeader]))
                $this->response_code = $result_headers[$currentHeader];

            $this->headers[] = $result_headers[$currentHeader];
        }

        // check if there is a a redirect meta tag
        
        if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]*URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
        {
            $this->_redirectaddr = $this->_expandlinks($match[1],$URI);    
        }

        // have we hit our frame depth and is there frame src to fetch?
        if(($this->_framedepth < $this->maxframes) && preg_match_all("'<frame\s+.*src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
        {
            $this->results[] = $results;
            for($x=0; $x<count($match[1]); $x++)
                $this->_frameurls[] = $this->_expandlinks($match[1][$x],$URI_PARTS["scheme"]."://".$this->host);
        }
        // have we already fetched framed content?
        elseif(is_array($this->results))
            $this->results[] = $results;
        // no framed content
        else
            $this->results = $results;

        unlink("$headerfile");
        
        return true;
    }

/*======================================================================*\
    Function:    setcookies()
    Purpose:    set cookies for a redirection
\*======================================================================*/
    
    function setcookies()
    {
        for($x=0; $x<count($this->headers); $x++)
        {
        if(preg_match('/^set-cookie:[\s]+([^=]+)=([^;]+)/i', $this->headers[$x],$match))
            $this->cookies[$match[1]] = urldecode($match[2]);
        }
    }

    
/*======================================================================*\
    Function:    _check_timeout
    Purpose:    checks whether timeout has occurred
    Input:        $fp    file pointer
\*======================================================================*/

    function _check_timeout($fp)
    {
        if ($this->read_timeout > 0) {
            $fp_status = socket_get_status($fp);
            if ($fp_status["timed_out"]) {
                $this->timed_out = true;
                return true;
            }
        }
        return false;
    }

/*======================================================================*\
    Function:    _connect
    Purpose:    make a socket connection
    Input:        $fp    file pointer
\*======================================================================*/
    
    function _connect(&$fp)
    {
        if(!empty($this->proxy_host) && !empty($this->proxy_port))
            {
                $this->_isproxy = true;
                
                $host = $this->proxy_host;
                $port = $this->proxy_port;
            }
        else
        {
            $host = $this->host;
            $port = $this->port;
        }
    
        $this->status = 0;
        
        if($fp = fsockopen(
                    $host,
                    $port,
                    $errno,
                    $errstr,
                    $this->_fp_timeout
                    ))
        {
            // socket connection succeeded

            return true;
        }
        else
        {
            // socket connection failed
            $this->status = $errno;
            switch($errno)
            {
                case -3:
                    $this->error="socket creation failed (-3)";
                case -4:
                    $this->error="dns lookup failure (-4)";
                case -5:
                    $this->error="connection refused or timed out (-5)";
                default:
                    $this->error="connection failed (".$errno.")";
            }
            return false;
        }
    }
/*======================================================================*\
    Function:    _disconnect
    Purpose:    disconnect a socket connection
    Input:        $fp    file pointer
\*======================================================================*/
    
    function _disconnect($fp)
    {
        return(fclose($fp));
    }

    
/*======================================================================*\
    Function:    _prepare_post_body
    Purpose:    Prepare post body according to encoding type
    Input:        $formvars  - form variables
                $formfiles - form upload files
    Output:        post body
\*======================================================================*/
    
    function _prepare_post_body($formvars, $formfiles)
    {
        settype($formvars, "array");
        settype($formfiles, "array");
        $postdata = '';

        if (count($formvars) == 0 && count($formfiles) == 0)
            return;
        
        switch ($this->_submit_type) {
            case "application/x-www-form-urlencoded":
                reset($formvars);
                while(list($key,$val) = each($formvars)) {
                    if (is_array($val) || is_object($val)) {
                        while (list($cur_key, $cur_val) = each($val)) {
                            $postdata .= urlencode($key)."[]=".urlencode($cur_val)."&";
                        }
                    } else
                        $postdata .= urlencode($key)."=".urlencode($val)."&";
                }
                break;

            case "multipart/form-data":
                $this->_mime_boundary = "Snoopy".md5(uniqid(microtime()));
                
                reset($formvars);
                while(list($key,$val) = each($formvars)) {
                    if (is_array($val) || is_object($val)) {
                        while (list($cur_key, $cur_val) = each($val)) {
                            $postdata .= "--".$this->_mime_boundary."\r\n";
                            $postdata .= "Content-Disposition: form-data; name=\"$key\[\]\"\r\n\r\n";
                            $postdata .= "$cur_val\r\n";
                        }
                    } else {
                        $postdata .= "--".$this->_mime_boundary."\r\n";
                        $postdata .= "Content-Disposition: form-data; name=\"$key\"\r\n\r\n";
                        $postdata .= "$val\r\n";
                    }
                }
                
                reset($formfiles);
                while (list($field_name, $file_names) = each($formfiles)) {
                    settype($file_names, "array");
                    while (list(, $file_name) = each($file_names)) {
                        if (!is_readable($file_name)) continue;

                        $fp = fopen($file_name, "r");
                        $file_content = fread($fp, filesize($file_name));
                        fclose($fp);
                        $base_name = basename($file_name);

                        $postdata .= "--".$this->_mime_boundary."\r\n";
                        $postdata .= "Content-Disposition: form-data; name=\"$field_name\"; filename=\"$base_name\"\r\n\r\n";
                        $postdata .= "$file_content\r\n";
                    }
                }
                $postdata .= "--".$this->_mime_boundary."--\r\n";
                break;
        }

        return $postdata;
    }
}

?>

dedehttpdown.class.php

<?php
/**
 * 织梦HTTP下载类
 *
 * @version        $Id: dedehttpdown.class.php 1 11:42 2010年7月6日Z tianya $
 * @package        DedeCMS.Libraries
 * @copyright      Copyright (c) 2007 - 2010, DesDev, Inc.
 * @license        http://help.dedecms.com/usersguide/license.html
 * @link           http://www.dedecms.com
 */
@set_time_limit(0);

class DedeHttpDown
{
    var $m_url = '';
    var $m_urlpath = '';
    var $m_scheme = 'http';
    var $m_host = '';
    var $m_port = '80';
    var $m_user = '';
    var $m_pass = '';
    var $m_path = '/';
    var $m_query = '';
    var $m_fp = '';
    var $m_error = '';
    var $m_httphead = '';
    var $m_html = '';
    var $m_puthead = '';
    var $BaseUrlPath = '';
    var $HomeUrl = '';
    var $reTry = 0;
    var $JumpCount = 0;

    /**
     *  初始化系统
     *
     * @access    public
     * @param     string    $url   需要下载的地址
     * @return    string
     */
    function PrivateInit($url)
    {
        if($url=='') {
            return ;
        }
        $urls = '';
        $urls = @parse_url($url);
        $this->m_url = $url;
        if(is_array($urls))
        {
            $this->m_host = $urls["host"];
            if(!empty($urls["scheme"]))
            {
                $this->m_scheme = $urls["scheme"];
            }
            if(!empty($urls["user"]))
            {
                $this->m_user = $urls["user"];
            }
            if(!empty($urls["pass"]))
            {
                $this->m_pass = $urls["pass"];
            }
            if(!empty($urls["port"]))
            {
                $this->m_port = $urls["port"];
            }
            if(!empty($urls["path"]))
            {
                $this->m_path = $urls["path"];
            }
            $this->m_urlpath = $this->m_path;
            if(!empty($urls["query"]))
            {
                $this->m_query = $urls["query"];
                $this->m_urlpath .= "?".$this->m_query;
            }
            $this->HomeUrl = $urls["host"];
            $this->BaseUrlPath = $this->HomeUrl.$urls["path"];
            $this->BaseUrlPath = preg_replace("/\/([^\/]*)\.(.*)$/","/",$this->BaseUrlPath);
            $this->BaseUrlPath = preg_replace("/\/$/","",$this->BaseUrlPath);
        }
    }

    /**
     *  重设各参数
     *
     * @access    public
     * @return    void
     */
    function ResetAny()
    {
        $this->m_url = "";
        $this->m_urlpath = "";
        $this->m_scheme = "http";
        $this->m_host = "";
        $this->m_port = "80";
        $this->m_user = "";
        $this->m_pass = "";
        $this->m_path = "/";
        $this->m_query = "";
        $this->m_error = "";
    }

    /**
     *  打开指定网址
     *
     * @access    public
     * @param     string    $url   地址
     * @param     string    $requestType   请求类型
     * @return    string
     */
    function OpenUrl($url,$requestType="GET")
    {
        $this->ResetAny();
        $this->JumpCount = 0;
        $this->m_httphead = Array() ;
        $this->m_html = '';
        $this->reTry = 0;
        $this->Close();

        //初始化系统
        $this->PrivateInit($url);
        $this->PrivateStartSession($requestType);
    }

    /**
     *  转到303重定向网址
     *
     * @access    public
     * @param     string   $url   地址
     * @return    string
     */
    function JumpOpenUrl($url)
    {
        $this->ResetAny();
        $this->JumpCount++;
        $this->m_httphead = Array() ;
        $this->m_html = "";
        $this->Close();

        //初始化系统
        $this->PrivateInit($url);
        $this->PrivateStartSession('GET');
    }

    /**
     *  获得某操作错误的原因
     *
     * @access    public
     * @return    void
     */
    function printError()
    {
        echo "错误信息:".$this->m_error;
        echo "<br/>具体返回头:<br/>";
        foreach($this->m_httphead as $k=>$v){ echo "$k => $v <br/>\r\n"; }
    }

    /**
     *  判别用Get方法发送的头的应答结果是否正确
     *
     * @access    public
     * @return    bool
     */
    function IsGetOK()
    {
        if( preg_match("/^2/",$this->GetHead("http-state")) )
        {
            return TRUE;
        }
        else
        {
            $this->m_error .= $this->GetHead("http-state")." - ".$this->GetHead("http-describe")."<br/>";
            return FALSE;
        }
    }

    /**
     *  看看返回的网页是否是text类型
     *
     * @access    public
     * @return    bool
     */
    function IsText()
    {
        if( preg_match("/^2/",$this->GetHead("http-state")) && preg_match("/text|xml/i",$this->GetHead("content-type")) )
        {
            return TRUE;
        }
        else
        {
            $this->m_error .= "内容为非文本类型或网址重定向<br/>";
            return FALSE;
        }
    }

    /**
     *  判断返回的网页是否是特定的类型
     *
     * @access    public
     * @param     string   $ctype   内容类型
     * @return    string
     */
    function IsContentType($ctype)
    {
        if(preg_match("/^2/",$this->GetHead("http-state"))
        && $this->GetHead("content-type")==strtolower($ctype))
        {    return TRUE; }
        else
        {
            $this->m_error .= "类型不对 ".$this->GetHead("content-type")."<br/>";
            return FALSE;
        }
    }

    /**
     *  用Http协议下载文件
     *
     * @access    public
     * @param     string    $savefilename  保存文件名称
     * @return    string
     */
    function SaveToBin($savefilename)
    {
        if(!$this->IsGetOK())
        {
            return FALSE;
        }
        if(@feof($this->m_fp))
        {
            $this->m_error = "连接已经关闭!"; return FALSE;
        }
        $fp = fopen($savefilename,"w");
        while(!feof($this->m_fp))
        {
            fwrite($fp, fread($this->m_fp, 1024));
        }
        fclose($this->m_fp);
        fclose($fp);
        return TRUE;
    }

    /**
     *  保存网页内容为Text文件
     *
     * @access    public
     * @param     string    $savefilename  保存文件名称
     * @return    string
     */
    function SaveToText($savefilename)
    {
        if($this->IsText())
        {
            $this->SaveBinFile($savefilename);
        }
        else
        {
            return "";
        }
    }

    /**
     *  用Http协议获得一个网页的内容
     *
     * @access    public
     * @return    string
     */
    function GetHtml()
    {
        if(!$this->IsText())
        {
            return '';
        }
        if($this->m_html!='')
        {
            return $this->m_html;
        }
        if(!$this->m_fp||@feof($this->m_fp))
        {
            return '';
        }
        while(!feof($this->m_fp))
        {
            $this->m_html .= fgets($this->m_fp,256);
        }
        @fclose($this->m_fp);
        return $this->m_html;
    }

    /**
     *  开始HTTP会话
     *
     * @access    public
     * @param     string    $requestType    请求类型
     * @return    string
     */
    function PrivateStartSession($requestType="GET")
    {
        if(!$this->PrivateOpenHost())
        {
            $this->m_error .= "打开远程主机出错!";
            return FALSE;
        }
        $this->reTry++;
        if($this->GetHead("http-edition")=="HTTP/1.1")
        {
            $httpv = "HTTP/1.1";
        }
        else
        {
            $httpv = "HTTP/1.0";
        }
        $ps = explode('?',$this->m_urlpath);

        $headString = '';

        //发送固定的起始请求头GET、Host信息
        if($requestType=="GET")
        {
            $headString .= "GET ".$this->m_urlpath." $httpv\r\n";
        }
        else
        {
            $headString .= "POST ".$ps[0]." $httpv\r\n";
        }
        $this->m_puthead["Host"] = $this->m_host;

        //发送用户自定义的请求头
        if(!isset($this->m_puthead["Accept"]))
        {
            $this->m_puthead["Accept"] = "*/*";
        }
        if(!isset($this->m_puthead["User-Agent"]))
        {
            $this->m_puthead["User-Agent"] = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2)";
        }
        if(!isset($this->m_puthead["Refer"]))
        {
            $this->m_puthead["Refer"] = "http://".$this->m_puthead["Host"];
        }

        foreach($this->m_puthead as $k=>$v)
        {
            $k = trim($k);
            $v = trim($v);
            if($k!=""&&$v!="")
            {
                $headString .= "$k: $v\r\n";
            }
        }
        fputs($this->m_fp, $headString);
        if($requestType=="POST")
        {
            $postdata = "";
            if(count($ps)>1)
            {
                for($i=1;$i<count($ps);$i++)
                {
                    $postdata .= $ps[$i];
                }
            }
            else
            {
                $postdata = "OK";
            }
            $plen = strlen($postdata);
            fputs($this->m_fp,"Content-Type: application/x-www-form-urlencoded\r\n");
            fputs($this->m_fp,"Content-Length: $plen\r\n");
        }

        //发送固定的结束请求头
        //HTTP1.1协议必须指定文档结束后关闭链接,否则读取文档时无法使用feof判断结束
        if($httpv=="HTTP/1.1")
        {
            fputs($this->m_fp,"Connection: Close\r\n\r\n");
        }
        else
        {
            fputs($this->m_fp,"\r\n");
        }
        if($requestType=="POST")
        {
            fputs($this->m_fp,$postdata);
        }

        //获取应答头状态信息
        $httpstas = explode(" ",fgets($this->m_fp,256));
        $this->m_httphead["http-edition"] = trim($httpstas[0]);
        $this->m_httphead["http-state"] = trim($httpstas[1]);
        $this->m_httphead["http-describe"] = "";
        for($i=2;$i<count($httpstas);$i++)
        {
            $this->m_httphead["http-describe"] .= " ".trim($httpstas[$i]);
        }

        //获取详细应答头
        while(!feof($this->m_fp))
        {
            $line = trim(fgets($this->m_fp,256));
            if($line == "")
            {
                break;
            }
            $hkey = "";
            $hvalue = "";
            $v = 0;
            for($i=0;$i<strlen($line);$i++)
            {
                if($v==1)
                {
                    $hvalue .= $line[$i];
                }
                if($line[$i]==":")
                {
                    $v = 1;
                }
                if($v==0)
                {
                    $hkey .= $line[$i];
                }
            }
            $hkey = trim($hkey);
            if($hkey!="")
            {
                $this->m_httphead[strtolower($hkey)] = trim($hvalue);
            }
        }

        //如果连接被不正常关闭,重试
        if(feof($this->m_fp))
        {
            if($this->reTry > 10)
            {
                return FALSE;
            }
            $this->PrivateStartSession($requestType);
        }

        //判断是否是3xx开头的应答
        if(preg_match("/^3/",$this->m_httphead["http-state"]))
        {
            if($this->JumpCount > 3)
            {
                return;
            }
            if(isset($this->m_httphead["location"]))
            {
                $newurl = $this->m_httphead["location"];
                if(preg_match("/^http/i",$newurl))
                {
                    $this->JumpOpenUrl($newurl);
                }
                else
                {
                    $newurl = $this->FillUrl($newurl);
                    $this->JumpOpenUrl($newurl);
                }
            }
            else
            {
                $this->m_error = "无法识别的答复!";
            }
        }
    }

    /**
     *  获得一个Http头的值
     *
     * @access    public
     * @param     string    $headname   头文件名称
     * @return    string
     */
    function GetHead($headname)
    {
        $headname = strtolower($headname);
        return isset($this->m_httphead[$headname]) ? $this->m_httphead[$headname] : '';
    }

    /**
     *  设置Http头的值
     *
     * @access    public
     * @param     string   $skey  键
     * @param     string   $svalue  值
     * @return    string
     */
    function SetHead($skey,$svalue)
    {
        $this->m_puthead[$skey] = $svalue;
    }

    /**
     *  打开连接
     *
     * @access    public
     * @return    bool
     */
    function PrivateOpenHost()
    {
        if($this->m_host=="")
        {
            return FALSE;
        }
        $errno = "";
        $errstr = "";
        $this->m_fp = @fsockopen($this->m_host, $this->m_port, $errno, $errstr,10);
        if(!$this->m_fp)
        {
            $this->m_error = $errstr;
            return FALSE;
        }
        else
        {
            return TRUE;
        }
    }

    /**
     *  关闭连接
     *
     * @access    public
     * @return    void
     */
    function Close()
    {
        @fclose($this->m_fp);
    }

    /**
     *  补全相对网址
     *
     * @access    public
     * @param     string   $surl  需要不全的地址
     * @return    string
     */
    function FillUrl($surl)
    {
        $i = 0;
        $dstr = "";
        $pstr = "";
        $okurl = "";
        $pathStep = 0;
        $surl = trim($surl);
        if($surl=="")
        {
            return "";
        }
        $pos = strpos($surl,"#");
        if($pos>0)
        {
            $surl = substr($surl,0,$pos);
        }
        if($surl[0]=="/")
        {
            $okurl = "http://".$this->HomeUrl.$surl;
        }
        else if($surl[0]==".")
        {
            if(strlen($surl)<=1)
            {
                return "";
            }
            else if($surl[1]=="/")
            {
                $okurl = "http://".$this->BaseUrlPath."/".substr($surl,2,strlen($surl)-2);
            }
            else
            {
                $urls = explode("/",$surl);
                foreach($urls as $u)
                {
                    if($u=="..")
                    {
                        $pathStep++;
                    }
                    else if($i<count($urls)-1)
                    {
                        $dstr .= $urls[$i]."/";
                    }
                    else
                    {
                        $dstr .= $urls[$i];
                    }
                    $i++;
                }
                $urls = explode("/",$this->BaseUrlPath);
                if(count($urls) <= $pathStep)
                {
                    return "";
                }
                else
                {
                    $pstr = "http://";
                    for($i=0;$i<count($urls)-$pathStep;$i++)
                    {
                        $pstr .= $urls[$i]."/";
                    }
                    $okurl = $pstr.$dstr;
                }
            }
        }
        else
        {
            if(strlen($surl)<7)
            {
                $okurl = "http://".$this->BaseUrlPath."/".$surl;
            }
            else if(strtolower(substr($surl,0,7))=="http://")
            {
                $okurl = $surl;
            }
            else
            {
                $okurl = "http://".$this->BaseUrlPath."/".$surl;
            }
        }
        $okurl = preg_replace("/^(http:\/\/)/i","",$okurl);
        $okurl = preg_replace("/\/{1,}/", "/", $okurl);
        return "http://".$okurl;
    }
}//End Class

 

你可能感兴趣的:(网页抓取)