PHP 采集数据 金色-快讯

// Display every error/notice while the scraper runs.
ini_set("display_errors", "On");
error_reporting(E_ALL | E_STRICT);
// Project helper classes, resolved relative to this script's own directory.
// NOTE(review): $WEBDB, used further down, is never assigned in this file —
// presumably one of these includes (Db.class.php?) creates it; confirm.
require_once(dirname(__FILE__).'/include/phpQuery.class.php');
require_once(dirname(__FILE__).'/include/Db.class.php');
require_once(dirname(__FILE__).'/include/Image.class.php');
require_once(dirname(__FILE__).'/include/Cachefile.class.php');
require_once(dirname(__FILE__).'/include/Folder.class.php');
// 0 removes PHP's execution time limit so a long crawl is not aborted.
set_time_limit(0);

/**
 * Fetch a URL through a list of HTTP proxies and return the first non-empty response.
 *
 * Proxies are tried in the iteration order of $ipArr; the port for each proxy is
 * looked up in $portArr under the same key. The first attempt whose curl_exec()
 * result is truthy wins and the remaining proxies are not tried.
 *
 * @param string $fetchurl URL to request.
 * @param array  $ipArr    Proxy host addresses.
 * @param array  $portArr  Proxy ports keyed like $ipArr. A missing or empty entry
 *                         means no explicit proxy port is set for that attempt.
 * @return mixed Response body (string) on success; null when $ipArr is not an
 *               array, is empty, or every proxy attempt failed.
 */
function getSogouContent($fetchurl,$ipArr,$portArr = array()){
    // Callers build $ipArr by scraping; it can be undefined/null when that scrape fails.
    if(!is_array($ipArr)){
        return null;
    }
    foreach($ipArr as $key=>$ip){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $fetchurl);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        // Bound the whole transfer as well: with only a connect timeout, a proxy that
        // accepts the connection and then stalls would block the script forever.
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_PROXYAUTH, CURLAUTH_BASIC); // proxy authentication mode
        curl_setopt($ch, CURLOPT_PROXY, $ip); // proxy server address
        // Only set a port when one was supplied for this proxy ($portArr defaults to
        // an empty array, so the entry may not exist).
        if(isset($portArr[$key]) && $portArr[$key] !== ''){
            curl_setopt($ch, CURLOPT_PROXYPORT, (int)$portArr[$key]); // proxy server port
        }
        curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); // plain HTTP proxy
        $result = curl_exec($ch);
        curl_close($ch);
        if($result){
            return $result;
        }
    }
    // Every proxy failed (or the list was empty).
    return null;
}

/**
 * Collect proxy IPs and ports.
 *
 * Loads pages 1..3 of the proxy listing and, for every <tr>, reads cell index 1 as
 * the IP and cell index 2 as the port. Rows whose IP cell is empty are skipped.
 * $ipArr and $portArr are filled in lock-step so the same key addresses one proxy.
 */
// Start from empty lists so both variables are defined even when no row is found;
// otherwise later getSogouContent() calls receive undefined variables.
$ipArr = array();
$portArr = array();
for($i=1;$i<=3;$i++){
    $fetchIpUrl = 'http://www.xicidaili.com/nn/'.$i;
    phpQuery::newDocumentFile($fetchIpUrl);
    foreach(pq("tr") as $iplist) {
        $row = pq($iplist);
        $ip = trim($row->find('td:eq(1)')->text());
        if($ip){
            $ipArr[] = $ip;
            $portArr[] = trim($row->find('td:eq(2)')->text());
        }
    }
}

/**
 * Determine the upper bound of the crawl.
 *
 * Requests the list below id 1000000 and takes the id of the first item in today's
 * group, plus one. When the request fails, the JSON does not decode, or today's
 * group is missing or empty, $max_id ends up as 1 (0 + 1) without raising notices.
 */
$fetchurls = 'http://www.jinse.com/ajax/lives/getList?search=&id='."1000000".'&flag=down';
$data = getSogouContent($fetchurls,$ipArr,$portArr);
// getSogouContent() returns a non-string when every proxy failed.
$decoded = is_string($data) ? json_decode($data,1) : null;
$today = date('Y-m-d');
$dataList = array();
if(is_array($decoded) && isset($decoded['data'][$today]) && is_array($decoded['data'][$today])){
    $dataList = $decoded['data'][$today];
}
$max_id = isset($dataList[0]['id']) ? (int)$dataList[0]['id'] : 0;
$max_id = $max_id + 1 ;

// Directory that receives the log files written by the import loop.
$project = "/home/wwwlogs/jinse/";

// NOTE: this value is overwritten inside the import loop before it is ever read.
$source = 'jinse';

// Resume point: the highest old_source_url already stored for source 'jinse'.
// NOTE(review): if old_source_url is a string column this ORDER BY is lexicographic
// ('99' sorts above '100') — confirm the column type.
$sql="SELECT old_source_url FROM `table`
      WHERE old_source = 'jinse'  ORDER BY old_source_url DESC LIMIT 1  ";
$sofresult = $WEBDB->fetch_first($sql);
$start_id = intval($sofresult['old_source_url']);

/**
 * Read one field from a decoded feed item and escape it for a quoted SQL literal.
 *
 * @param mixed  $row Decoded feed item (expected to be an array).
 * @param string $key Field name to read.
 * @return string addslashes()-escaped value; '' when the key is missing, null or
 *                not a scalar.
 */
function jinseSqlField($row, $key){
    if(!is_array($row) || !isset($row[$key]) || !is_scalar($row[$key])){
        return '';
    }
    return addslashes((string)$row[$key]);
}

/**
 * Import loop.
 *
 * For every id from $start_id to $max_id: fetch the list for that id through the
 * proxy pool, take today's group, and insert each item not yet stored for source
 * 'jinse'. Each insert attempt is appended to a per-minute log file under $project.
 *
 * NOTE(review): $WEBDB is not defined in this file — presumably a DB wrapper from
 * the includes exposing fetch_first(), query() and insert_id(); confirm.
 */
for($i=$start_id;$i<=$max_id;$i++){
    $fetchurl = 'http://www.jinse.com/ajax/lives/getList?search=&id='.$i.'&flag=down';
    $data = getSogouContent($fetchurl,$ipArr,$portArr);
    // A failed fetch yields a non-string; undecodable JSON yields null.
    $dataList = is_string($data) ? json_decode($data,1) : null;
    $today = date('Y-m-d');
    if(!is_array($dataList) || !isset($dataList['data'][$today]) || !is_array($dataList['data'][$today])){
        // Nothing usable for this id; array_reverse() on a non-array would fail.
        continue;
    }
    $dataList = $dataList['data'][$today];
    // Items are processed in the reverse of the order the API returned them.
    $NewDataList = array_reverse($dataList);
    foreach($NewDataList as $list) {
        // The id comes from a remote response: escape it before it enters any SQL.
        $old_source_url = jinseSqlField($list, 'id');
        // Skip items that were already collected.
        $is_sql="SELECT id FROM `table`  WHERE old_source = 'jinse' AND  old_source_url='$old_source_url'";
        $sofresult = $WEBDB->fetch_first($is_sql);
        if(empty($sofresult['id'])) {
            $old_id = $old_source_url;
            $topic_id = jinseSqlField($list, 'topic_id');
            $live_id = jinseSqlField($list, 'live_id');
            $content = jinseSqlField($list, 'content');
            $source = jinseSqlField($list, 'source');
            $source_url = jinseSqlField($list, 'source_url');
            $type = jinseSqlField($list, 'type');
            $website = jinseSqlField($list, 'website');
            $status = jinseSqlField($list, 'status');
            $grade = jinseSqlField($list, 'grade');
            $highlight_color = jinseSqlField($list, 'highlight_color');
            $link = jinseSqlField($list, 'link');
            $link_name = jinseSqlField($list, 'link_name');
            $created_by = jinseSqlField($list, 'created_by');
            $publish_time = (int)jinseSqlField($list, 'publish_time');
            $up_counts = jinseSqlField($list, 'up_counts');
            $down_counts = jinseSqlField($list, 'down_counts');
            $images = jinseSqlField($list, 'images');
            $old_source = 'jinse';
            $created_at = strtotime(jinseSqlField($list, 'created_at'));
            // The feed's own updated_at is ignored: a value 60-600 seconds after
            // created_at is stored instead.
            $updated_at = $created_at+rand(60,600);
            if($content && $old_id) {
                $sql = "INSERT INTO `table`
                      (`id`, `topic_id`, `live_id`, `content`, `source`, `source_url`, `type`, `website`, `status`,
                      `grade`, `highlight_color`, `link_name`, `link`, `created_by`, `created_at`, `updated_at`,
                       `publish_time`, `up_counts`, `down_counts`, `images`, `old_source`, `old_source_url`)
                VALUES (NULL, '$topic_id', '$live_id', '$content', '$source', '$source_url', '$type', '$website',
                '$status', '$grade', '$highlight_color', '$link_name', '$link', '$created_by', '$created_at',
                '$updated_at', '$publish_time', '$up_counts', '$down_counts', '$images', '$old_source', '$old_source_url');";
                $WEBDB->query($sql);
                $content_id = $WEBDB->insert_id();

                // Separate variable so the item content is not clobbered; $sofresult is
                // not a string, so it is serialised rather than concatenated directly.
                $logLine = "|fetchurl|".$fetchurl."|is_sql|".$is_sql."|sofresult|".json_encode($sofresult)
                    ."|sql|".$sql."|content_id|".$content_id;
                if (!is_dir($project)) {
                    // Recursive, so missing parent directories are created too.
                    @mkdir($project, 0777, true);
                }
                file_put_contents($project . "jinse_".date('Y-m-d H:i') . ".log", '[' . date('Y-m-d H:i:s') . ']' .
                    $logLine . "\n"."\n", FILE_APPEND);
                if($content_id){
                    // Progress output. The original referenced undefined $oldurl/$title;
                    // the item id and its content are printed under the same labels.
                    echo '已采集原地址为:'.$old_source_url."----\n"
                        .'软件标题为:'.$list['content']."------\n";
                }
            }
            set_time_limit(0);
        }
    }
}

你可能感兴趣的:(【PHP】采集数据)