利用QueryList采集17173资讯文章

写在开头
1.由于业务需要,需要进行半自动化的游戏资讯采集。17173是国内游戏资讯相对丰富、更新也比较频繁的网站,因此成了采集目标之一。
2.QueryList是一款开源的渐进式PHP采集框架,上手容易(从入门到采集到数据用了大约半小时,不含后期数据清洗)。

使用框架:Lumen+QueryList
安装QueryList Composer包

composer require jaeger/querylist

routes路由添加

$router->get('/gather','Headline\GatherController@get_content');

传参

pageCount(页数)
type(类型)
handle(采集来源,用于选择采集规则:'17173' 或 '9you')
如:/gather?pageCount=1&type=1&handle=17173

返回结果

{
   "msg": "采集成功",
   "code": 200,
   "count": 20
}

Controller代码

post('p
//        ageCount'); //获取多少页内容
        $type   =   $request->post('type'); //
        $handle     =   $request->post('handle');
        $pageCount   =   empty($pageCount) ? 1 : $pageCount;
        $service    =   new RulesService();
        try{
            switch ($handle){
                case '17173':
                    $result     =   $service->rules($pageCount,$type);
                    break;
                case '9you':
                    $result     =   $service->nineGameRules($pageCount,$type);
                    break;
                default:
                    $result     =   ['code'=>0,'message'=>'出错了'];
                    break;
            }
            return $result;
        }catch(\Exception $e){

        }

    }
}

Service业务层代码

 1){
            for ($i=1;$i<=$pageCount;$i++){
                $url = 'http://news.17173.com/data/content/list.json?pageSize=10&pageNo='.$i;
                $getRule[$i]   =   $this->rulesString('17173',$url,$type);
                $totalCount    =   $getRule[$i]['totalCount'];
                $titleData[$i]  =   $getRule[$i]['data'];
            }

            foreach ($titleData as $key =>$val){

                    foreach ($val as $k=>$v){
                        for ($i=0;$inull];
                return $data;
            }
            $result     =   $this->getGameContent('17173',$res);
            $data   =   ['totalCount'=>$totalCount,'titleCount'=>count($result),'data'=>$result];
            return $result;
        }

        $url    =   'http://news.17173.com';
        $getRule    =   $this->rulesString('17173',$url);
        $imgData    =   $getRule['imgData'];
        $titleData  =   $getRule['titleData'];


            foreach ($titleData as $key =>$val){
                foreach ($imgData as $k =>$v){
                    if(empty($v['link']) || empty(strstr($v['link'],'com')))
                        unset($imgData[$k]);
                    if($val['link'] == $v['link']){
                        $res[$key]['title'] =   $val['title'];
                        $res[$key]['img']   =   ltrim($v['img'],'/');
                        $res[$key]['link']   =  $v['link'];
                    }
                }
            }
        $result     =   $this->getGameContent('17173',$res);
            if(empty($res)){
                $data   =   ['data'=>null];
                return $data;
            }
//            print_r($result);
        $return     =   $this->cleanData($result['data'],$url,'17173','17173');
//        $data   =   ['titleCount'=>count($result),'data'=>$result];
        return $return;
    }

    /**
     * Collect the 9game (九游) news list and store the articles it links to.
     *
     * Only the first list page is crawled. Multi-page collection ($type == 2,
     * list URLs of the form http://www.9game.cn/news/0_<page>) is switched off
     * for now, so $page is accepted but not used.
     *
     * @param mixed $page requested number of list pages (currently ignored)
     * @param int   $type 1 = front page, 2 = paged collection; forwarded to the helpers
     * @return mixed whatever cleanData() returns for the collected articles
     */
    public function nineGameRules($page,$type=1)
    {
        $listUrl = 'http://www.9game.cn/news/0_1';

        // Headline rows first, then the article page behind each headline.
        $headlines = $this->rulesString('9you', $listUrl, $type);
        $articles  = $this->getGameContent('9you', $headlines, $type);

        return $this->cleanData($articles, $listUrl, '九游', '9you');
    }

    /**
     * Fetch a list page and extract the headline entries for one source.
     *
     * Return shape by source:
     *  - '17173' with $type == 2: $url is read as a JSON endpoint and the decoded
     *    payload is returned as an associative array (null if it is not valid JSON).
     *  - '17173' otherwise: ['titleData' => ..., 'imgData' => ...]; title rows
     *    carry 'title' and 'link', image rows carry 'img' and 'link'.
     *  - '9you': rows of ['title', 'link', 'dates'], where 'link' is prefixed
     *    with the site root and 'dates' has been passed through strtotime().
     *  - any other name: an empty array.
     *
     * @param string $name source key, '17173' or '9you'
     * @param string $url  page (or JSON endpoint) to fetch
     * @param mixed  $type 2 selects the JSON branch for '17173'; unused for '9you'
     * @return array|null
     */
    public function rulesString($name,$url,$type=1)
    {
        switch ($name) {
            case '17173':
                // Download once; both 17173 branches below read from this response.
                $page = QueryList::get($url);

                // Loose comparison kept so a numeric string "2" still matches.
                if ($type == 2) {
                    // The body already is JSON text, so decode it directly.
                    return json_decode($page->getHtml(), true);
                }

                $titleData = $page
                    ->rules([
                        'title' => ['h2', 'text'],
                        'link'  => ['a', 'href'],
                    ])
                    ->range('.text')
                    ->queryData();

                $imgData = $page
                    ->rules([
                        'img'  => ['img', 'src'],
                        'link' => ['a', 'href'],
                    ])
                    ->range('.pic')
                    ->queryData();

                return ['titleData' => $titleData, 'imgData' => $imgData];

            case '9you':
                $siteRoot = 'http://www.9game.cn';

                $rows = QueryList::get($url)
                    ->rules([
                        'title' => ['h2', 'text'],
                        'link'  => ['a', 'href'],
                        'dates' => ['.time', 'text'],
                    ])
                    ->range('.title')
                    ->queryData();

                $result = [];
                foreach ($rows as $key => $row) {
                    $result[$key] = [
                        'title' => $row['title'],
                        // The extracted href is appended to the site root as-is.
                        'link'  => $siteRoot.$row['link'],
                        // Remove every byte in the 0x80-0xFF range (the non-ASCII
                        // part of the date text) before handing it to strtotime().
                        'dates' => strtotime(preg_replace('/([\x80-\xff]*)/i','',$row['dates'])),
                    ];
                }

                return $result;

            default:
                return [];
        }
    }

    /**
     * Download every article page referenced by $data and extract its fields.
     *
     * Return shape by source:
     *  - '17173': ['count' => int, 'data' => array], with 'data' keyed like $data.
     *  - '9you', $type == 1: extraction results keyed like $data; pages that
     *    yielded nothing are left out.
     *  - '9you', any other $type: $data is read as a list of pages, each a list
     *    of rows; results are appended in order.
     *  - any other name: null.
     *
     * @param string     $name source key, '17173' or '9you'
     * @param array      $data rows carrying a 'link' entry (nested one level deeper
     *                         for the '9you' multi-page case)
     * @param mixed|null $type only consulted for '9you'
     * @return array|null
     */
    public function getGameContent($name,$data,$type = null)
    {
        switch ($name) {
            case '17173':
                $rules = [
                    "title"=>["h1","text"],
                    "dates"=>[".gb-final-date","text"],
                    "content"=>['#mod_article','html']
                ];
                $range = '.gb-final-pn-article';

                $result = [];
                foreach ($data as $key => $val) {
                    $result[$key] = $this->fetchArticleFields($val['link'], $rules, $range);
                }

                return ['count' => count($result), 'data' => $result];

            case '9you':
                $rules = [
                    'title'=>['.text-title h1','text'],
                    'content'=>['.text-con','html'],
                    'dates'=>['.summary','text']
                ];
                $range = '.left-con';

                // Initialised up front so an empty $data yields [] rather than
                // an undefined variable.
                $result = [];

                if ($type == 1) {
                    // Front-page list: one row per article.
                    foreach ($data as $key => $val) {
                        $fields = $this->fetchArticleFields($val['link'], $rules, $range);
                        // Pages where the selectors matched nothing are skipped.
                        if ($fields) {
                            $result[$key] = $fields;
                        }
                    }

                    return $result;
                }

                // Multi-page list: the inner keys restart on every page, so append
                // instead of reusing them (reusing them overwrote earlier pages).
                foreach ($data as $pageRows) {
                    foreach ($pageRows as $row) {
                        $result[] = $this->fetchArticleFields($row['link'], $rules, $range);
                    }
                }

                return $result;

            default:
                return null;
        }
    }

    /**
     * Fetch one article page and return the extracted rows as plain arrays.
     *
     * The JSON round trip turns whatever getData() returns into nested arrays.
     */
    private function fetchArticleFields($link, array $rules, $range)
    {
        $extracted = QueryList::get($link)->rules($rules)->range($range)->query()->getData();

        return json_decode(json_encode($extracted), true);
    }

    /**
     * Normalise scraped articles into table rows, drop the ones already stored
     * and insert the rest into the article table.
     *
     * @param array|null $data   scraped articles; each entry is a list of field sets
     *                           with 'title', 'content' and, for 9you, 'dates'
     * @param string     $url    stored as article_come
     * @param string     $author stored as article_author and mixed into the md5 fingerprint
     * @param string     $name   source key; '9you' takes the create time from the
     *                           scraped date, every other source uses the current time
     * @return array ['msg' => ..., 'code' => 0] when there is nothing new,
     *               ['code' => 200, 'count' => ..., 'msg' => ...] after an insert
     */
    public function cleanData($data,$url,$author,$name)
    {
        $nothingNew = ['msg'=>'暂时没有新的数据','code'=>0];

        // Nothing scraped: answer without touching the database.
        if (empty($data)) {
            return $nothingNew;
        }

        $rows = [];
        foreach ($data as $key => $fieldSets) {
            // As before, a later field set under the same key replaces the earlier one.
            foreach ($fieldSets as $fields) {
                if ($name == '9you') {
                    // Remove the non-ASCII bytes from the date text before parsing it.
                    $createdAt = strtotime(preg_replace('/([\x80-\xff]*)/i', '', $fields['dates']));
                } else {
                    $createdAt = time();
                }

                $rows[$key] = [
                    'article_title'        => $fields['title'],
                    'article_content'      => $fields['content'],
                    'article_create_time'  => $createdAt,
                    'article_author'       => $author,
                    'article_come'         => $url,
                    'article_type'         => 3,
                    'article_upload_video' => 0,
                    'article_tags'         => 0,
                    'status'               => 0,
                    'article_cate_id'      => 0,
                    'article_cover_image'  => !empty($fields['article_cover_image']) ? $fields['article_cover_image'] : 0,
                    // Fingerprint that checkTitle() compares against stored articles.
                    'md5'                  => md5($fields['title'].$author),
                ];
            }
        }

        // Every entry was empty, so there is nothing to stage or insert.
        if (empty($rows)) {
            return $nothingNew;
        }

        // Stages the rows and hands back only those without a stored twin;
        // false means every row was a duplicate.
        $fresh = $this->checkTitle($rows);

        if (!is_array($fresh) || empty($fresh)) {
            DB::table('tab_headline_article_gather')->truncate();
            return $nothingNew;
        }

        // Insert first and clear the staging table afterwards, so a failed
        // insert leaves the staged rows in place.
        TabHeadlineArticle::insert($fresh);
        DB::table('tab_headline_article_gather')->truncate();

        // NOTE(review): count is the number of normalised rows, which can exceed
        // the number actually inserted when duplicates were removed — confirm
        // which figure the API is meant to report.
        return ['code'=>200,'count'=>count($rows),'msg'=>'采集成功'];
    }

    /**
     * Stage freshly scraped rows and return only those not yet in the article table.
     *
     * Steps: write $data to the gather table, read back the gather rows with
     * status 0, read the article rows with status 0, and drop every gather row
     * whose md5 equals the md5 of an article row. This method does not clear
     * the gather table itself.
     *
     * @param array $data rows to stage
     * @return array|false remaining gather rows (original keys preserved),
     *                     or false when none remain
     */
    public function checkTitle($data)
    {
        // Scraped rows go into the gather table first.
        TabHeadlineArticleGather::insert($data);

        $staged = json_decode(DB::table('tab_headline_article_gather')->where('status','=',0)->get(), true);
        $stored = json_decode(DB::table('tab_headline_article')->where('status',0)->get(), true);

        // Index the stored fingerprints so each staged row needs one lookup
        // instead of a scan over every article. Keyed lookup also compares the
        // hashes exactly; a loose == can treat numeric-looking hashes such as
        // "0e1..." and "0e2..." as equal.
        $known = [];
        foreach ($stored ?: [] as $article) {
            if (isset($article['md5'])) {
                $known[$article['md5']] = true;
            }
        }

        $fresh = [];
        foreach ($staged ?: [] as $key => $row) {
            if (isset($row['md5']) && isset($known[$row['md5']])) {
                continue; // already stored
            }
            $fresh[$key] = $row;
        }

        // false tells the caller that nothing is left after removing duplicates.
        return empty($fresh) ? false : $fresh;
    }

}

你可能感兴趣的:(利用QueryList采集17173资讯文章)