php使用QueryList爬取数据并保存到mysql数据库

QueryList4.0爬取数据

1:准备工作
QueryList 4.0 基于 Composer 安装,因此首先要确保 Composer 已安装成功,然后再安装 QueryList 4.0。此外,还需要掌握 QueryList 选择器的用法。

2:爬取数据地址:腾讯2019年校招
php使用QueryList爬取数据并保存到mysql数据库_第1张图片
3:思路
根据链接1,获取到左导航栏2里的链接;
遍历2中获取的url,获取选择栏3里的url链接;
遍历3中获取的url,得到下拉框4中的url;
根据下拉框的 url 得到“岗位描述”等数据。


AnalogLanding.php爬取数据文件


namespace Controllers;
require 'vendor/autoload.php';
require 'controllers/MysqlClass.php';//引入存储数据文件

use Jaeger\GHttp;
use QL\QueryList;
class AnalogLanding
{

    /** Host name handed to MysqlClass::getInsert() for the database connection. */
    public $link="localhost";

    /**
     * Fetch a page and collect the absolute URLs of the links matched by $rules.
     *
     * @param  string $url   Page to download.
     * @param  array  $rules QueryList rules; must define a 'url_href' field holding
     *                       the href attribute of the wanted <a> elements.
     * @return array         List of URLs, each prefixed with https://join.qq.com/.
     */
    public function getUrl($url,$rules)
    {
        $url_array = [];

        $data = QueryList::get($url)->rules($rules)->query()->getData();

        foreach ($data->all() as $value) {
            // A row without an href would only yield the bare site root; skip it.
            if (!isset($value['url_href']) || $value['url_href'] === '') {
                continue;
            }
            // The scraped hrefs are relative, so prepend the site root.
            $url_array[] = "https://join.qq.com/".$value['url_href'];
        }
        return $url_array;
    }

    /**
     * Crawl the job pages reachable from $totle_url and return one row per job page.
     *
     * The crawl walks three levels of links: the left navigation bar, the tab bar
     * on each of those pages, and the drop-down list on each tab page. A tab page
     * without a drop-down is itself treated as a job page.
     *
     * @param  string $totle_url Entry page of the crawl.
     * @return array             List of rows with the keys title, city, details,
     *                           jobtype, url, experience, salary, education,
     *                           publictime, company and neednum. Empty when nothing
     *                           could be scraped.
     */
    public function getData($totle_url)
    {
        // Links in the left navigation bar.
        $left_rules = [
            'url_href' => [".left-nav ul:eq(0) [href!='#']",'href'],
        ];
        // Links in the tab bar at the top of the content area.
        $top_rules = [
            'url_href' => [".technology-content ul.tab [href!='#']",'href'],
        ];
        // Links in the drop-down list.
        $min_rules = [
            'url_href' => [".technology-content .technology-con [href!='#']",'href'],
        ];
        // Fields scraped from each job page. The remaining columns (experience,
        // salary, education, publictime, company, neednum) are not scraped and
        // receive fixed defaults further down.
        $rules = [
                'title' => ['.page-content .list-section .select span.selected','text'],
                'city' => ['.technology-content .item .contxt:eq(2)','text'],
                'details' => ['.technology-content .item .contxt:eq(0)','text'],
                'jobtype' => ['.technology-content ul.tab li.active','text'],
            ];

        // Collect every job-page URL into one flat list. Each URL is appended, so
        // all drop-down entries of a tab are kept, not only the last one.
        $url_all = [];
        foreach ($this->getUrl($totle_url,$left_rules) as $left_url) {
            foreach ($this->getUrl($left_url,$top_rules) as $top_url) {
                $min_urls = $this->getUrl($top_url,$min_rules);
                if (empty($min_urls)) {
                    // No drop-down on this tab: the tab page itself is the job page.
                    $url_all[] = $top_url;
                    continue;
                }
                foreach ($min_urls as $min_url) {
                    $url_all[] = $min_url;
                }
            }
        }

        $res_data = [];
        foreach ($url_all as $job_url) {
            $rows = QueryList::get($job_url)->rules($rules)->query()->getData()->all();
            // Skip pages on which the rules matched nothing.
            if (!isset($rows[0]) || !is_array($rows[0])) {
                continue;
            }

            $row = $rows[0];
            $row['url'] = $job_url;
            // Fixed defaults for the columns the page does not provide.
            $row['experience'] = '不限';
            $row['salary'] = '面议';
            $row['education'] = '不限';
            $row['publictime'] = '暂无';
            $row['company'] = '腾讯';
            $row['neednum'] = 0;
            $res_data[] = $row;
        }

        return $res_data;
    }


    /**
     * Crawl the job pages and store all rows in the jobtable table with one
     * multi-row INSERT.
     *
     * @return string Status message from MysqlClass::getInsert(), or "插入失败"
     *                when the crawl produced no rows.
     */
    public function getInsertData()
    {
        $totle_url = "https://join.qq.com/post.php?tid=2&pid=1";
        $data = $this->getData($totle_url);
        if (empty($data)) {
            // An INSERT without any VALUES tuple is invalid SQL, so report the
            // failure directly instead of sending a malformed statement.
            return "插入失败";
        }

        // Column order of the INSERT; every row is serialised in this order.
        $columns = [
            'title', 'city', 'details', 'neednum', 'jobtype', 'experience',
            'salary', 'education', 'publictime', 'company', 'url',
        ];

        $tuples = [];
        foreach ($data as $value) {
            $quoted = [];
            foreach ($columns as $column) {
                $field = isset($value[$column]) ? $value[$column] : '';
                // Scraped text may contain quotes or backslashes, so escape it
                // before embedding it in the statement.
                // NOTE(review): addslashes() is only adequate for single-byte-safe
                // connection charsets such as utf8; prepared statements would be
                // the robust fix but need a different MysqlClass interface.
                $quoted[] = "'".addslashes((string) $field)."'";
            }
            $tuples[] = '('.implode(',', $quoted).')';
        }

        $sql = "INSERT INTO jobtable (`".implode('`, `', $columns)."`) VALUES "
            .implode(',', $tuples);

        $MysqlClass = new MysqlClass();
        return $MysqlClass->getInsert($sql,$this->link);
    }

}
    // Script entry point: run the crawl, store the rows and print the status message.
    $crawler = new AnalogLanding();
    print_r($crawler->getInsertData());

MysqlClass.php数据库操作文件


namespace Controllers;
/**
* 数据库存储数据
*/
class MysqlClass
{
    /**
     * Connection opened by getInsert(), remembered so getClose() can release
     * that same connection. Null while no connection is open.
     */
    private $connection = null;

    /**
     * Open a connection to the "search" database with the built-in credentials.
     *
     * @param  string $link Database host name.
     * @return mixed        Whatever mysqli_connect() returns: a connection on
     *                      success, false on failure when mysqli is not in
     *                      exception mode.
     */
    public function getConnect($link)
    {
        // NOTE(review): user, password and database name are hard-coded here.
        $res = mysqli_connect($link, "root", "root", "search");
        return $res;
    }

    /**
     * Execute an INSERT statement and report the outcome as a message.
     *
     * Failures are reported through the returned message both when mysqli
     * returns false and when it throws mysqli_sql_exception (the default
     * report mode since PHP 8.1). The connection is closed before returning.
     *
     * @param  string $sql  Statement to execute.
     * @param  string $link Database host name.
     * @return string       "新记录插入成功" on success, "插入失败" when the query
     *                      fails, or a "连接 MySQL 失败: ..." message when the
     *                      connection cannot be established.
     */
    public function getInsert($sql,$link)
    {
        try {
            $res = $this->getConnect($link);
        } catch (\mysqli_sql_exception $e) {
            return "连接 MySQL 失败: " . $e->getMessage();
        }
        if (!$res) {
            // mysqli_connect_errno()/mysqli_connect_error() take no arguments.
            return "连接 MySQL 失败: " . mysqli_connect_error();
        }

        $this->connection = $res;
        try {
            // Set the connection character set before sending any data.
            mysqli_set_charset($res,"utf8");
            $obj = mysqli_query($res,$sql);
        } catch (\mysqli_sql_exception $e) {
            $obj = false;
        } finally {
            // Release the connection on every path.
            $this->getClose();
        }

        if ($obj===true) {
            return "新记录插入成功";
        }
        return "插入失败";
    }

    /**
     * Close the connection opened by getInsert(), if one is still open.
     * Calling it without an open connection does nothing.
     */
    public function getClose()
    {
        if ($this->connection) {
            mysqli_close($this->connection);
            $this->connection = null;
        }
    }


}
//  $new = new MysqlClass();
//  $mysql = $new->getInsert("INSERT INTO menu 
// (`id`,`name`, `parent`, `route`, `order`)
// VALUES
//  (87,'sss', null, 'ji','77')");
//     print_r($mysql);exit;
?>

你可能感兴趣的:(php)