N phpspider爬虫获取网站内容demo

demo地址 https://pan.baidu.com/s/1ZOCK3NMQTyGpuR6ewV_7lw

N phpspider爬虫获取网站内容demo_第1张图片
N phpspider爬虫获取网站内容demo_第2张图片
更多 XPath 语法详见 W3C。

爬虫配置代码(test.php):

// GitHub下载方式
require_once __DIR__ . '\autoloader.php';
use phpspider\core\phpspider;

/* Do NOT delete this comment */
/* 不要删除这段注释 */

$configs = array(
    'name' => '糗事百科',
    'log_show' => true,
    'tasknum' => 1,
    //'save_running_state' => true,

    'domains' => array(
        'sohu.com',
        'www.sohu.com'
        //'qiushibaike.com',
        //'www.qiushibaike.com'
    ),
    'scan_urls' => array(
        'http://www.sohu.com/'
        //'https://www.qiushibaike.com/'
    ),
    'list_url_regexes' => array(
        "http://www.sohu.com/tag/\d+"
        //"https://www.qiushibaike.com/8hr/page/\d+\?s=\d+"
    ),
    'content_url_regexes' => array(
        "http://www.sohu.com/a/\d+",
        //"https://www.qiushibaike.com/article/\d+",
    ),
    'max_try' => 5,
    //'proxies' => array(
        //'http://H784U84R444YABQD:[email protected]:9010'
    //),
    'export' => array(
          'type' => 'db', 
          'table' => 'spider',
    ),
    //'export' => array(
    //    'type' => 'csv',
    //    'file' => '../data/qiushibaike.csv',
    //
    //'export' => array(
        //'type'  => 'sql',
        //'file'  => '../data/qiushibaike.sql',
        //'table' => 'content',
    //),
//     'export' => array(
//         'type' => 'db', 
//         'table' => 'content',
//     ),
    'db_config' => array(
        'host'  => '127.0.0.1',
        'port'  => 3306,
        'user'  => 'root',
        'pass'  => 'Wanda2013',
        'name'  => 'spider',
    ),
    //'queue_config' => array(
        //'host'      => '127.0.0.1',
        //'port'      => 6379,
        //'pass'      => '',
        //'db'        => 5,
        //'prefix'    => 'phpspider',
        //'timeout'   => 30,
    //),


    'fields' => array(
        array(
            'name' => "article_title",
            'selector' => "//div[contains(@class,'text-title')]//h1",
            //'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]",
            'required' => true,
        ),
        array(
            'name' => "article_author",
            'selector' => "//div[contains(@class,'article-info')]//span[contains(@class,'tag')]//a",
            'required' => true,
        ),
        array(
            'name' => "article_headimg",
            'selector' => "//article[@id='mp-editor']//p//img[1]",
            //'selector' => "//div[contains(@class,'author')]//a[1]",
            'required' => true,
        ),
        array(
            'name' => "article_content",
            'selector' => "//article[@id='mp-editor']",
            //'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]",
            'required' => true,
        ),
        array(
            'name' => "article_publish_time",
            'selector' => "//span[@id='news-time']",
            //'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ),
        /*
        array(
            'name' => "url",
            'selector' => "//div[contains(@class,'author')]//h2",   // 这里随便设置,on_extract_field回调里面会替换
            'required' => true,
        ),
        */
    ),
);

$spider = new phpspider($configs);

$spider->start();



你可能感兴趣的:(PHP代码积累)