demo地址 https://pan.baidu.com/s/1ZOCK3NMQTyGpuR6ewV_7lw
爬虫配置代码 test.php
// GitHub download layout: autoloader.php is expected in the same directory as this script.
// A forward slash is used as the separator because PHP accepts it on Windows,
// Linux and macOS alike, while a backslash is only a separator on Windows.
require_once __DIR__ . '/autoloader.php';
use phpspider\core\phpspider;
// The two marker comments below are kept verbatim because they ask not to be removed;
// their purpose is not visible from this script.
/* Do NOT delete this comment */
/* 不要删除这段注释 */

// Crawler configuration handed to the phpspider constructor below.
// The commented-out entries are the alternative settings for qiushibaike.com.
$configs = array(
    // NOTE(review): the display name still carries the qiushibaike demo title
    // although the active URLs below target sohu.com.
    'name' => '糗事百科',
    'log_show' => true,
    'tasknum' => 1,
    //'save_running_state' => true,

    // Domains associated with the crawl.
    'domains' => array(
        'sohu.com',
        'www.sohu.com'
        //'qiushibaike.com',
        //'www.qiushibaike.com'
    ),

    // Entry URL(s) of the crawl.
    'scan_urls' => array(
        'http://www.sohu.com/'
        //'https://www.qiushibaike.com/'
    ),

    // URL pattern of list (index) pages.
    'list_url_regexes' => array(
        "http://www.sohu.com/tag/\d+"
        //"https://www.qiushibaike.com/8hr/page/\d+\?s=\d+"
    ),

    // URL pattern of content (article) pages.
    'content_url_regexes' => array(
        "http://www.sohu.com/a/\d+",
        //"https://www.qiushibaike.com/article/\d+",
    ),

    'max_try' => 5,

    // Optional proxy list (placeholder credentials; fill in your own).
    //'proxies' => array(
    //    'http://USER:PASSWORD@proxy.example.com:9010'
    //),

    // Active export target: the "spider" table of the database in db_config.
    'export' => array(
        'type' => 'db',
        'table' => 'spider',
    ),
    // Alternative export targets:
    //'export' => array(
    //    'type' => 'csv',
    //    'file' => '../data/qiushibaike.csv',
    //),
    //'export' => array(
    //    'type' => 'sql',
    //    'file' => '../data/qiushibaike.sql',
    //    'table' => 'content',
    //),
    //'export' => array(
    //    'type' => 'db',
    //    'table' => 'content',
    //),

    // NOTE(review): the credentials below are hard-coded; prefer loading them
    // from the environment or an untracked config file before sharing this script.
    'db_config' => array(
        'host' => '127.0.0.1',
        'port' => 3306,
        'user' => 'root',
        'pass' => 'Wanda2013',
        'name' => 'spider',
    ),

    // Optional queue settings (disabled).
    //'queue_config' => array(
    //    'host' => '127.0.0.1',
    //    'port' => 6379,
    //    'pass' => '',
    //    'db' => 5,
    //    'prefix' => 'phpspider',
    //    'timeout' => 30,
    //),

    // Fields to extract; every active field is marked as required.
    // The commented-out selectors are the ones used for qiushibaike.com.
    'fields' => array(
        array(
            'name' => "article_title",
            'selector' => "//div[contains(@class,'text-title')]//h1",
            //'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]",
            'required' => true,
        ),
        array(
            'name' => "article_author",
            'selector' => "//div[contains(@class,'article-info')]//span[contains(@class,'tag')]//a",
            'required' => true,
        ),
        array(
            'name' => "article_headimg",
            'selector' => "//article[@id='mp-editor']//p//img[1]",
            //'selector' => "//div[contains(@class,'author')]//a[1]",
            'required' => true,
        ),
        array(
            'name' => "article_content",
            'selector' => "//article[@id='mp-editor']",
            //'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]",
            'required' => true,
        ),
        array(
            'name' => "article_publish_time",
            'selector' => "//span[@id='news-time']",
            //'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ),
        /*
        array(
            'name' => "url",
            // Any selector will do here; the on_extract_field callback replaces the value.
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ),
        */
    ),
);

// Build the spider from the configuration above and start crawling.
$spider = new phpspider($configs);
$spider->start();