先把要爬取的网页 URL 放入一个数组。本文以爬取豆瓣租房小组的信息为例;建议先在豆瓣搜索“租房”、了解页面布局之后,再对照下面的代码阅读。具体代码如下:
// Build the list of discussion-list page URLs to crawl.
//
// Number of list pages to fetch, counted from the first page.
// Example value - replace with the page count you need.
$page_count = 3;
// Slug of the Douban group (the city rental group) as it appears in the
// group URL. Example value - replace with the target group's slug.
$group_name = 'beijingzufang';
// Pages are addressed through a "start" offset that advances by 25 per page.
$page_size = 25;

$urls = array();
for ($i = 0; $i < $page_count; $i++) {
    $urls[$i] = "https://www.douban.com/group/" . $group_name . "/discussion?start=" . ($i * $page_size);
}
// Create one curl easy handle per URL and register all of them on a single
// multi handle so the pages can be downloaded concurrently.
$mh = curl_multi_init();

// Single source of truth for the User-Agent. The original code set one UA via
// CURLOPT_USERAGENT and a different one via a raw "User-Agent:" header; the raw
// header is the one libcurl actually sends, so that value is kept here.
$user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";

$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    if ($conn[$i] === false) {
        // Later code indexes $conn by every key of $urls, so fail loudly
        // rather than leave a hole in the array.
        throw new \RuntimeException("curl_init() failed for URL: " . $url);
    }
    curl_setopt_array($conn[$i], array(
        CURLOPT_RETURNTRANSFER => true,   // keep the body for curl_multi_getcontent()
        CURLOPT_ENCODING => "",           // accept every encoding curl supports
        CURLOPT_MAXREDIRS => 10,
        CURLOPT_TIMEOUT => 60,
        // TLS verification stays ON: with it disabled, a man-in-the-middle
        // could serve arbitrary content for an https:// URL.
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        // CURLOPT_SSLVERSION is left at its default; the former value 'all'
        // is not a valid CURL_SSLVERSION_* constant.
        CURLOPT_CUSTOMREQUEST => "GET",
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_HEADER => false,          // response headers are not wanted in the body
        CURLOPT_USERAGENT => $user_agent,
        CURLOPT_REFERER => 'https://www.douban.com/',
        // The stray "Postman-Token" header (a Postman export artefact) and the
        // duplicate "User-Agent" header were removed.
        CURLOPT_HTTPHEADER => array("Cache-Control: no-cache"),
    ));
    curl_multi_add_handle($mh, $conn[$i]);
}
// Drive all registered transfers to completion.
// curl_multi_exec() returns immediately, so calling it in a bare loop spins
// the CPU at 100%. Between iterations we block in curl_multi_select() until
// at least one connection has activity (or the timeout elapses).
$active = null;
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && $status === CURLM_OK) {
        if (curl_multi_select($mh, 1.0) === -1) {
            // select() could not wait on any descriptor; back off briefly so
            // the loop still does not busy-spin.
            usleep(100000);
        }
    }
} while ($active && $status === CURLM_OK);
if (!function_exists('douban_parse_topic_rows')) {
    /**
     * Extracts topic rows from the HTML of one discussion-list page.
     *
     * Every <tr> is split into its <td> cells, which are read positionally:
     * cell 0 holds the topic link (its title attribute is the full title and
     * its href carries the topic code), cell 1 the author, cell 2 the reply
     * count and cell 3 the last-reply time. Rows that do not fit this shape
     * (for example a header row without a titled link) are skipped.
     *
     * @param string|null $html      Page HTML; null or empty yields no rows.
     * @param mixed       $city_name Value stored as the last element of each row.
     * @param string      $year      Year prepended to a last-reply time that has none.
     * @return array List of rows: array(title, author, replies,
     *               last-reply Unix timestamp (false if unparsable), topic code, city).
     */
    function douban_parse_topic_rows($html, $city_name, $year)
    {
        $rows = array();
        if (!is_string($html) || $html === '') {
            return $rows;
        }
        // Split the page into table rows.
        if (!preg_match_all('/<tr\b[^>]*>.*?<\/tr>/is', $html, $tr_matches)) {
            return $rows;
        }
        foreach ($tr_matches[0] as $tr_html) {
            // Split the row into its cells; four are needed.
            $td_count = preg_match_all('/<td\b[^>]*>.*?<\/td>/is', $tr_html, $td_matches);
            if ($td_count === false || $td_count < 4) {
                continue;
            }
            $cells = $td_matches[0];

            // The link's title attribute carries the complete title.
            if (!preg_match('/<a\s[^>]*?title="(.*?)"/is', $cells[0], $title_match)) {
                continue;
            }
            // The link's href carries the topic code as a path segment.
            if (!preg_match('/<a\s[^>]*?href="(.*?)"/is', $cells[0], $href_match)) {
                continue;
            }
            // For "https://host/group/topic/<code>/" the code is segment 5.
            $href_parts = explode('/', $href_match[1]);
            if (!isset($href_parts[5])) {
                continue;
            }

            // NOTE(review): the time cell presumably reads "MM-DD HH:MM" for
            // the current year - confirm against the live page. A value that
            // already starts with a four-digit year is used as-is.
            $last_reply = trim(strip_tags($cells[3]));
            if (!preg_match('/^\d{4}-/', $last_reply)) {
                $last_reply = $year . '-' . $last_reply;
            }

            $rows[] = array(
                $title_match[1],               // title
                trim(strip_tags($cells[1])),   // author
                trim(strip_tags($cells[2])),   // reply count
                strtotime($last_reply),        // last-reply time
                $href_parts[5],                // topic code
                $city_name,                    // city the group belongs to
            );
        }
        return $rows;
    }
}

// Column names matching, in order, the positional values of each $data row.
$fields = array('title', 'author', 'huiying', 'end_time', 'code', 'type');
$data = array();
// Use the current year instead of a hard-coded one.
$current_year = date('Y');
// $city_name is not defined in this part of the script; fall back to null
// (what an undefined variable would evaluate to) without raising a warning.
$row_city = isset($city_name) ? $city_name : null;
foreach ($urls as $i => $url) {
    $response = curl_multi_getcontent($conn[$i]); // downloaded HTML as a string
    foreach (douban_parse_topic_rows($response, $row_city, $current_year) as $row) {
        $data[] = $row;
    }
}
// The parsed rows are now in $data; database writes can be done below.
//......
// Tear-down: detach every easy handle from the multi handle and release it,
// then release the multi handle itself.
foreach ($urls as $index => $ignored_url) {
    $handle = $conn[$index];
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
}
curl_multi_close($mh);