PHP cURL 使用 curl_multi 并发爬取网页内容(即常说的“多线程”爬取)

先把要爬取的网页 URL 放到一个数组里。本文以爬取豆瓣小组的租房信息为例;建议先在豆瓣搜索“租房”,查看列表页的网页布局(HTML 结构),再对照下面的代码阅读,效果更好。具体代码如下:

// ---- Crawl configuration ----
// Number of listing pages to crawl, starting from the first page.
// Placeholder value: set this to the page count you need.
$page_count = 1;
// Group name inserted into the URL path.
// Placeholder text ("city group name"): replace it with the real group slug.
// NOTE(review): the parsing step further down stores $city_name in every row,
// so the same variable is used here -- confirm that this is the intended value.
$city_name = '城市分组名';
// The "start" query parameter advances by this amount from one page to the next.
$page_size = 25;

// Build the list of listing-page URLs: start=0, 25, 50, ...
$urls = array();
for ($i = 0; $i < $page_count; $i++) {
    $get_start = $i * $page_size;
    $urls[$i] = "https://www.douban.com/group/" . $city_name . "/discussion?start=" . $get_start;
}

// Multi handle that owns all the individual transfers so they can run concurrently.
$mh = curl_multi_init();

// The User-Agent is configured in exactly one place (CURLOPT_USERAGENT).
// A "User-Agent:" line inside CURLOPT_HTTPHEADER would override this value,
// so it must not be listed there as well.
$user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";

// One easy handle per URL, stored under the same key as the URL in $urls.
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt_array($conn[$i], array(
        CURLOPT_RETURNTRANSFER => true,   // keep the body for curl_multi_getcontent()
        CURLOPT_ENCODING => "",           // empty string: accept every encoding cURL supports
        CURLOPT_MAXREDIRS => 10,
        CURLOPT_TIMEOUT => 60,
        // NOTE(review): certificate and host verification are switched off, which
        // leaves the HTTPS transfer open to man-in-the-middle tampering. Kept as in
        // the original; enable both once a CA bundle is available in the environment.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        // The option expects one of the CURL_SSLVERSION_* integer constants;
        // DEFAULT lets libcurl negotiate the protocol version.
        CURLOPT_SSLVERSION => CURL_SSLVERSION_DEFAULT,
        CURLOPT_CUSTOMREQUEST => "GET",
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_HEADER => 0,              // response headers are not included in the body

        CURLOPT_USERAGENT => $user_agent,
        CURLOPT_REFERER => 'https://www.douban.com/',

        CURLOPT_HTTPHEADER => array("Cache-Control: no-cache"),
    ));

    curl_multi_add_handle($mh, $conn[$i]);
}

// Drive all transfers attached to $mh until none of them is still running.
// $active receives the number of transfers still in progress; $status holds the
// multi-interface result code so the loop can stop when cURL reports an error.
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && $status === CURLM_OK) {
        // Sleep until there is activity on any connection (at most 1 second)
        // instead of calling curl_multi_exec() in a tight, CPU-burning loop.
        if (curl_multi_select($mh, 1.0) === -1) {
            // -1 signals that select failed; pause briefly so the loop does not spin.
            usleep(1000);
        }
    }
} while ($active && $status === CURLM_OK);

// Column names for the values collected in each $data row.
// NOTE(review): presumably listed in the same order as the row values built below --
// confirm before using them for database inserts.
$fields = array('title', 'author', 'huiying', 'end_time', 'code', 'type');
$data = array();

// The patterns do not change between pages, so they are defined once.
// NOTE(review): they assume the listing is an HTML table whose first cell holds
// an <a title="..." href="..."> link -- verify against the live page markup.
$regex_tr   = '/<tr\b[^>]*>.*?<\/tr>/is';            // one table row
$regex_td   = '/<td\b[^>]*>.*?<\/td>/is';            // one cell inside a row
$preg_title = '/<a\b[^>]*?\btitle="([^"]*)"/is';     // full title from the link's title attribute
$preg_href  = '/<a\b[^>]*?\bhref="([^"]*)"/is';      // target URL of the title link

foreach ($urls as $i => $url) {
    // Raw HTML fetched through the handle that belongs to this URL.
    $response = curl_multi_getcontent($conn[$i]);
    if (!is_string($response) || $response === '') {
        // Failed or empty transfer: nothing to parse for this page.
        continue;
    }

    if (!preg_match_all($regex_tr, $response, $matchs_tr, PREG_SET_ORDER)) {
        continue;
    }

    foreach ($matchs_tr as $row) {
        // Split the row into its cells; four cells are read below.
        $td_count = preg_match_all($regex_td, $row[0], $matchs_td, PREG_SET_ORDER);
        if ($td_count === false || $td_count < 4) {
            continue;
        }

        // Rows whose first cell has no titled link (e.g. a header row) are skipped.
        $first_cell = $matchs_td[0][0];
        if (!preg_match($preg_title, $first_cell, $matchs_title)
            || !preg_match($preg_href, $first_cell, $matchs_href)) {
            continue;
        }

        // Split the link on "/" and take segment 5 as the row's code.
        // For a URL of the form scheme://host/a/b/<code>/ that is the third path segment.
        $href_arr = explode('/', $matchs_href[1]);
        if (!isset($href_arr[5])) {
            continue;
        }

        // The time cell is prefixed with a year before parsing. The current year is
        // used instead of a fixed one, and a value that already starts with a
        // four-digit year is parsed unchanged.
        $time_text = trim(strip_tags($matchs_td[3][0]));
        if (!preg_match('/^\d{4}-/', $time_text)) {
            $time_text = date('Y') . '-' . $time_text;
        }

        // Row to be stored later
        $data[] = array(
            $matchs_title[1],                          // title
            trim(strip_tags($matchs_td[1][0])),        // author
            trim(strip_tags($matchs_td[2][0])),        // reply count
            strtotime($time_text),                     // time of last reply (false if unparsable)
            $href_arr[5],                              // code
            isset($city_name) ? $city_name : '',       // city the row belongs to
        );
    }  // rows of one page
} // pages

//获取的内容已在待添加数组$data中,在此下面可做数据库操作
//......

// Release every transfer: detach each easy handle from the multi handle, then close it.
// Handles are looked up by the same keys under which the URLs are stored.
foreach (array_keys($urls) as $index) {
    $handle = $conn[$index];
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
}

// With all easy handles detached, the multi handle itself can be closed.
curl_multi_close($mh);

你可能感兴趣的:(php,cURL,demo,多线程)