PHP + Redis + XPath 实现队列爬虫 Demo

 本文给出一段使用 PHP + Redis 编写的爬虫代码,主要用于抓取公司信息以及公司联系人的相关信息。

 爬虫以天眼查的搜索页 https://www.tianyancha.com/search 作为起始(源)地址。

/**
 * Fetch one result page, extract the company records embedded in it, and
 * publish the next page's address through Redis.
 *
 * Side effects:
 *  - dumps the requested URL with var_dump() as progress output;
 *  - stores the next-page href under the Redis key 'url' when one is found;
 *  - deletes the Redis key 'url' when the page cannot be fetched or has no
 *    next-page link, so a caller polling that key sees the crawl has ended.
 *
 * @param string $url   Address of the page to download.
 * @param Redis  $redis Connected Redis client used to hand over the next URL.
 *
 * @return array List of associative arrays with the keys 'name', 'personName',
 *               'phone', 'address' and 'category'. A field is null when the
 *               decoded record does not provide it.
 */
function pachong($url,Redis $redis){
    $contents = [];

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.113 Safari/537.36");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_URL, $url);
    $html = curl_exec($ch);
    curl_close($ch);
    var_dump($url);

    // curl_exec() yields false on failure; an empty body has nothing to parse
    // either. Clear the hand-over key so the caller does not re-request a
    // stale address forever.
    if (!is_string($html) || $html === '') {
        $redis->del('url');
        return $contents;
    }

    // Parse the HTML with XPath. Real-world markup is rarely valid, so the
    // libxml warnings emitted by loadHTML() are deliberately suppressed.
    $dom = new DOMDocument();
    @$dom->loadHTML($html);
    $xPath = new DOMXPath($dom);

    // Company records: spans whose text content is decoded as JSON below.
    $companyNodes = $xPath->query('//span[3][@class=\'tt hidden\']');
    // href of the pagination "next" link.
    $nextLinks = $xPath->query('//ul[@class=\'pagination\']/li/a[@class=\'num -next\']/@href');

    // Keep the last matching href, as the original loop did. Unlike the
    // original, never pass the DOMNodeList itself to Redis when nothing matched.
    // NOTE(review): the original comment says an empty next-page address should
    // switch to the next region; that fallback is not implemented here.
    $nextUrl = null;
    foreach ($nextLinks as $link) {
        $nextUrl = $link->nodeValue;
    }
    if ($nextUrl !== null && $nextUrl !== '') {
        $redis->set('url', $nextUrl);
    } else {
        $redis->del('url');
    }

    // Output key => property name in the decoded JSON record.
    $fields = [
        'name' => 'name',
        'personName' => 'legalPersonName',
        'phone' => 'phoneNum',
        'address' => 'regLocation',
        'category' => 'categoryStr',
    ];

    foreach ($companyNodes as $node) {
        $record = json_decode($node->nodeValue);
        // Skip spans whose content is not a JSON object instead of
        // dereferencing null.
        if (!is_object($record)) {
            continue;
        }
        $row = [];
        foreach ($fields as $key => $property) {
            $row[$key] = isset($record->$property) ? $record->$property : null;
        }
        $contents[] = $row;
    }

    return $contents;
}
/**
 * Crawl the search results page by page and collect every company record.
 *
 * Starts at the fixed search URL and keeps calling pachong() for as long as
 * the Redis key 'url' holds a truthy value after each call.
 *
 * @return array All records returned by pachong(), merged in crawl order.
 *
 * @throws RuntimeException When connect() reports that the Redis connection
 *                          could not be established.
 */
function main(){
    $redis = new Redis();
    // Fail fast instead of issuing commands on an unconnected client.
    if (!$redis->connect('127.0.0.1', 6379)) {
        throw new RuntimeException('Unable to connect to Redis at 127.0.0.1:6379');
    }

    $url = 'https://www.tianyancha.com/search';
    $companys = [];
    do {
        // Accumulate the company records found on this page.
        $companys = array_merge($companys, pachong($url, $redis));
        // Keep going while a follow-up URL is available in Redis.
    } while ($url = $redis->get('url'));

    // Hand the collected records back instead of discarding them.
    return $companys;
}
// Script entry point: start the crawl as soon as the file is executed.
main();

 

你可能感兴趣的:(web,PHP,爬虫,队列)