最近项目中需要用到京东和苏宁的物流信息,但市场上的第三方接口都是收费的,而且有些还不太稳定,于是一鼓作气写了个“爬虫”,这里做个简要说明。
语言:PHP,请先安装 curl 扩展
一、苏宁物流
RequestUrl:https://wuliu.suning.com/home/toLogisticsDetail.htm
RequestType:post
RequestData:['expressNos'=>$this->expressNo]
// Suning: POST the tracking number to the logistics-detail endpoint and keep
// whatever Curl_post() returns in $content.
$url = 'https://wuliu.suning.com/home/toLogisticsDetail.htm';
$Reqdata = ['expressNos' => $this->expressNo];
$content = $this->Curl_post($url, $Reqdata);
/**
 * Send a POST request with cURL and return the JSON-decoded response body.
 *
 * @param string       $url       Target URL.
 * @param array|string $post_data Value handed to CURLOPT_POSTFIELDS.
 * @return mixed Decoded body (objects become associative arrays), or null
 *               when the transfer fails or the body is not valid JSON.
 */
private function Curl_post($url,$post_data){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_URL,$url);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
    // Without this option curl_exec() echoes the body and returns true,
    // so there would be nothing to decode.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    // Release the handle whether or not the transfer succeeded.
    curl_close($ch);
    if ($result === false) {
        return null;
    }
    // NOTE(review): search and replacement are the same character, so this
    // call changes nothing as written; presumably the search string was an
    // HTML entity such as &quot; before the article was rendered - confirm.
    $data = str_replace("\"",'"',$result );
    return json_decode($data,true);
}
拿到请求的内容,做如下处理
// Cut the logistics fragment out of the response: keep what follows the first
// delimiter, then what precedes the second one.
// NOTE(review): both explode() delimiters are empty strings as written, which
// explode() rejects; presumably the HTML markers were stripped when this
// article was rendered - restore them before use.
// NOTE(review): Curl_post() returns json_decode() output, whereas explode()
// needs a string - confirm what $content really holds here.
$content = explode(''
,$content)[1];
$content2 = explode('',$content)[0];
// Strip line breaks/tabs and collapse whitespace via compress_html().
$content2 = $this->compress_html($content2);
/**
 * Collapse an HTML fragment onto one line and strip incidental whitespace.
 *
 * Removes CR/LF/tab characters, trims spaces around space-free text that sits
 * directly between two tags, squeezes whitespace runs to one space, drops
 * HTML comments and C-style comments, and removes a space next to a double quote.
 *
 * @param string $string HTML fragment.
 * @return string|null Compressed fragment, or null if preg_replace() fails.
 */
private function compress_html($string) {
    // "\r\n" is listed first, although removing "\r" and "\n" covers it too.
    $string = str_replace(array("\r\n", "\r", "\n", "\t"), '', $string);
    $pattern = array(
        // Must end in "<": the replacement ">\1<" re-emits both brackets.
        // The original had no closing delimiter, so preg_replace() returned null.
        "/> *([^ ]*) *</",
        "/[\s]+/",
        // NOTE(review): the original pattern here was empty ("//"); presumably
        // the HTML-comment matcher was stripped when the article was rendered.
        "/<!--[^!]*-->/",
        "/\" /",
        "/ \"/",
        // Uses ' as the regex delimiter so the slashes of /* ... */ stay literal.
        "'/\*[^*]*\*/'",
    );
    $replace = array(">\\1<"," ","","\"","\"","");
    return preg_replace($pattern, $replace, $string);
}
// Collect capture group 1 of every match in $content2, then pair consecutive
// captures into rows of two and rename the slots to 'time' and 'text'.
// NOTE(review): the pattern looks truncated (it starts with "]*>" and has no
// closing tag); presumably HTML tag names were stripped when the article was
// rendered - restore before relying on it.
$pattern = '|]*>(.*)|isU';
$s = preg_match_all($pattern, $content2, $matches);
$list = array_chunk($matches[1],2);
foreach ($list as $k=>$v) {
// Copy the numeric slots to named keys, then drop the numeric ones.
// A final chunk holding a single capture has no index 1.
$list[$k]['time'] = $list[$k][0];
$list[$k]['text'] = $list[$k][1];
unset($list[$k][0]);
unset($list[$k][1]);
}
二、京东物流
RequestUrl:https://www.jdwl.com/order/search?waybillCodes=
RequestType:get
// JD: GET the waybill search page for the tracking number.
// The number is percent-encoded so characters that are special in a query
// string cannot alter the request; plain alphanumeric numbers are unchanged.
$url = 'https://www.jdwl.com/order/search?waybillCodes='.rawurlencode($this->expressNo);
$content = $this->Curl_get($url);
/**
 * Fetch a URL with a cURL GET request.
 *
 * CURLOPT_HEADER is enabled, so the returned string holds the response
 * headers followed by the body.
 *
 * @param string $url Address to request.
 * @return string|bool Raw response, or false when curl_exec() fails.
 */
private function Curl_get($url){
    $handle = curl_init();

    // Hand the response back as a string, headers included.
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($handle, CURLOPT_HEADER, 1);
    curl_setopt($handle, CURLOPT_URL, $url);

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
拿到请求的内容,做如下处理
// Cut the tracking fragment out of the response: keep what follows the first
// delimiter, then what precedes the second one.
// NOTE(review): both explode() delimiters are empty strings as written, which
// explode() rejects; presumably the HTML markers were stripped when this
// article was rendered - restore them before use.
$content = explode(''
,$content)[1];
$content2 = explode('',$content)[0];
// Replace each match with its first capture group.
// NOTE(review): this pattern also looks truncated (starts with "]*>") - confirm.
$content2 = preg_replace("#]*>(.*)#isU", "$1", $content2);
$content2 = $this->compress_html($content2);//this function is defined above
// Parse the JD tracking markup in $content2 into $list, a flat list of
// ['time' => ..., 'text' => ...] rows.
// NOTE(review): the three regex literals below are kept exactly as found.
// They look truncated (each starts with "]*>"); presumably HTML tag names
// were stripped when the article was rendered - restore before relying on them.
$_li = '|]*>(.*) |isU';
$s = preg_match_all($_li, $content2, $li_);
$list = [];// collected tracking rows
// Iterate by value: the items are only read, and a by-reference loop variable
// would stay bound to the last element after the loop.
foreach ($li_[1] as $v) {
    // Step 1: capture the date and every time-of-day within this item.
    $_time = '|]*>(.*)|isU';
    preg_match_all($_time, $v, $time_);
    // The first capture is the date; guard against an item with no matches.
    $date = isset($time_[1][0]) ? $time_[1][0] : null;
    unset($time_[1][0]);
    $times = $time_[1];
    // Prefix each time-of-day with the date.
    foreach ($times as $idx => $clock) {
        $times[$idx] = $date.' '.$clock;
    }
    // Step 2: capture every description text within this item.
    $_pcon = '|]*>(.*)
|isU';
    preg_match_all($_pcon, $v, $pcon_);
    $pcons = $pcon_[1];
    // Step 3: zip times and texts position by position into rows.
    $r = $this->array_merge_more(['time','text'], [$times, $pcons]);
    foreach ($r as $v1) {
        array_push($list,$v1);
    }
}
//主要是相同长度,相同顺序的2组数组按照key值拼接组合
/**
 * Zip several parallel arrays into a list of keyed rows.
 *
 * Row i holds, for each position p in $keys, the i-th value of $arrs[p]
 * under the name $keys[p]; positions whose array is too short (or whose
 * value is null) are left out of that row.
 *
 * @param array $keys Names to use, one per array in $arrs.
 * @param array $arrs Arrays to combine, indexed 0..n-1.
 * @return array List of rows; empty when the arguments are empty, not arrays,
 *               or differ in count.
 */
private function array_merge_more($keys,$arrs){
    // Validate the arguments up front.
    $usable = $keys && is_array($keys) && $arrs && is_array($arrs) && count($keys)==count($arrs);
    if(!$usable){
        return array();
    }

    // Reindex every inner array and remember the longest one.
    $longest = 0;
    $total = count($arrs);
    $pos = 0;
    while($pos<$total){
        $arrs[$pos] = array_values($arrs[$pos]);
        $longest = max($longest, count($arrs[$pos]));
        $pos++;
    }

    // Build one row per index up to the longest inner array.
    $rows = array();
    for($row=0; $row<$longest; $row++){
        $item = array();
        foreach($keys as $slot=>$name){
            if(!isset($arrs[$slot][$row])){
                continue;
            }
            $item[$name] = $arrs[$slot][$row];
        }
        $rows[] = $item;
    }
    return $rows;
}
贴两张最终的图
京东:vc50979551335
苏宁:SN2I00045595658
总结:这两个接口的内容都来自官网,关键在于准确地截取字符串,以及对数组的处理。
希望这篇文章对你有帮助,如果有疑问,请私信小编
觉得还不错,赞赏一下