菜鸟,经过半天的摸索,终于找到抓取苹果商店应用评论的方法了
1.抓取评论总量及评分信息的URL = https://itunes.apple.com/cn/customer-reviews/id442673238?dataOnly=true&displayable-kind=11&appVersion=current;
2.抓取评论的URL = https://itunes.apple.com/WebObjects/MZStore.woa/wa/userReviewsRow?id=442673238&displayable-kind=11&startIndex=0&endIndex=100&sort=1&appVersion=current;
注意:以上URL中标记为红色的部分是可变部分,其中:id是应用在苹果商店的id;appVersion表示抓取的是当前版本的评论还是所有版本的评论,有两个取值(current、all);
startIndex、endIndex用于分页抓取,分别表示本次抓取的起始位置和结束位置。
我使用的方法是用php模拟抓取,记得http头信息一定加上。
不喜欢废话,我的php代码大概是这样的,只是提供部分代码,大概逻辑就是这样的
// Fetch the total number of reviews (and rating summary) for the app.
$remote_url = "https://itunes.apple.com/cn/customer-reviews/id442673238?dataOnly=true&displayable-kind=11&appVersion=current";
$util = new Utils();

// Request headers sent along with the call. Every entry has to be a complete
// "Name: value" line, so the iTunes client string is sent as the User-Agent header.
$httpheader = array(
    "User-Agent: iTunes/11.1.3 (Windows; Microsoft Windows 7 x64 Ultimate Edition (Build 7600)) AppleWebKit/536.30.1",
    "Accept-Language:zh-CN,zh;q=0.8",
    "X-Apple-Store-Front:143465-19,17",
    "X-Apple-Tz:28800",
    "Host:itunes.apple.com",
    "Accept-Encoding: gzip;q=1.0, identity; q=0.5, *;q=0"
);

$ret = $util->curlPost($remote_url, array(), 100, false, $httpheader);
if (!empty($ret['error'])) {
    return false;
}

// The returned content may be prefixed by non-JSON text (e.g. response headers),
// so the JSON document is taken to start at the first '{'.
$totalContent = $ret['content'];
$start = is_string($totalContent) ? strpos($totalContent, '{') : false;
if ($start === false) {
    // No JSON object in the response at all.
    return false;
}
$totalContent = substr($totalContent, $start);

$total_arr = json_decode($totalContent, true);
if (!is_array($total_arr) || !isset($total_arr['totalNumberOfReviews'])) {
    // Undecodable payload or missing counter: treat as failure.
    return false;
}

// Exactly zero reviews is a valid result: nothing to fetch, but not an error.
if ($total_arr['totalNumberOfReviews'] === 0) {
    return true;
}
// Any other falsy value (empty string, "0", false, ...) is treated as a failure.
if (!$total_arr['totalNumberOfReviews']) {
    return false;
}

$totalComment = $total_arr['totalNumberOfReviews'];
其中curlPost 方法为
/**
 * Send an HTTP POST request with cURL and return the raw response together
 * with the transfer details.
 *
 * CURLOPT_HEADER is enabled, so the returned content contains the response
 * headers followed by the body; callers have to strip the headers themselves.
 *
 * @param string     $url        Target URL. URLs starting with "https://" get the SSL options below.
 * @param array      $data       POST fields handed to CURLOPT_POSTFIELDS.
 * @param int        $timeout    Overall timeout in seconds. The connect timeout is two seconds
 *                               shorter, but never negative.
 * @param bool       $CA         true: verify the peer certificate against cacert.pem in the current
 *                               working directory; false: do not verify the peer certificate.
 * @param array|null $httpheader Extra request header lines, each in "Name: value" form.
 *
 * @return array Array with the keys 'content' (curl_exec() result, false on failure),
 *               'info' (curl_getinfo() result) and 'error' (curl_error() text, '' on success).
 */
function curlPost($url, $data = array(), $timeout = 30, $CA = false, $httpheader = null)
{
    $result = array('content' => false, 'info' => array(), 'error' => '');

    $ch = curl_init();
    if ($ch === false) {
        // Without a handle none of the calls below can work; report it through the usual 'error' key.
        $result['error'] = 'curl_init() failed';
        return $result;
    }

    $cacert = getcwd() . '/cacert.pem'; // CA root certificate bundle
    $SSL = stripos($url, 'https://') === 0; // URL schemes are case-insensitive

    // Keep the connect timeout below the overall timeout, but never let it go negative.
    $connectTimeout = $timeout > 2 ? $timeout - 2 : max(0, $timeout);

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout);

    if ($SSL) {
        if ($CA) {
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true); // only trust certificates issued by a known CA
            curl_setopt($ch, CURLOPT_CAINFO, $cacert);      // CA bundle used to verify the site certificate
        } else {
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // accept any certificate
        }
        // PHP's documentation says the value 1 should not be used for this option; 2 checks
        // that the certificate name matches the requested host.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    }

    // An empty "Expect:" header is always sent to avoid problems with long POST bodies.
    // Caller headers are appended to it; setting CURLOPT_HTTPHEADER a second time would
    // replace the list and drop "Expect:" again.
    $headers = array('Expect:');
    if (!empty($httpheader)) {
        $headers = array_merge($headers, (array) $httpheader);
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    curl_setopt($ch, CURLOPT_HEADER, 1); // include the response headers in the returned content

    $result['content'] = curl_exec($ch);
    // curl_getinfo() reports transfer details such as url, content_type, http_code,
    // header_size, request_size, redirect_count, the timing values (total_time,
    // namelookup_time, connect_time, ...) and upload/download sizes and speeds.
    $result['info'] = curl_getinfo($ch);
    $result['error'] = curl_error($ch);
    curl_close($ch);

    return $result;
}