菜鸟,经过半天的摸索,终于找到抓取苹果商店应用评论的方法了
1.抓取以下信息的URL = https://itunes.apple.com/cn/customer-reviews/id442673238?dataOnly=true&displayable-kind=11&appVersion=current;
2.抓取评论的URL = https://itunes.apple.com/WebObjects/MZStore.woa/wa/userReviewsRow?id=442673238&displayable-kind=11&startIndex=0&endIndex=100&sort=1&appVersion=current;
注意:以上标记为红色的部分是可变部分,其中:id是应用在苹果商店中的id,appVersion指定抓取当前版本还是所有版本的评论,有两个取值(current、all),
startIndex、endIndex用于分页抓取
我使用的方法是用php模拟抓取,记得http头信息一定加上。
不喜欢废话,我的php代码大概是这样的,只是提供部分代码,大概逻辑就是这样的
// Fetch the review summary (total number of reviews and rating) for one app.
$remote_url = "https://itunes.apple.com/cn/customer-reviews/id442673238?dataOnly=true&displayable-kind=11&appVersion=current";
$util = new Utils();
// Request headers imitating the iTunes desktop client.
// Every entry has to be a complete "Name: value" line; the client string
// therefore needs its "User-Agent:" name, otherwise it is not a valid header.
// NOTE(review): gzip is advertised in Accept-Encoding, but curlPost does not
// enable response decoding -- confirm the endpoint answers uncompressed.
$httpheader = array(
    "User-Agent: iTunes/11.1.3 (Windows; Microsoft Windows 7 x64 Ultimate Edition (Build 7600)) AppleWebKit/536.30.1",
    "Accept-Language:zh-CN,zh;q=0.8",
    "X-Apple-Store-Front:143465-19,17",
    "X-Apple-Tz:28800",
    "Host:itunes.apple.com",
    "Accept-Encoding: gzip;q=1.0, identity; q=0.5, *;q=0"
);
$ret = $util->curlPost($remote_url, array(), 100, false, $httpheader);
if (!empty($ret['error'])) {
    // Transport-level failure reported by cURL.
    return false;
}
$totalContent = $ret['content'];
// curlPost returns the response headers followed by the body, so the JSON
// payload is taken from the first '{' onwards.
$start = is_string($totalContent) ? strpos($totalContent, '{') : false;
if ($start === false) {
    // No JSON object in the response at all.
    return false;
}
$totalContent = substr($totalContent, $start);
$total_arr = json_decode($totalContent, true);
if (!is_array($total_arr) || !isset($total_arr['totalNumberOfReviews'])) {
    // Malformed JSON or an unexpected payload without the review counter.
    return false;
}
if ($total_arr['totalNumberOfReviews'] === 0) {
    // The app simply has no reviews yet: nothing to fetch, but not an error.
    return true;
}
if (!$total_arr['totalNumberOfReviews']) {
    return false;
}
$totalComment = $total_arr['totalNumberOfReviews'];
其中curlPost 方法为
/**
 * Send an HTTP(S) POST request with cURL and return the raw result.
 *
 * The response headers are included in front of the body in 'content'
 * (CURLOPT_HEADER is enabled), so callers have to strip them themselves.
 *
 * @param string       $url        Target URL; TLS options are applied when it starts with "https://".
 * @param array|string $data       Value handed to CURLOPT_POSTFIELDS.
 * @param int          $timeout    Total timeout in seconds; the connect timeout is 2 seconds
 *                                 shorter, but never less than 1 second.
 * @param bool         $CA         true: verify the peer against cacert.pem in the current working
 *                                 directory; false: skip peer certificate verification.
 * @param array|null   $httpheader Extra request header lines ("Name: value"); they are sent in
 *                                 addition to the "Expect:" suppression.
 * @return array 'content' => curl_exec() result (false on failure),
 *               'info'    => curl_getinfo() array,
 *               'error'   => curl_error() message ('' when no error occurred).
 */
function curlPost($url, $data = array(), $timeout = 30, $CA = false, $httpheader = null){
    $cacert = getcwd() . '/cacert.pem'; // CA root certificate bundle
    $SSL = substr($url, 0, 8) == "https://";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    // Keep the connect timeout positive even for very small $timeout values.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, max(1, $timeout - 2));

    if ($SSL && $CA) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true); // only trust certificates issued by a known CA
        curl_setopt($ch, CURLOPT_CAINFO, $cacert);      // CA bundle used to verify the server certificate
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);    // certificate must match the requested host name
    } else if ($SSL && !$CA) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // accept any certificate chain
        // The value 1 is no longer accepted by current PHP/libcurl (PHP raises a
        // notice and uses 2 instead), so request 2 explicitly.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    curl_setopt($ch, CURLOPT_HEADER, 1); // include response headers in the returned content

    // An empty "Expect:" header avoids problems with long POST bodies.
    // CURLOPT_HTTPHEADER replaces the whole list on every call, so the caller's
    // headers are merged with it instead of being set separately.
    $headers = array('Expect:');
    if (!empty($httpheader)) {
        $headers = array_merge($headers, (array) $httpheader);
    }
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

    $result = array();
    $result['content'] = curl_exec($ch);
    // curl_getinfo() reports transfer metadata such as url, content_type,
    // http_code, header_size, request_size, filetime, ssl_verify_result,
    // redirect_count, the various *_time values, upload/download sizes and speeds.
    $result['info'] = curl_getinfo($ch);
    $result['error'] = curl_error($ch);
    curl_close($ch);

    return $result;
}