抓取苹果应用评论

菜鸟,经过半天的摸索,终于找到抓取苹果商店应用评论的方法了

1.抓取以下信息的URL = https://itunes.apple.com/cn/customer-reviews/id442673238?dataOnly=true&displayable-kind=11&appVersion=current


抓取苹果应用评论_第1张图片
 

2.抓取评论的URL = https://itunes.apple.com/WebObjects/MZStore.woa/wa/userReviewsRow?id=442673238&displayable-kind=11&startIndex=0&endIndex=100&sort=1&appVersion=current


抓取苹果应用评论_第2张图片

 

注意:以上标记为红色的部分是可变部分,其中:id是应用在苹果商店的id;appVersion指定抓取当前版本的评论还是所有版本的评论,有两个取值(current、all)。

startIndex、endIndex用于分页抓取,分别表示本次抓取评论的起始和结束序号。

 

我使用的方法是用php模拟抓取,记得http头信息一定加上。

不喜欢废话,我的php代码大概是这样的,只是提供部分代码,大概逻辑就是这样的

	// Fetch the total number of reviews and the rating summary for the app.
		$remote_url = "https://itunes.apple.com/cn/customer-reviews/id442673238?dataOnly=true&displayable-kind=11&appVersion=current";
		$util = new Utils();
		// Request headers imitating the iTunes desktop client. Every entry has to
		// be a complete "Name: value" line, so the client string is sent under an
		// explicit User-Agent name.
		// NOTE(review): gzip is advertised in Accept-Encoding; confirm the body is
		// delivered uncompressed (or decoded) before the '{' search below.
		$httpheader = array("User-Agent: iTunes/11.1.3 (Windows; Microsoft Windows 7 x64 Ultimate Edition (Build 7600)) AppleWebKit/536.30.1",
				"Accept-Language:zh-CN,zh;q=0.8",
				"X-Apple-Store-Front:143465-19,17",
				"X-Apple-Tz:28800",
				"Host:itunes.apple.com",
				"Accept-Encoding: gzip;q=1.0, identity; q=0.5, *;q=0"
		);
		$ret = $util->curlPost($remote_url,array(),100,false,$httpheader);
	
		// Transport-level failure reported by cURL.
		if(!empty($ret['error'])){
			return false;
		}
		
		$totalContent = $ret['content'];
		// curlPost returns the response headers together with the body, so the
		// JSON payload is taken from the first '{' onwards.
		$start = strpos($totalContent, '{');
		if($start === false){
			// No JSON object in the response at all.
			return false;
		}
		$totalContent = substr($totalContent, $start);
		
		$total_arr = json_decode($totalContent,true);
		// Undecodable JSON or a missing counter is treated as a failure instead
		// of indexing into a non-array value.
		if(!is_array($total_arr) || !isset($total_arr['totalNumberOfReviews'])){
			return false;
		}
		
		// Exactly zero reviews: nothing to fetch, but not an error.
		if($total_arr['totalNumberOfReviews'] === 0){
			return true;
		}
		
		// Any other falsy value is treated as a failure.
		if(!$total_arr['totalNumberOfReviews']){
			return false;
		}
		
		$totalComment = $total_arr['totalNumberOfReviews'];

 

 其中curlPost 方法为

 /**
  * Send an HTTP(S) POST request with cURL and return the raw response
  * together with transfer metadata.
  *
  * CURLOPT_HEADER is enabled, so 'content' holds the response headers
  * followed by the response body.
  *
  * @param string       $url        Target URL; TLS options are applied when it starts with "https://".
  * @param array|string $data       POST fields handed to CURLOPT_POSTFIELDS.
  * @param int          $timeout    Overall timeout in seconds; the connect timeout is two seconds
  *                                 shorter, but never below one second.
  * @param bool         $CA         When true, the peer certificate is verified against
  *                                 cacert.pem in the current working directory.
  * @param array|null   $httpheader Additional request header lines ("Name: value").
  * @return array 'content' => result of curl_exec() (false on failure),
  *               'info'    => result of curl_getinfo(),
  *               'error'   => result of curl_error() (empty string when no error).
  */
 function curlPost($url, $data = array(), $timeout = 30, $CA = false, $httpheader = null){
    
    	$cacert = getcwd() . '/cacert.pem'; // CA root certificate bundle
    	$SSL = substr($url, 0, 8) == "https://" ? true : false;
    	 
    	$ch = curl_init();
    	if ($ch === false) {
    		// Keep the usual result shape so callers can still inspect 'error'.
    		return array('content' => false, 'info' => array(), 'error' => 'curl_init() failed');
    	}
    	curl_setopt($ch, CURLOPT_URL, $url);
    	curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    	// A connect timeout of 0 means "use the default" and negative values are
    	// invalid, so small $timeout values are clamped to one second.
    	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, max(1, $timeout - 2));
    	if ($SSL && $CA) {
    		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);   // only trust certificates issued by a CA in the bundle
    		curl_setopt($ch, CURLOPT_CAINFO, $cacert); // CA bundle used to validate the server certificate
    		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2); // certificate must name the requested host
    	} else if ($SSL && !$CA) {
    		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // accept any certificate chain
    		// The value 1 is no longer accepted by current libcurl/PHP (2 is used
    		// instead, with a notice), so 2 is set explicitly.
    		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    	}
    	curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    	curl_setopt($ch, CURLOPT_POST, true);
    	curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    	curl_setopt($ch, CURLOPT_HEADER, 1); // include response headers in the returned content
    	 
    	// CURLOPT_HTTPHEADER replaces the whole header list on every call, so the
    	// empty "Expect:" header (avoids problems with long POST data) is merged
    	// with the caller's headers instead of being set separately.
    	$headers = array('Expect:');
    	if(!empty($httpheader)){
    		$headers = array_merge($headers, (array) $httpheader);
    	}
    	curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    
    	$result = array();
    	$result['content'] = curl_exec($ch);
    	// curl_getinfo() returns, among others: url, content_type, http_code,
    	// header_size, request_size, filetime, ssl_verify_result, redirect_count,
    	// total_time, namelookup_time, connect_time, pretransfer_time, size_upload,
    	// size_download, speed_download, speed_upload, download_content_length,
    	// upload_content_length, starttransfer_time and redirect_time.
    	$result['info'] = curl_getinfo($ch);
    	$result['error'] = curl_error($ch);
    
    	curl_close($ch);
    	return $result;
    }

 

 

 

 


 

你可能感兴趣的:(php)