PHP 采集 Windows 10 App 的信息

<?php
/**
 * test new page crawler
 */

// Earlier sample page on the old apps.microsoft.com site, kept for reference:
// $url = 'http://apps.microsoft.com/windows/en-us/app/fotor/6f797ba2-500d-4dee-9c5a-13c2d818c958';
$url = trim('https://www.microsoft.com/en-us/store/apps/adobe-photoshop-express/9wzdncrfj27n');

// Scratch list; the pfn section appends the package family name to it.
$d = [];

// get() returns the page body with a "[url:...]" marker appended. HTML entities
// are decoded once up front so every pattern below runs against plain text.
$pageSource = get($url);
$content = html_entity_decode($pageSource, ENT_HTML5, 'UTF-8');

// pfn: value of the page's data-pfn attribute.
// A page without it is treated as unusable: report the URL and stop.
$pfn = '';
if (preg_match('/data-pfn="(.*)">/isU', $content, $pfnHit) !== 1) {
    echo 'pfn error:', $url, "\n";
    exit();
}
$pfn = $pfnHit[1];
$d[] = $pfn;
echo "pfn:", $pfn, "\n";

// Final (post-redirect) URL. get() appends it to the fetched body as a trailing
// "[url:...]" marker, so read it from the END of $content: locate the last
// "[url:" and require the closing "]" to be the final character. An unanchored
// greedy regex would instead start at the first "[url:" occurring anywhere in
// the page text and return page markup as part of the URL.
$urlMarker = '[url:';
$urlMarkerPos = strrpos($content, $urlMarker);
if ($urlMarkerPos !== false && substr($content, -1) === ']') {
    // Everything between the marker prefix and the closing bracket.
    $newurl = substr($content, $urlMarkerPos + strlen($urlMarker), -1);
    echo "url:".$newurl."\n";
} else {
    echo 'get no new url'."\n";
    exit();
}

// Icon: the first src="..." after the "pull-left ph-logo" element, together with
// the background-color declared in the style attribute that follows it.
// Both are required; a miss stops the script.
$iconPattern = '/class="pull-left ph-logo">.*src="(.*)".*style="background-color:(.*);.*"/isU';
if (preg_match($iconPattern, $content, $iconHit) !== 1) {
    echo 'get no icon', "\n";
    exit();
}
list(, $icon, $backgroundcolor) = $iconHit;
echo "icon:", $icon, "\n";
echo "backgroundcolor:", $backgroundcolor, "\n";

// App name: text of the element carrying id="page-title" and itemprop="name".
// The U (ungreedy) modifier stops the capture at the first closing tag and picks
// the nearest itemprop="name"; a greedy match swallows markup whenever several
// tags share the same line. Required field: a miss stops the script.
if (preg_match('/id="page-title".*itemprop="name">(.*)<\//U', $content, $match)) {
    $name = $match[1];
    echo "name:".$name."\n";
} else {
    echo 'get no name'."\n";
    exit();
}

// Alias: the path segment that follows "apps/" in the final URL ($newurl).
// Required field: a miss stops the script.
$aliasFound = preg_match('/apps\/(.*)\//isU', $newurl, $aliasHit) === 1;
if (!$aliasFound) {
    echo 'get no alias', "\n";
    exit();
}
$alias = $aliasHit[1];
echo "alias:", $alias, "\n";

// Average rating: text of the "srv_ratingsScore win-rating-average" element.
// Ungreedy (U) so the capture ends at the first closing tag instead of the last
// one on the line. Required field: a miss stops the script.
if (preg_match('/class="srv_ratingsScore win-rating-average">(.*)<\//U', $content, $match)) {
    $rating = $match[1];
    echo "rating:".$rating."\n";
} else {
    echo 'get no rating'."\n";
    exit();
}

// Number of ratings: text of the "win-rating-total" element with the thousands
// separators and the word "ratings" stripped.
// Ungreedy (U) so the capture ends at the first closing tag instead of the last
// one on the line. Required field: a miss stops the script.
if (preg_match('/class="win-rating-total">(.*)<\//U', $content, $match)) {
    // Plain string replacement is enough here; no regex is needed for a comma.
    $ratingcount = trim(str_replace([',', 'ratings'], '', $match[1]));
    echo "Rating num:".$ratingcount."\n";
} else {
    echo 'get no rating num'."\n";
    exit();
}

// Price: text of the "header-sub" span inside the "price srv_price" element.
// Ungreedy (U) so the capture ends at the first closing tag instead of the last
// one on the line. Required field: a miss stops the script.
if (preg_match('/class="price srv_price"><span class="header-sub">(.*)<\//U', $content, $match)) {
    $price = $match[1];
    // Label was previously misspelled as "prcie:".
    echo "price:".$price."\n";
} else {
    echo 'get no price'."\n";
    exit();
}

// Category: prefer the ms.prod_sbcat meta tag and fall back to ms.prod_cat.
// Required field: if neither tag is present the script stops.
$categoryPatterns = [
    '/<meta name="ms.prod_sbcat" content="(.*)" \/>/isU',
    '/<meta name="ms.prod_cat" content="(.*)" \/>/isU',
];
$categoryFound = false;
foreach ($categoryPatterns as $categoryPattern) {
    if (preg_match($categoryPattern, $content, $categoryHit)) {
        $category = trim($categoryHit[1]);
        $categoryFound = true;
        break;
    }
}
if (!$categoryFound) {
    echo 'get no category', "\n";
    exit();
}
echo "category:", $category, "\n";

// Content rating: link text that follows "Content Rating:".
// Optional field: when it is absent the value stays at the default 'all'.
$contentRating = 'all';
if (preg_match('/Content Rating: <a .*>(.*)<\//isU', $content, $contentRatingHit) === 1) {
    $contentRating = trim($contentRatingHit[1]);
    echo "content rating:", $contentRating, "\n";
} else {
    echo 'get no content rating', "\n";
}

// Publisher: first non-empty text of the "content" div after the "Publisher" <dt>.
// Required field: a miss stops the script.
$publisherPattern = '/Publisher<\/dt>.*<div class="content.*".*>(.+)<\//isU';
if (preg_match($publisherPattern, $content, $publisherHit) !== 1) {
    echo 'get no publisher', "\n";
    exit();
}
$publisher = trim($publisherHit[1]);
echo "publisher:", $publisher, "\n";

// Supported platforms: text after "Works on: " up to the next tag.
// Optional field: a miss is reported but the script carries on.
$worksonFound = preg_match('/Works on: (.*)</isU', $content, $worksonHit) === 1;
if (!$worksonFound) {
    echo 'get no works platform', "\n";
} else {
    $workson = trim($worksonHit[1]);
    echo 'works on:', $workson, "\n";
}

// Approximate size: first non-empty text of the "content" div after the
// "Approximate size" <dt>. Optional field: a miss is only reported.
$sizePattern = '/Approximate size<\/dt>.*<div class="content.*".*>(.+)<\//isU';
if (preg_match($sizePattern, $content, $sizeHit) !== 1) {
    echo 'get no size', "\n";
} else {
    $size = trim($sizeHit[1]);
    echo "size:", $size, "\n";
}

// Supported processors: first non-empty text of the "content" div after the
// "Supported processors" <dt>. Optional field: a miss is only reported.
$processorsPattern = '/Supported processors<\/dt>.*<div class="content.*".*>(.+)<\//isU';
if (preg_match($processorsPattern, $content, $processorsHit) !== 1) {
    echo 'get no processors', "\n";
} else {
    $processors = trim($processorsHit[1]);
    echo "processors:", $processors, "\n";
}

// Age rating: first non-empty text of the "content" div after the
// "Age rating" <dt>. Optional field: a miss is only reported.
$agePattern = '/Age rating<\/dt>.*<div class="content.*".*>(.+)<\//isU';
if (preg_match($agePattern, $content, $ageHit) !== 1) {
    echo 'get no age', "\n";
} else {
    $age = trim($ageHit[1]);
    echo "age:", $age, "\n";
}

// Supported languages: every <div>...</div> entry inside the <dd> that follows
// the "Supported languages" <dt>, joined with commas.
// The entry pattern is ungreedy (U): a greedy match would merge several entries
// that sit on one line into a single item containing the tags between them.
// Optional field: a missing section, or a section without entries, is reported
// and the script carries on.
if (preg_match('/Supported languages<\/dt>.*<dd .*>(.*)<\/dd>/isU', $content, $languageSection) === 1
    && preg_match_all('/<div>([^<].*)<\/div>/U', $languageSection[1], $languageHits) > 0) {
    $languages = implode(",", $languageHits[1]);
    echo "languages:".$languages."\n";
} else {
    echo 'get no languages'."\n";
}

// Features: every <li class="avoid-break"> item in the list that follows the
// "Features" section title, printed as an array.
// Optional field: a missing section, or a section whose list yields no items,
// is reported (the latter case used to print nothing at all).
if (preg_match('/class="section-title.*">Features.*<ul>(.*)<\/ul>/isU', $content, $featureSection) === 1
    && preg_match_all('/<li class="avoid-break">(.*)<\/li>/isU', $featureSection[1], $featureHits) > 0) {
    $features = $featureHits[1];
    echo 'features:';
    print_r($features);
    echo "\n";
} else {
    echo 'get no features'."\n";
}

// Release notes: first paragraph after the "Version Notes" section title.
// Optional field: a miss is only reported.
$notesPattern = '/class="section-title.*">Version Notes.*<p>(.*)<\/p>/isU';
if (preg_match($notesPattern, $content, $notesHit) !== 1) {
    echo 'no version notes', "\n";
} else {
    $releasenotes = $notesHit[1];
    echo "release Notes:", $releasenotes, "\n";
}

// Screenshots: src of every <img> found after a "media-img ratio-16-9" element.
// Required field: if none are found the script stops.
$shotPattern = '/class="media-img ratio-16-9">.*<img src="(.*)".*\/>/isU';
if (!preg_match_all($shotPattern, $content, $shotHits)) {
    echo 'get no screenshots', "\n";
    exit();
}
$screenshots = $shotHits[1];
echo "screenshots:";
print_r($screenshots);
echo "\n";

// Description: first paragraph inside the "showmore m-t-pdp" container.
// Required field: a miss stops the script.
$descriptionPattern = '/<div class="showmore m-t-pdp">.*<p.*>(.*)<\//isU';
if (preg_match($descriptionPattern, $content, $descriptionHit) !== 1) {
    echo "get no description content\n";
    exit();
}
$description = $descriptionHit[1];
echo 'description:', $description, "\n";

// All fields handled: end the script here.
exit();
 
/**
 * Download a page with cURL and tag the response with the final URL.
 *
 * Redirects are followed. The URL cURL ended up on is appended to the body as
 * "[url:<effective url>]" so the caller can recover it from the returned string.
 * A failed transfer is logged and yields an empty body plus the marker, so the
 * return value is always a string ending in the marker.
 *
 * @param string $url Address to fetch.
 * @return string Response body followed by the "[url:...]" marker.
 */
function get($url) {
    $ch = curl_init($url);
    if ($ch === false) {
        // No handle means no transfer: report it and return the marker for the requested URL.
        error_log('get(): could not initialise cURL for ' . $url);
        return "[url:$url]";
    }

    // Present as a desktop Chrome browser.
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // Bound the request so an unreachable or stalled server cannot hang the crawler.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $output = curl_exec($ch);
    if ($output === false) {
        // Surface the reason instead of silently returning a marker-only string.
        error_log('get(): request for ' . $url . ' failed: ' . curl_error($ch));
        $output = '';
    }
    $lasturl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    curl_close($ch);

    return $output."[url:$lasturl]";
}


成品站:www.topwindata.com。Windows 10 一发布,流量就翻了一番,不过每天还是只有 1000 左右的 IP。

你可能感兴趣的:(PHP 采集 Windows 10 App 的信息)