<?php
/**
 * Test script for the new Microsoft Store app page crawler.
 *
 * Downloads a single store page, pulls each field out of the HTML with a
 * regular expression and prints it as a "label:value" line. A missing
 * required field prints a diagnostic and stops the run; a missing optional
 * field only prints a diagnostic.
 */

#$url = 'http://apps.microsoft.com/windows/en-us/app/fotor/6f797ba2-500d-4dee-9c5a-13c2d818c958';
$url = 'https://www.microsoft.com/en-us/store/apps/adobe-photoshop-express/9wzdncrfj27n';
$url = trim($url);

$response = get($url);

// get() appends "[url:<effective URL>]" to the body. Split that marker off
// before decoding entities, so the URL is taken from the real end of the
// response (not from a "[url:" that may occur inside the page) and is not
// itself entity-decoded.
$newurl = null;
$markerPos = strrpos($response, '[url:');
if ($markerPos !== false && substr($response, -1) === ']') {
    // The cast covers older PHP versions where substr() yields false for an empty result.
    $newurl = (string) substr($response, $markerPos + 5, -1);
    $response = (string) substr($response, 0, $markerPos);
}

$content = html_entity_decode($response, ENT_HTML5, 'UTF-8');

//pfn
$pfn = capture('/data-pfn="(.*)">/isU', $content);
if ($pfn === null) {
    fail('pfn error:' . $url);
}
echo "pfn:" . $pfn . "\n";

//new url (the URL the request ended up at after redirects)
if ($newurl === null) {
    fail('get no new url');
}
echo "url:" . $newurl . "\n";

//icon and its background colour come from the same tag, so match them together
if (!preg_match('/class="pull-left ph-logo">.*src="(.*)".*style="background-color:(.*);.*"/isU', $content, $match)) {
    fail('get no icon');
}
$icon = $match[1];
$backgroundcolor = $match[2];
echo "icon:" . $icon . "\n";
echo "backgroundcolor:" . $backgroundcolor . "\n";

//name
$name = capture('/id="page-title".*itemprop="name">(.*)<\//', $content);
if ($name === null) {
    fail('get no name');
}
echo "name:" . $name . "\n";

//alias (the path segment that follows "apps/" in the final URL)
$alias = capture('/apps\/(.*)\//isU', $newurl);
if ($alias === null) {
    fail('get no alias');
}
echo "alias:" . $alias . "\n";

//rating
$rating = capture('/class="srv_ratingsScore win-rating-average">(.*)<\//', $content);
if ($rating === null) {
    fail('get no rating');
}
echo "rating:" . $rating . "\n";

//rating num: strip the thousands separators and the "ratings" suffix
$ratingTotal = capture('/class="win-rating-total">(.*)<\//', $content);
if ($ratingTotal === null) {
    fail('get no rating num');
}
$ratingcount = trim(str_replace(array(',', 'ratings'), '', $ratingTotal));
echo "Rating num:" . $ratingcount . "\n";

//price
$price = capture('/class="price srv_price"><span class="header-sub">(.*)<\//', $content);
if ($price === null) {
    fail('get no price');
}
// NOTE: the "prcie" label is misspelled; it is kept so the output format stays unchanged.
echo "prcie:" . $price . "\n";

//category: prefer the sub-category meta tag, fall back to the top-level category
$category = capture('/<meta name="ms.prod_sbcat" content="(.*)" \/>/isU', $content);
if ($category === null) {
    $category = capture('/<meta name="ms.prod_cat" content="(.*)" \/>/isU', $content);
}
if ($category === null) {
    fail('get no category');
}
$category = trim($category);
echo "category:" . $category . "\n";

//content rating (optional, defaults to 'all')
$contentRating = capture('/Content Rating: <a .*>(.*)<\//isU', $content);
if ($contentRating === null) {
    echo 'get no content rating' . "\n";
    $contentRating = 'all';
} else {
    $contentRating = trim($contentRating);
    echo "content rating:" . $contentRating . "\n";
}

//publisher
$publisher = capture(detail_pattern('Publisher'), $content);
if ($publisher === null) {
    fail('get no publisher');
}
$publisher = trim($publisher);
echo "publisher:" . $publisher . "\n";

//works on (optional)
$workson = capture('/Works on: (.*)</isU', $content);
if ($workson === null) {
    echo 'get no works platform' . "\n";
} else {
    $workson = trim($workson);
    echo 'works on:' . $workson . "\n";
}

//size, supported processors, age (all optional, all in the same <dt>/<div> layout)
$details = array();
$optionalDetails = array(
    'size'       => 'Approximate size',
    'processors' => 'Supported processors',
    'age'        => 'Age rating',
);
foreach ($optionalDetails as $key => $label) {
    $value = capture(detail_pattern($label), $content);
    if ($value === null) {
        echo 'get no ' . $key . "\n";
        continue;
    }
    $details[$key] = trim($value);
    echo $key . ':' . $details[$key] . "\n";
}

//languages (optional): one <div> per language inside the <dd>
$languageBlock = capture('/Supported languages<\/dt>.*<dd .*>(.*)<\/dd>/isU', $content);
if ($languageBlock === null) {
    echo 'get no languages' . "\n";
} elseif (preg_match_all('/<div>([^<].*)<\/div>/', $languageBlock, $temp)) {
    $languages = implode(",", $temp[1]);
    echo "languages:" . $languages . "\n";
}

//features (optional): one <li> per feature inside the section's <ul>
$featureBlock = capture('/class="section-title.*">Features.*<ul>(.*)<\/ul>/isU', $content);
if ($featureBlock === null) {
    echo 'get no features' . "\n";
} elseif (preg_match_all('/<li class="avoid-break">(.*)<\/li>/isU', $featureBlock, $temp)) {
    $features = $temp[1];
    echo 'features:';
    print_r($features);
    echo "\n";
}

//release notes (optional)
$releasenotes = capture('/class="section-title.*">Version Notes.*<p>(.*)<\/p>/isU', $content);
if ($releasenotes === null) {
    echo 'no version notes' . "\n";
} else {
    echo "release Notes:" . $releasenotes . "\n";
}

//screenshots
if (!preg_match_all('/class="media-img ratio-16-9">.*<img src="(.*)".*\/>/isU', $content, $match)) {
    fail('get no screenshots');
}
$screenshots = $match[1];
echo "screenshots:";
print_r($screenshots);
echo "\n";

//description
$description = capture('/<div class="showmore m-t-pdp">.*<p.*>(.*)<\//isU', $content);
if ($description === null) {
    fail('get no description content');
}
echo 'description:' . $description . "\n";

exit();

/**
 * Prints a diagnostic line and stops the script.
 *
 * @param string $message Text to print; a newline is appended.
 * @return void This function does not return.
 */
function fail($message)
{
    echo $message, "\n";
    exit();
}

/**
 * Returns the first capture group of the first match of $pattern in $subject.
 *
 * @param string $pattern PCRE pattern with at least one capture group.
 * @param string $subject Text to search.
 * @return string|null The captured text (possibly empty), or null when the pattern does not match.
 */
function capture($pattern, $subject)
{
    return preg_match($pattern, $subject, $m) ? $m[1] : null;
}

/**
 * Builds the pattern for a details entry: a <dt> ending in $label followed by
 * a <div class="content..."> whose text is captured.
 *
 * @param string $label The <dt> caption, e.g. 'Publisher'.
 * @return string PCRE pattern with one capture group.
 */
function detail_pattern($label)
{
    return '/' . preg_quote($label, '/') . '<\/dt>.*<div class="content.*".*>(.+)<\//isU';
}

/**
 * Fetches $url with cURL, following redirects.
 *
 * The return value is the response body followed by the marker
 * "[url:<effective URL>]", where the effective URL is the one reported by
 * cURL after redirects. On a transfer failure a "curl error:" line is printed
 * and only the marker is returned.
 *
 * @param string $url Address to download.
 * @return string Response body plus the trailing "[url:...]" marker.
 */
function get($url)
{
    $ch = curl_init($url);
    if ($ch === false) {
        echo 'curl error:could not initialise a request for ' . $url . "\n";
        return '[url:' . $url . ']';
    }

    curl_setopt_array($ch, array(
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        // Bound the request so an unresponsive server cannot hang the run.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ));

    $output = curl_exec($ch);
    if ($output === false) {
        // Report the transport failure instead of letting it show up later as a parse error.
        echo 'curl error:' . curl_error($ch) . "\n";
        $output = '';
    }

    $lasturl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    curl_close($ch);

    return $output . '[url:' . $lasturl . ']';
}
// Production site: www.topwindata.com. Traffic doubled as soon as Windows 10 was released, although it is still only around 1,000 IPs.