最近需要抓取一些微信公众号的文章,并把文章里的视频和视频封面下载下来。这个问题已经解决,下面直接上代码。
代码是基于 Yii 2.0 框架的 console 命令写的,如果用在其他框架或普通脚本中,稍作修改即可。
代码如下:
controller中:
/**
 * Scrapes a WeChat official-account article and downloads the first embedded
 * video (hosted on v.qq.com) together with its cover image.
 *
 * Flow: fetch the article as JSON (`f=json`), find the first video iframe in
 * `content_noencode`, extract its vid, resolve the video metadata through
 * Tools::getinfo(), build one download URL per segment through the getkey
 * API, then pass the cover URL and the first video URL to DownloadGzhInfo().
 *
 * @param string $url Article URL (long or short form).
 * @return bool|null false when the URL is empty, the article or video metadata
 *                   cannot be read, or no video is found; nothing is returned
 *                   once the downloads have been attempted.
 */
public function actionGetwx($url)
{
    if (empty($url)) {
        echo '请输入公众号文章地址';
        return false;
    }

    // WeChat links come in a long and a short form; a long one looks like:
    // http://mp.weixin.qq.com/s?__biz=MzI0NTc1MTczNA==&mid=2247485130&idx=1&sn=945cfb8b0cfdd99f1b730889de0216e2&chksm=...&scene=27
    // Strip the fragment for both forms, otherwise the f=json parameter would
    // be appended after '#' and never reach the server.
    $url = str_replace('#wechat_redirect', '', $url);
    $json = $url . (strpos($url, '?') !== false ? '&f=json' : '?f=json');

    $data = json_decode(Tools::curl_request($json), 1);
    echo '开始采集了...';
    echo '======文章内容====';
    print_r($data);

    // $data holds the article details; bail out when the request or decode failed.
    if (!is_array($data) || empty($data['content_noencode'])) {
        echo '没有获取到文章内容,不采集';
        return false;
    }
    $html = $data['content_noencode'];

    // NOTE(review): the original pattern was cut off in the source listing.
    // This reconstruction captures the iframe's (data-)src URL in group 2 so
    // that $matchs[2] keeps the meaning the code below relies on — confirm
    // against real article HTML.
    preg_match_all('/<iframe\b([^>]*?)\s(?:data-)?src=["\']([^"\']*)["\']/i', $html, $matchs);

    // No video in the article: nothing to collect.
    if (empty($matchs[2])) {
        echo '没有视频匹配到,不采集';
        return false;
    }

    // Example iframe URL:
    // https://v.qq.com/iframe/preview.html?vid=i1324786hv8&width=500&height=375&auto=0
    // Match up to the next '&' OR the end of the string, so a trailing vid works too.
    if (!preg_match('/vid=([^&]+)/', $matchs[2][0], $vidMatch)) {
        echo '没有视频匹配到,不采集';
        return false;
    }
    $vid = $vidMatch[1];
    echo '=======vid=========';
    print_r($vid);
    echo '======vid==========';

    $video_json = Tools::getinfo($vid);
    echo '=====getinfo=====';
    print_r($video_json);
    echo '======getinfo====';

    if (empty($video_json['vl']['vi'][0]) || empty($video_json['fl']['fi'])) {
        echo '获取视频信息失败,不采集';
        return false;
    }

    $videoInfo = $video_json['vl']['vi'][0];
    $title = $videoInfo['ti'];
    // Pick the last listed stream (the original treats it as the high-quality one).
    $fn_pre = $videoInfo['lnk'];
    $host = $videoInfo['ul']['ui'][0]['url'];
    $streams = $video_json['fl']['fi'];
    $seg_cnt = $videoInfo['cl']['fc'];
    $lastStream = end($streams);
    $part_format_id = $lastStream['id'];
    $part_urls = [];
    echo '========计数=======' . $seg_cnt;

    // NOTE(review): the bound runs one request past $seg_cnt. That yields the
    // single required request when the count is 0, but presumably one extra
    // request otherwise — confirm the meaning of cl.fc before tightening it.
    for ($part = 1; $part <= $seg_cnt + 1; $part++) {
        $filename = $fn_pre . '.p' . ($part_format_id % 10000) . '.' . $part . '.mp4';
        $key_api = "http://vv.video.qq.com/getkey?otype=json&platform=11&format="
            . $part_format_id . "&vid=" . $vid . "&filename=" . $filename . "&appver=3.2.19.333";
        $part_info = Tools::curl($key_api);

        // The response is wrapped as "QZOutputJson=...;"; treat anything else as "no key".
        $key_json = null;
        if (is_string($part_info) && preg_match('/QZOutputJson=(.*);$/Uis', $part_info, $keyMatch)) {
            $key_json = json_decode($keyMatch[1], 1);
        }
        echo '=======getkey=============';
        print_r($key_json);
        echo '========getkey=============';

        if (empty($key_json['key'])) {
            // No per-segment key: fall back to the whole-file URL signed with fvkey.
            $vkey = $videoInfo['fvkey'];
            $url = $host . $fn_pre . '.mp4?vkey=' . $vkey;
        } else {
            $vkey = $key_json['key'];
            $url = $host . $filename . "?vkey=" . $vkey;
        }
        $part_urls[] = $url;
    }

    // The resolved download URLs.
    print_r($part_urls);

    // A disabled fallback used to build a lower-quality URL from each
    // vl.vi[] entry (self host + fn + fvkey) when no part URL was produced.

    // Only the first URL is downloaded.
    $videoUrl = $part_urls[0];

    // Cover image URL pattern, e.g.:
    // https://shp.qpic.cn/qqvideo_ori/0/i1324786hv8_496_280/0
    // https://shp.qpic.cn/qqvideo_ori/0/o13389u1u8i_496_280/0
    $imageUrl = sprintf('https://shp.qpic.cn/qqvideo_ori/0/%s_496_280/0', $vid);
    $type = 'png';
    $gzhName = $data['nick_name'];
    $savePath = '/opt/sdb/samba/pub/miaomiao/';

    // Download the cover image.
    self::DownloadGzhInfo($imageUrl, $savePath, $gzhName, $data['title'], $type, $title);
    // Download the video.
    self::DownloadGzhInfo($videoUrl, $savePath, $gzhName, $data['title'], 'mp4', $title);
}
/**
 * Downloads one asset with wget into
 * <save_path><gzhName>/<filtered article title>/<videoTitle>.<extension>.
 *
 * @param string $url        Remote file to download.
 * @param string $save_path  Base directory; it is concatenated directly with
 *                           $gzhName, so it is expected to end with a slash.
 * @param string $gzhName    Official-account name, used as the first sub-directory.
 * @param string $title      Article title; passed through Tools::strFilter() and
 *                           used as the second sub-directory.
 * @param string $type       'png' forces a .png extension; any other value makes
 *                           the extension come from Tools::getExt($url).
 * @param string $videoTitle Base name of the saved file.
 * @return bool false when a directory cannot be created or wget exits non-zero,
 *              true otherwise.
 */
public static function DownloadGzhInfo($url, $save_path, $gzhName, $title, $type, $videoTitle)
{
    $title = Tools::strFilter($title);

    // The original ran exec('cd ...') here; that only changes directory inside
    // a throw-away shell and has no effect, so it is dropped.

    $accountDir = $save_path . $gzhName;
    if (!file_exists($accountDir) && !mkdir($accountDir, 0777, true)) {
        return false;
    }

    $articleDir = $accountDir . '/' . $title;
    if (!file_exists($articleDir) && !mkdir($articleDir, 0777, true)) {
        return false;
    }

    $extension = ($type == 'png') ? $type : Tools::getExt($url);

    // A slash inside the title would point at a non-existent sub-directory.
    $fileName = str_replace('/', '_', $videoTitle) . '.' . $extension;
    $target = $articleDir . '/' . $fileName;

    // Quote every argument: download URLs contain '?' and '&', and titles may
    // contain spaces or quotes, all of which the shell would otherwise interpret.
    // '--' stops wget from treating a URL that starts with '-' as an option.
    $cmd = 'wget -c -O ' . self::shellQuote($target) . ' -- ' . self::shellQuote($url);
    exec($cmd, $output, $status);

    return $status === 0;
}

/**
 * Wraps a value in POSIX single quotes for safe use as one shell argument.
 *
 * Done by hand instead of escapeshellarg() because that function can drop
 * multibyte characters depending on the process locale, and the paths here
 * carry Chinese titles.
 *
 * @param string $arg Raw argument.
 * @return string Quoted argument.
 */
private static function shellQuote($arg)
{
    return "'" . str_replace("'", "'\\''", (string) $arg) . "'";
}
用到的Tools方法:
/**
 * Performs an HTTP request with cURL.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST data; leave empty for a GET request.
 *                                   Arrays are URL-encoded, strings are sent as-is.
 * @param string       $cookie       Cookie string to send with the request.
 * @param int          $returnCookie Non-zero to also return the first response cookie.
 * @return string|array|bool The response body; or, when $returnCookie is set,
 *                           ['cookie' => ..., 'content' => ...]; or the cURL
 *                           error message when the transfer failed.
 */
public static function curl_request($url, $post = '', $cookie = '', $returnCookie = 0)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, "http://XXX");
    if ($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        // http_build_query() only accepts arrays/objects; pass strings through untouched.
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
    }
    if ($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }
    // Including the headers in the output is what makes the cookie extractable below.
    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 60);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);
    if (curl_errno($curl)) {
        // Read the message before closing, and close on this path too so the
        // handle is not leaked.
        $error = curl_error($curl);
        curl_close($curl);
        return $error;
    }
    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    // Split headers from body; tolerate a response without the blank-line separator.
    $parts = explode("\r\n\r\n", (string) $data, 2);
    $header = $parts[0];
    $body = isset($parts[1]) ? $parts[1] : '';

    preg_match_all("/Set\-Cookie:([^;]*);/", $header, $matches);

    $info = [];
    // First cookie only, without the whitespace that follows the colon;
    // empty string when the response set no cookie.
    $info['cookie'] = isset($matches[1][0]) ? ltrim($matches[1][0]) : '';
    $info['content'] = $body;
    return $info;
}
/**
 * Fetches the metadata of a video from the vv.video.qq.com getinfo endpoint.
 *
 * The endpoint answers with "QZOutputJson={...};"; the JSON part is extracted
 * and decoded into an associative array.
 *
 * An earlier, disabled variant of this request targeted platform 11001 and
 * also sent guid, sdtfrom, _qv_rmt and _qv_rmt2 parameters.
 *
 * @param string $vid Video id.
 * @return array|null Decoded response, or null when the request failed or the
 *                    response was not in the expected format.
 */
public static function getinfo($vid)
{
    // Encode the id so that unexpected characters cannot alter the query string.
    $infourl = 'https://vv.video.qq.com/getinfo?otype=json&appver=3.2.19.333&platform=11&defnpayver=1&vid='
        . rawurlencode($vid);
    $data = self::curl($infourl);

    // curl() returns false on failure; an unmatched response leaves no capture group.
    if (!is_string($data) || !preg_match('/QZOutputJson=(.*);$/Uis', $data, $info)) {
        return null;
    }

    return json_decode($info[1], 1);
}
/**
 * Returns the file extension of the path component of a URL.
 *
 * The query string and fragment are ignored, so
 * "http://host/a/b.mp4?vkey=1" yields "mp4".
 *
 * @param string $url URL to inspect.
 * @return string The text after the last dot of the file name, or the literal
 *                'no extension' when the URL has no path or the file name has no dot.
 */
public static function getExt($url)
{
    // parse_url() yields null when there is no path and false for a malformed URL.
    $path = parse_url($url, PHP_URL_PATH);
    if (!is_string($path) || $path === '') {
        return 'no extension';
    }

    $file = basename($path);
    $dot = strrpos($file, '.');
    if ($dot === false) {
        return 'no extension';
    }

    // Cast keeps the result a string for a trailing dot on every PHP version.
    return (string) substr($file, $dot + 1);
}
/**
 * Removes spaces and a fixed set of ASCII and CJK punctuation from a string,
 * then trims the result.
 *
 * @param string $str Input text.
 * @return string The text with every listed sequence removed.
 */
public static function strFilter($str)
{
    // Removal order matters only for the multi-character entries; the order
    // below is the order in which the sequences are stripped.
    // NOTE(review): the previous implementation stripped several of these
    // characters twice (space, '!', '(', ')', ';', ':', ',', '?'); the second
    // copies were possibly full-width variants lost in transcription — confirm.
    $unwanted = [
        ' ', '`', '·', '~', '!', '@', '#', '$', '¥', '%', '^', '……', '&', '*',
        '(', ')', '-', '_', '——', '+', '=', '|', '\\', '[', ']', '【', '】',
        '{', '}', ';', ':', '\'', '"', '“', '”', ',', '<', '>', '《', '》',
        '.', '。', '/', '、', '?',
    ];

    return trim(str_replace($unwanted, '', $str));
}
/**
 * Performs a GET request with browser-like headers and a per-host cookie jar.
 *
 * Cookies are kept in "<third '/'-separated URL segment>.cookie" next to this
 * file (for "scheme://host/..." URLs that segment is the host).
 *
 * @param string $url    URL to request.
 * @param array  $option Extra cURL options (CURLOPT_* => value); they override
 *                       the defaults set here.
 * @return string|bool The response body, or false when the transfer failed.
 */
public static function curl($url, $option = [])
{
    $split = explode('/', $url);
    // A URL without "scheme://" has no such segment; fall back to a fixed name.
    $jarName = (isset($split[2]) && $split[2] !== '') ? $split[2] : 'default';
    $cookiejar = str_replace('\\', '/', dirname(__FILE__)) . '/' . $jarName . '.cookie';

    $defaults = [
        CURLOPT_URL => $url,
        // Each header must be one "Name: value" string. The previous list had
        // stray quotes inside three entries, which produced invalid header names.
        CURLOPT_HTTPHEADER => [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset: UTF-8,*;q=0.5',
            'Accept-Language: en-US,en;q=0.8',
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
        ],
        // Let cURL advertise the encodings it supports and decode the body
        // itself, instead of a hand-written Accept-Encoding header whose
        // compressed response nothing would decompress.
        CURLOPT_ENCODING => '',
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        // NOTE(review): certificate verification is disabled, as before;
        // enable it if the host has a usable CA bundle.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_COOKIEJAR => $cookiejar,
        CURLOPT_COOKIEFILE => $cookiejar,
    ];

    // Array union keeps the caller's value wherever both sides define an option.
    $options = (is_array($option) ? $option : []) + $defaults;

    $ch = curl_init();
    curl_setopt_array($ch, $options);
    $response = curl_exec($ch);
    curl_close($ch);

    return $response;
}