PHP cURL 扩展的一个简单示例:爬取新闻网站数据

**本文只提供一个简单的思路作为参考。当前脚本仅适用于指定的网站:每个网站的结构都不一样,不可能做到通用,大家可以结合自己要爬取的新闻网站的结构做出调整。**


// Raise PHP's maximum execution time to half an hour (1800 seconds); according to the
// author, the usual 30-second default is not long enough for the crawl to finish.
ini_set(
    'max_execution_time',
    '1800'
);
/**
 * Download a page with cURL and return its body converted from GB2312 to UTF-8.
 *
 * Every "gb2312" label in the body (in any letter case) is rewritten to "utf-8" so that a
 * charset declaration inside the page agrees with the converted bytes.
 *
 * @param string $url Address of the page to fetch.
 * @return string Converted body; an empty string when the transfer or the conversion fails.
 */
function curlGetData($url)
{
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('curlGetData: could not initialise cURL', E_USER_WARNING);
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, false);
    // NOTE(review): TLS peer/host verification is switched off, exactly as in the original
    // script. This accepts any certificate; re-enable it if the target site has a valid one.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Bound every request so one stalled connection cannot eat the whole execution budget.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);

    $response = curl_exec($ch);
    // Read the error text before the handle is closed.
    $error = curl_error($ch);
    curl_close($ch);

    if (!is_string($response)) {
        trigger_error("curlGetData: request to {$url} failed: {$error}", E_USER_WARNING);
        return '';
    }

    // The label is plain ASCII, so it can be rewritten before the byte conversion.
    $response = str_ireplace('gb2312', 'utf-8', $response);
    $converted = iconv('gb2312', 'utf-8//IGNORE', $response);

    // iconv() returns false on failure; keep the return type a string for callers.
    return $converted === false ? '' : $converted;
}

// Reads the details of a single news article.
//
// The news ids are processed one after another; each call needs the news id and the
// category id, and the article URL is built from the news id.
//
// NOTE(review): from the first `$pattern` assignment to the end of this block the text
// appears to have been damaged when the article was extracted:
//   - many statements and their `//` comments are collapsed onto single lines, so as
//     written a `//` comment swallows the statements that follow it on the same line;
//   - several regex / str_replace() literals now contain only line breaks, which suggests
//     the HTML tags they held were stripped;
//   - one pattern literal (the one used to work out the page total) is cut off right
//     after its opening delimiter.
// The code below is therefore left exactly as found, including its original inline
// comments. Restore the missing literals and line breaks from the original script before
// running it.
//
// Judging from the collapsed text, the same block also contains curlGetByCateId($cid)
// (collects the news ids of one category page by page and inserts each article through
// PDO) and the entry call curlGetByCateId(7) - TODO confirm against the original script.
function getNewsInfo($id, $cateid){
    // Article URL; the host part looks like a placeholder to be replaced with the real site.
    $url = "网站url地址/news_view.asp?id={$id}";
    // Download the article page through curlGetData().
    $result = curlGetData($url);
    $pattern = '/
[\s\S]*
/is'; preg_match_all($pattern, $result, $result); $result = $result[0][0]; // 文章所有内容 // 依次获取信息 $newsInfo = []; $newsInfo['id'] = $id; // 获取新闻标题 $pattern = '/

[\s\S*]+<\/h2>/is'; preg_match($pattern, $result, $newsInfo['title']); $newsInfo['title'] = str_replace('

', '', $newsInfo['title']); $newsInfo['title'] = str_replace('

'
, '', $newsInfo['title']); $newsInfo['title'] = $newsInfo['title'][0]; // 获取新闻来源 $pattern = '/来源:[\w\x{4e00}-\x{9fa5}]+/ius'; preg_match($pattern, $result, $newsInfo['nfrom']); $newsInfo['nfrom'] = str_replace('来源:', '', $newsInfo['nfrom']); $newsInfo['nfrom'] = $newsInfo['nfrom'][0]; // 获取新闻作者 $pattern = '/作者:[\w\x{4e00}-\x{9fa5}]+/ius'; preg_match($pattern, $result, $newsInfo['author']); $newsInfo['author'] = str_replace('作者:', '', $newsInfo['author']); $newsInfo['author'] = $newsInfo['author'][0]; // 获取新闻发布时间 $pattern = '/发布:\d{2,4}\/\d{1,2}\/\d{1,2}/ius'; preg_match($pattern, $result, $newsInfo['createtime']); $newsInfo['createtime'] = str_replace('发布:', '', $newsInfo['createtime']); $newsInfo['createtime'] = $newsInfo['createtime'][0]; // 新闻栏目id $newsInfo['cateid'] = $cateid; // 点击量 $pattern = '/点击:\d*/is'; preg_match($pattern, $result, $newsInfo['click']); $newsInfo['click'] = intval(str_replace('点击:', '', $newsInfo['click'][0])); // 新闻内容 $pattern = '/
[\s\S]*
/is'; preg_match($pattern, $result, $newsInfo['content']); $newsInfo['content'] = str_replace('
', '', $newsInfo['content'][0]); $newsInfo['content'] = str_replace('
', '', $newsInfo['content']); $newsInfo['content'] = substr($newsInfo['content'], 0, strrpos($newsInfo['content'], '
'
)); // 图片url地址 $pattern = '/src=\"\/uploadfile\/[\w.]+\"/is'; preg_match_all($pattern, $newsInfo['content'], $newsimage); $newsimage = $newsimage[0]; $newsimage = str_replace('src="', '该网站地址', $newsimage); $newsimage = str_replace('"', '', $newsimage); foreach($newsimage as $val) { $picname = str_replace('新闻网站地址/uploadfile/', '', $val); $im = file_get_contents($val); file_put_contents('./uploadfile/' . $picname, $im); // crabImage($val, './uploadfile/', $filename); } return $newsInfo; } // 输入一个栏目id实现全部爬取 function curlGetByCateId($cid){ $url = "新闻网站地址/news_category.asp?id={$cid}"; $response = curlGetData($url); $pattern = '/; preg_match_all($pattern, $response, $response); $pageTotal = ceil(intval(substr($response[0][0], 18)) / 20); // 该栏目下新闻总页数 // 依次循环页码,获取该栏目所有新闻的id $newsId = []; // 保存所有的新闻id for ($i = 1; $i <= $pageTotal; $i++) { $url = "新闻网站地址/news_category.asp?id={$cid}&page={$i}"; $response = curlGetData($url); $pattern = '/href="news_view.asp\?id=\d+/is'; preg_match_all($pattern, $response, $response); $response = $response[0]; foreach ($response as $k => $v) { $response[$k] = intval(substr($v, 23)); } $newsId = array_merge($response, $newsId); } // 连接数据库 $pdo = new PDO('mysql:host=主机地址;dbname=curldemo;charset=utf8', '数据库用户名', '密码'); $pdo->exec('set names utf8'); $datas = []; //存放所有的数据 foreach($newsId as $val) { $datas = getNewsInfo($val, $cid); $sql = "insert into newsinfo(id,title,nfrom,content,createtime,author,click,cateid) values(:id,:title,:nfrom,:content,:createtime,:author,:click,:cateid)"; $stmt = $pdo->prepare($sql); $stmt->execute($datas); } } // 根据栏目id爬出来该栏目总页数(因为数据量较大,一下子爬取整站数据可能需要的时间比较长,所以我做了调整,根据栏目id每次只爬取一个栏目的内容,想要一次性爬完的同学可以自行扩展,无非就是多一个循环而已) // 入口函数 curlGetByCateId(7); echo "爬完了!!!造作啊!" . '7';

原文链接:https://blog.csdn.net/qq_42195688/article/details/80578107

你可能感兴趣的:(php数据爬取与信息采集)