今天闲聊群里有朋友问我,怎么判断今天是不是法定节假日:是上班、放假,还是假期补班。想了想,应该有现成的API可以直接调用,但也可以从国务院官网发布的放假通知里抓取。今天就跟大家分享一下如何用PHP实现爬虫抓取页面。
说到爬虫,大家首先想到的是python,其实什么语言都一样,无非是抓取网页数据,然后正则分析网页结构,把想要的信息取出来,更深层次的无非是递归爬取所有链接,数据库操作等,php做爬虫也不算稀奇,今天这个需求很简单,就是抓取一篇放假通知的文章,从里面整理出法定节假日期间哪几天放假,哪几天补班,整理成规则的数组或者json,主要用到curl和正则。
下面还是先上代码
getHolidays();
}
/**
 * Scrape the State Council holiday notice and dump the resulting calendar.
 *
 * Downloads the notice, inspects every <p> paragraph that contains a
 * "共N天" day count, and builds a flat list of entries. Each entry is an
 * array with the keys "holidayName", "type" and "date" (formatted Y-m-d):
 *   - one "休假" entry for every day of the holiday, starting at the first
 *     date named after the colon in the paragraph's first sentence;
 *   - one "正常上班" entry for every "M月D日" date found in a later sentence
 *     of the same paragraph that mentions "上班" (make-up working days).
 *
 * Dates written without a year use the latest year mentioned so far in the
 * notice, falling back to the current year when none has been seen.
 * Both ASCII and full-width colons are accepted after the holiday name.
 *
 * The list is printed through dump(); nothing is returned. When the page
 * cannot be fetched or contains no paragraphs, an empty list is dumped.
 */
public function getHolidays(){
    $data=[];
    $html=$this->curlGet("http://www.gov.cn/zhengce/content/2021-10/25/content_5644835.htm");

    // curlGet() returns false on failure; treat that like an empty page.
    $paragraphs=[];
    if(is_string($html)&&$html!==''){
        // The previous pattern was empty ('//s') and matched only empty strings.
        // Match whole <p>...</p> elements so strip_tags() below yields their text.
        preg_match_all('/<p\b[^>]*>.*?<\/p>/is',$html,$paragraphs);
    }
    if(empty($paragraphs[0])){
        $this->dump($data);
        return;
    }

    $noticeYear=null; // latest explicit year seen in the notice so far
    foreach($paragraphs[0] as $paragraph){
        $text=trim(strip_tags($paragraph));

        // Only paragraphs stating a day count describe a holiday.
        if(preg_match('/共(\d+)天/',$text,$countMatch)!==1){
            continue;
        }
        $length=(int)$countMatch[1];

        $sentences=explode("。",$text);
        // Normalise a full-width colon so either form separates name and dates.
        $head=str_replace(":",":",$sentences[0]);
        $colonAt=strpos($head,":");
        if($colonAt===false){
            continue;
        }

        // The label looks like "二、春节"; keep everything after the ordinal.
        $labelParts=explode("、",substr($head,0,$colonAt),2);
        $holidayName=trim(isset($labelParts[1])?$labelParts[1]:$labelParts[0]);

        // Remember the newest year the notice names, for dates that omit it.
        if(preg_match_all('/(\d{4})年/',$head,$yearMatch)>0){
            $noticeYear=max(array_map('intval',$yearMatch[1]));
        }
        $fallbackYear=($noticeYear!==null)?$noticeYear:(int)date("Y");

        // First date after the colon is the first day off; the year is optional.
        $range=substr($head,$colonAt+1);
        if(preg_match('/(?:(\d{4})年)?(\d{1,2})月(\d{1,2})日/',$range,$startMatch)!==1){
            continue;
        }
        $year=($startMatch[1]!=='')?(int)$startMatch[1]:$fallbackYear;
        $month=(int)$startMatch[2];
        $day=(int)$startMatch[3];

        for($i=0;$i<$length;$i++){
            // mktime() normalises day overflow, so month/year roll over
            // correctly without relying on fixed 86400-second days.
            $data[]=[
                "holidayName"=>$holidayName,
                "type"=>"休假",
                "date"=>date("Y-m-d",mktime(0,0,0,$month,$day+$i,$year)),
            ];
        }

        // Later sentences that mention "上班" list the make-up working days.
        $sentenceCount=count($sentences);
        for($s=1;$s<$sentenceCount;$s++){
            if(strpos($sentences[$s],"上班")===false){
                continue;
            }
            if(!preg_match_all('/(\d{1,2})月(\d{1,2})日/',$sentences[$s],$workdays,PREG_SET_ORDER)){
                continue;
            }
            foreach($workdays as $workday){
                // Same Y-m-d format as the holiday entries (was "y-n-j").
                $data[]=[
                    "holidayName"=>$holidayName."补班",
                    "type"=>"正常上班",
                    "date"=>date("Y-m-d",mktime(0,0,0,(int)$workday[1],(int)$workday[2],$fallbackYear)),
                ];
            }
        }
    }

    // To emit JSON instead of a dump:
    // echo json_encode($data,JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
    $this->dump($data);
}
/**
 * Fetch a URL with an HTTP GET request via cURL.
 *
 * @param string      $url      Address to request.
 * @param string|null $username Optional HTTP auth user; used only when $password is also given.
 * @param string|null $password Optional HTTP auth password; used only when $username is also given.
 * @return string|false The response body, or false when the handle cannot be
 *                      created, the transfer fails, or the server answers
 *                      with an HTTP status of 400 or above.
 */
private function curlGet($url, $username=null, $password=null) {
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    $header = array(
        'Content-Type: text/xml',
    );
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "GET");
    curl_setopt($ch, CURLOPT_TIMEOUT, 30); // give up after 30 seconds
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
    // Keep TLS certificate and host-name verification on, so an HTTPS
    // response cannot be silently replaced by a man-in-the-middle.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    if(!is_null($username)&&!is_null($password)){
        curl_setopt($ch, CURLOPT_USERPWD, "$username:$password");
    }

    $result = curl_exec($ch);
    $failed = ($result === false) || curl_error($ch) !== '';
    // The status code is only available after the transfer has run.
    $status_code = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($failed || $status_code >= 400) {
        return false;
    }
    return $result;
}
/**
 * Print a value for debugging.
 *
 * Writes a newline, the var_dump() representation of $data, and another
 * newline to the output.
 *
 * @param mixed $data Value to display.
 */
private function dump($data){
    $newline = "\n";
    echo $newline;
    var_dump($data);
    echo $newline;
}
}
// Create the scraper object. NOTE(review): the constructor is not fully visible
// here; it presumably calls getHolidays(), so instantiating runs the scrape — confirm.
$c=new calendar();
代码也很简单,就是用curl请求抓取网页源码,然后想尽一切办法把自己想要的数据抠出来:可以自己用正则匹配,可以用dom类,也可以用第三方库。我这个需求比较简单,就自己用正则和字符串操作把内容转换成数组。不得不说php的精髓就是字符串和数组,尤其是数组,一个array实现了各种类型;字符串的函数也很方便,很多都是直接调用C库函数。
运行了一下,爬虫爬到了放假和补班的日期数据,如下:
array(38) { [0]=> array(3) { ["holidayName"]=> string(6) "元旦" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-01" } [1]=> array(3) { ["holidayName"]=> string(6) "元旦" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-02" } [2]=> array(3) { ["holidayName"]=> string(6) "元旦" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-03" } [3]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-31" } [4]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-01" } [5]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-02" } [6]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-03" } [7]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-04" } [8]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-05" } [9]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-06" } [10]=> array(3) { ["holidayName"]=> string(12) "春节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-1-29" } [11]=> array(3) { ["holidayName"]=> string(12) "春节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-1-30" } [12]=> array(3) { ["holidayName"]=> string(9) "清明节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-03" } [13]=> array(3) { ["holidayName"]=> string(9) "清明节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-04" } [14]=> array(3) { ["holidayName"]=> string(9) "清明节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-05" } [15]=> array(3) { ["holidayName"]=> string(15) "清明节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(6) "22-4-2" } [16]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-30" } [17]=> array(3) { ["holidayName"]=> string(9) 
"劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-01" } [18]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-02" } [19]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-03" } [20]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-04" } [21]=> array(3) { ["holidayName"]=> string(15) "劳动节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-4-24" } [22]=> array(3) { ["holidayName"]=> string(15) "劳动节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(6) "22-5-7" } [23]=> array(3) { ["holidayName"]=> string(9) "端午节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-06-03" } [24]=> array(3) { ["holidayName"]=> string(9) "端午节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-06-04" } [25]=> array(3) { ["holidayName"]=> string(9) "端午节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-06-05" } [26]=> array(3) { ["holidayName"]=> string(9) "中秋节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-09-10" } [27]=> array(3) { ["holidayName"]=> string(9) "中秋节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-09-11" } [28]=> array(3) { ["holidayName"]=> string(9) "中秋节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-09-12" } [29]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-01" } [30]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-02" } [31]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-03" } [32]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-04" } [33]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-05" } [34]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" 
["date"]=> string(10) "2022-10-06" } [35]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-07" } [36]=> array(3) { ["holidayName"]=> string(15) "国庆节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-10-8" } [37]=> array(3) { ["holidayName"]=> string(15) "国庆节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-10-9" } }