php实现爬虫抓取法定节假日放假和补班安排数据

今天闲聊群里有朋友问我:怎么判断今天是不是法定节假日——是正常上班、放假,还是假期补班?想了想,应该有现成的 API,直接调用就行;但也可以从国务院官网发布的放假通知里抓取。今天就跟大家分享一下如何用 PHP 实现爬虫抓取页面。

说到爬虫,大家首先想到的是 Python。其实用什么语言都一样:无非是抓取网页数据,再用正则分析网页结构,把想要的信息取出来;更深一层也不过是递归爬取所有链接、操作数据库等。用 PHP 做爬虫并不稀奇。今天这个需求很简单:抓取一篇放假通知,从中整理出法定节假日期间哪几天放假、哪几天补班,输出成规则的数组或者 JSON,主要用到 curl 和正则。

下面还是先上代码

getHolidays();
	}
	/**
	 * Scrape the State Council holiday notice and list every day off and
	 * every make-up workday it announces.
	 *
	 * Each paragraph of the notice that states a duration ("共N天") is read as
	 * "<ordinal>、<holiday name>:<start date>...,共N天。<make-up days>上班。".
	 * Only the first holiday normally spells out the year, so the most recent
	 * explicit year is carried forward to the following paragraphs.
	 *
	 * The collected list is printed through dump() and also returned.
	 *
	 * @return array list of items with the keys "holidayName", "type" and
	 *               "date"; every date is formatted as Y-m-d
	 */
	public function getHolidays(){
		$data=[];
		$html=$this->curlGet("http://www.gov.cn/zhengce/content/2021-10/25/content_5644835.htm");
		$paragraphs=[];
		// curlGet() returns false on failure; only scan real markup.
		if(is_string($html)&&$html!==""){
			if(preg_match_all('/<p\b[^>]*>.*?<\/p>/is',$html,$pHandel)&&isset($pHandel[0])){
				$paragraphs=$pHandel[0];
			}
		}
		// Optional "YYYY年" followed by "M月D日".
		$datePattern='/(?:([0-9]{4})年)?([0-9]{1,2})月([0-9]{1,2})日/u';
		// Fallback used until the notice itself names a year.
		$year=(int)date("Y");
		foreach($paragraphs as $paragraph){
			$text=trim(strip_tags($paragraph));
			if(preg_match('/共([0-9]+)天/u',$text,$lengthMatch)!==1){
				continue;
			}
			$sentences=explode("。",$text);
			// Accept both the ASCII and the full-width colon after the holiday name.
			$head=preg_split('/[::]/u',$sentences[0],2);
			if(!is_array($head)||count($head)<2){
				continue;
			}
			$nameParts=explode("、",$head[0]);
			$holidayName=trim(isset($nameParts[1])?$nameParts[1]:$nameParts[0]);
			// The first date after the colon is the first day off; this also
			// covers single-day holidays that have no "日至" range.
			if(preg_match($datePattern,$head[1],$startMatch)!==1){
				continue;
			}
			$startYear=$startMatch[1]!==""?(int)$startMatch[1]:$year;
			if(!checkdate((int)$startMatch[2],(int)$startMatch[3],$startYear)){
				continue;
			}
			$start=strtotime(sprintf("%04d-%02d-%02d",$startYear,$startMatch[2],$startMatch[3]));
			if($start===false){
				continue;
			}
			$length=(int)$lengthMatch[1];
			for($i=0;$i<$length;$i++){
				$data[]=[
					"holidayName"=>$holidayName,
					"type"=>"休假",
					// Calendar arithmetic instead of adding multiples of 86400 seconds.
					"date"=>date("Y-m-d",strtotime("+".$i." day",$start)),
				];
			}
			// Any later sentence that mentions "上班" lists the make-up workdays.
			foreach(array_slice($sentences,1) as $sentence){
				if(strpos($sentence,"上班")===false){
					continue;
				}
				if(!preg_match_all($datePattern,$sentence,$workdays,PREG_SET_ORDER)){
					continue;
				}
				foreach($workdays as $workday){
					$workYear=$workday[1]!==""?(int)$workday[1]:$startYear;
					$data[]=[
						"holidayName"=>$holidayName."补班",
						"type"=>"正常上班",
						// Same zero-padded four-digit-year format as the days off.
						"date"=>sprintf("%04d-%02d-%02d",$workYear,$workday[2],$workday[3]),
					];
				}
			}
			// Carry the latest year named in this paragraph forward, so a holiday
			// that spans New Year leaves the new year for the paragraphs after it.
			if(preg_match_all('/([0-9]{4})年/u',$text,$yearMatches)&&!empty($yearMatches[1])){
				$year=max(array_map('intval',$yearMatches[1]));
			}
		}
		//echo json_encode($data,JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
		$this->dump($data);
		return $data;
	}
	/**
	 * Perform an HTTP GET request with cURL and return the response body.
	 *
	 * @param string      $url      address to fetch
	 * @param string|null $username optional user name for HTTP authentication
	 * @param string|null $password optional password for HTTP authentication
	 * @return string|false the response body, or false when the handle cannot
	 *                      be created, the transfer fails, or the server
	 *                      answers with an HTTP status of 400 or above
	 */
	private function curlGet($url, $username=null, $password=null) {
		$ch = curl_init();
		if ($ch === false) {
			return false;
		}
		$header = array(
			'Content-Type: text/xml',
		);
		curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_HTTPGET, true);
		curl_setopt($ch, CURLOPT_TIMEOUT, 30); // 30 second timeout
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
		// Keep certificate and host name verification on: credentials may be
		// sent over this connection.
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
		if(!is_null($username)&&!is_null($password)){
			curl_setopt($ch, CURLOPT_USERPWD, "$username:$password");
		}
		$result = curl_exec($ch);
		$failed = ($result === false) || curl_errno($ch) !== 0;
		// The status code is only available once the transfer has run.
		$status_code = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
		curl_close($ch);
		if ($failed || $status_code >= 400) {
			return false;
		}
		return $result;
	}
	/**
	 * Print a value with var_dump(), echoing a line break before and after it.
	 *
	 * @param mixed $data value to display
	 */
	private function dump($data){
		// NOTE(review): both echoed strings contain only a newline; they may
		// originally have held wrapping markup that was lost - confirm.
		echo "
";
		var_dump($data);
		// The line that terminates the next string also closes this method and
		// its enclosing scope, then instantiates calendar.
		echo "
"; } } $c=new calendar();

思路也很简单:用 curl 请求抓取网页源码,再想尽一切办法把自己想要的数据抠出来——可以自己用正则匹配,可以用 DOM 类,也可以用第三方库。我这个需求比较简单,就自己用正则和字符串操作把内容转换成数组。不得不说,PHP 的精髓就是字符串和数组,尤其是数组:一个 array 实现了各种类型;字符串的函数也很方便,很多都是直接调用 C 库函数。

运行了一下,爬虫爬到了放假和补班的日期数据,如下:

array(38) {
  [0]=>
  array(3) {
    ["holidayName"]=>
    string(6) "元旦"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-01-01"
  }
  [1]=>
  array(3) {
    ["holidayName"]=>
    string(6) "元旦"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-01-02"
  }
  [2]=>
  array(3) {
    ["holidayName"]=>
    string(6) "元旦"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-01-03"
  }
  [3]=>
  array(3) {
    ["holidayName"]=>
    string(6) "春节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-01-31"
  }
  [4]=>
  array(3) {
    ["holidayName"]=>
    string(6) "春节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-02-01"
  }
  [5]=>
  array(3) {
    ["holidayName"]=>
    string(6) "春节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-02-02"
  }
  [6]=>
  array(3) {
    ["holidayName"]=>
    string(6) "春节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-02-03"
  }
  [7]=>
  array(3) {
    ["holidayName"]=>
    string(6) "春节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-02-04"
  }
  [8]=>
  array(3) {
    ["holidayName"]=>
    string(6) "春节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-02-05"
  }
  [9]=>
  array(3) {
    ["holidayName"]=>
    string(6) "春节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-02-06"
  }
  [10]=>
  array(3) {
    ["holidayName"]=>
    string(12) "春节补班"
    ["type"]=>
    string(12) "正常上班"
    ["date"]=>
    string(7) "22-1-29"
  }
  [11]=>
  array(3) {
    ["holidayName"]=>
    string(12) "春节补班"
    ["type"]=>
    string(12) "正常上班"
    ["date"]=>
    string(7) "22-1-30"
  }
  [12]=>
  array(3) {
    ["holidayName"]=>
    string(9) "清明节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-04-03"
  }
  [13]=>
  array(3) {
    ["holidayName"]=>
    string(9) "清明节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-04-04"
  }
  [14]=>
  array(3) {
    ["holidayName"]=>
    string(9) "清明节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-04-05"
  }
  [15]=>
  array(3) {
    ["holidayName"]=>
    string(15) "清明节补班"
    ["type"]=>
    string(12) "正常上班"
    ["date"]=>
    string(6) "22-4-2"
  }
  [16]=>
  array(3) {
    ["holidayName"]=>
    string(9) "劳动节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-04-30"
  }
  [17]=>
  array(3) {
    ["holidayName"]=>
    string(9) "劳动节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-05-01"
  }
  [18]=>
  array(3) {
    ["holidayName"]=>
    string(9) "劳动节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-05-02"
  }
  [19]=>
  array(3) {
    ["holidayName"]=>
    string(9) "劳动节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-05-03"
  }
  [20]=>
  array(3) {
    ["holidayName"]=>
    string(9) "劳动节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-05-04"
  }
  [21]=>
  array(3) {
    ["holidayName"]=>
    string(15) "劳动节补班"
    ["type"]=>
    string(12) "正常上班"
    ["date"]=>
    string(7) "22-4-24"
  }
  [22]=>
  array(3) {
    ["holidayName"]=>
    string(15) "劳动节补班"
    ["type"]=>
    string(12) "正常上班"
    ["date"]=>
    string(6) "22-5-7"
  }
  [23]=>
  array(3) {
    ["holidayName"]=>
    string(9) "端午节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-06-03"
  }
  [24]=>
  array(3) {
    ["holidayName"]=>
    string(9) "端午节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-06-04"
  }
  [25]=>
  array(3) {
    ["holidayName"]=>
    string(9) "端午节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-06-05"
  }
  [26]=>
  array(3) {
    ["holidayName"]=>
    string(9) "中秋节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-09-10"
  }
  [27]=>
  array(3) {
    ["holidayName"]=>
    string(9) "中秋节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-09-11"
  }
  [28]=>
  array(3) {
    ["holidayName"]=>
    string(9) "中秋节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-09-12"
  }
  [29]=>
  array(3) {
    ["holidayName"]=>
    string(9) "国庆节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-10-01"
  }
  [30]=>
  array(3) {
    ["holidayName"]=>
    string(9) "国庆节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-10-02"
  }
  [31]=>
  array(3) {
    ["holidayName"]=>
    string(9) "国庆节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-10-03"
  }
  [32]=>
  array(3) {
    ["holidayName"]=>
    string(9) "国庆节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-10-04"
  }
  [33]=>
  array(3) {
    ["holidayName"]=>
    string(9) "国庆节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-10-05"
  }
  [34]=>
  array(3) {
    ["holidayName"]=>
    string(9) "国庆节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-10-06"
  }
  [35]=>
  array(3) {
    ["holidayName"]=>
    string(9) "国庆节"
    ["type"]=>
    string(6) "休假"
    ["date"]=>
    string(10) "2022-10-07"
  }
  [36]=>
  array(3) {
    ["holidayName"]=>
    string(15) "国庆节补班"
    ["type"]=>
    string(12) "正常上班"
    ["date"]=>
    string(7) "22-10-8"
  }
  [37]=>
  array(3) {
    ["holidayName"]=>
    string(15) "国庆节补班"
    ["type"]=>
    string(12) "正常上班"
    ["date"]=>
    string(7) "22-10-9"
  }
}

你可能感兴趣的:(爬虫,php,前端,javascript,python)