curl最全的方法

/**
 * 微信公众号抓取数据,PHP中进行post提交
 * curl 支持post
 * @param string $base_url 基础链接
 * @param array $query_data 需要请求的数据
 * @param string $method 方法 get/post
 * @param boolean $ssl 关闭ssl验证
 * @param integer $exe_timeout 执行超时时间
 * @param integer $conn_timeout 连接超时时间
 * @param integer $dns_timeout dns超时时间
 */
/**
 * Perform an HTTP GET or POST request with cURL and return the response body.
 *
 * For GET, $query_data is appended to $base_url as a query string. For any
 * other method the request is sent as a POST with $query_data as the body.
 * In both cases nested array values are flattened with serialize().
 *
 * @param string  $base_url     Target URL; may already contain a query string.
 * @param array   $query_data   Request parameters (name => value).
 * @param string  $method       'get' (case-insensitive) sends a GET; any other value sends a POST.
 * @param boolean $ssl          When true (the default) TLS peer and host verification are DISABLED.
 * @param integer $exe_timeout  Maximum total execution time, in seconds.
 * @param integer $conn_timeout Maximum time to wait for the connection, in seconds.
 * @param integer $dns_timeout  Lifetime of cached DNS entries, in seconds.
 *
 * @return string Response body, or an empty string when the transfer fails.
 */
private function tx_curl($base_url, $query_data, $method = 'get', $ssl = true, $exe_timeout = 10, $conn_timeout = 10, $dns_timeout = 3600)
{
    $ch = curl_init();
    if ($ch === false) {
        // Same failure contract as a failed transfer.
        return '';
    }

    $has_params = !empty($query_data) && is_array($query_data);

    // Compare case-insensitively so that 'GET' is not silently sent as a POST.
    if (strtolower((string) $method) === 'get') {
        if ($has_params) {
            $pairs = [];
            foreach ($query_data as $key => $val) {
                if (is_array($val)) {
                    $val = serialize($val);
                }
                // Encode the key as well as the value so reserved characters
                // in parameter names cannot corrupt the query string.
                $pairs[] = rawurlencode((string) $key) . '=' . rawurlencode((string) $val);
            }
            // strpos() returns 0 for a leading '?', so test against false explicitly.
            $separator = (strpos($base_url, '?') !== false) ? '&' : '?';
            $base_url .= $separator . implode('&', $pairs);
        }
    } else {
        if ($has_params) {
            foreach ($query_data as $key => $val) {
                if (is_array($val)) {
                    $query_data[$key] = serialize($val);
                }
            }
        }
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $query_data);
    }

    curl_setopt($ch, CURLOPT_URL, $base_url);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $conn_timeout);
    curl_setopt($ch, CURLOPT_DNS_CACHE_TIMEOUT, $dns_timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $exe_timeout);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);

    // NOTE(review): a truthy $ssl turns certificate checks OFF, which exposes
    // the request to man-in-the-middle attacks. The default is kept only for
    // backward compatibility; pass false to keep verification enabled.
    if ($ssl) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    }

    $output = curl_exec($ch);
    curl_close($ch);

    // Callers receive '' rather than false when the transfer fails.
    return ($output === false) ? '' : $output;
}

/**
 * 微信公众号 文章爬虫类
 *
 * 使用方法: $crawler = new WxCrawler(); $content = $crawler->crawByUrl($url);
 */
class WxCrawler
{
  /**
   * Pool of browser User-Agent strings; the constructor picks one at random
   * and stores it in $header.
   *
   * @var string[]
   */
  protected $agent = [
      "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
      "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
      "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
      "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
      "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
      "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
      "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
      "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
      "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
      "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
  ];
  /** @var string Host name; the constructor defaults it to 'weixin.sogou.com'. */
  public $host = '';
  /** @var string The single User-Agent string chosen from $agent (despite the name, not a header list). */
  public $header = '';
  /** @var string Referer URL; the constructor defaults it to 'http://weixin.sogou.com/'. */
  public $referer = '';
  /** @var string URL prefix of the local image relay; the original image URL is appended to it. */
  public $antiLeech = '';
  /**
   * Set up the request identity used while crawling.
   *
   * @param string $host    Host name to store; 'weixin.sogou.com' is used when empty.
   * @param string $referer Referer URL to store; 'http://weixin.sogou.com/' is used when empty.
   * @param bool   $proxy   Currently unused; kept so existing call sites keep working.
   */
  public function __construct($host='', $referer='', $proxy=false)
  {
    // Pick one User-Agent at random from the pool for this instance.
    $this->header  = $this->agent[array_rand($this->agent)];
    $this->referer = empty($referer) ? 'http://weixin.sogou.com/' : $referer;
    $this->host    = empty($host) ? 'weixin.sogou.com' : $host;

    // Prefix of the local endpoint that relays WeChat images (works around
    // WeChat's hotlink protection). SERVER_NAME is not guaranteed to be set
    // (e.g. when PHP runs from the command line), so fall back to an empty
    // host instead of raising an "undefined index" notice.
    $serverName = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] : '';
    $this->antiLeech = 'http://'.$serverName.'/admin.php/tool/wechat-img?url=';
  }
  /**
   * Download the raw content of a URL.
   *
   * The request carries the User-Agent and Referer chosen in the constructor;
   * previously those values were stored but never sent.
   *
   * @author bignerd
   * @since  2016-08-16T10:13:58+0800
   * @param  string $url URL to fetch.
   * @return string|false Response body, or false when the download fails.
   */
  public function _get($url)
  {
    // Strip CR/LF so a configured referer cannot inject extra header lines.
    $referer = str_replace(["\r", "\n"], '', (string) $this->referer);

    // The 'http' context options are also honoured for https:// URLs.
    $context = stream_context_create([
      'http' => [
        'method'     => 'GET',
        'user_agent' => (string) $this->header,
        'header'     => 'Referer: '.$referer."\r\n",
      ],
    ]);

    return file_get_contents($url, false, $context);
  }
  /**
   * Crawl one article page and return its metadata together with its body.
   *
   * @param  string $url Article URL.
   * @return array  Result of articleBasicInfo() merged with the keys
   *                'content_html' and 'content_text' from contentHandle().
   */
  public function crawByUrl($url)
  {
    $content = $this->_get($url);

    // _get() yields false when the download fails; hand the parsers an empty
    // document instead of a boolean. The leftover var_dump() debug output
    // that printed the whole page has been removed.
    if ($content === false) {
      $content = '';
    }

    $basicInfo = $this->articleBasicInfo($content);
    list($content_html, $content_text) = $this->contentHandle($content);

    return array_merge($basicInfo, [
      'content_html' => $content_html,
      'content_text' => $content_text,
    ]);
  }
  /**
   * 处理微信文章源码,提取文章主体,处理图片链接
   * @author bignerd
   * @since  2016-08-16T15:59:27+0800
   * @param  $content 抓取的微信文章源码
   * @return [带图html文本,无图html文本]
   */
  public function contentHandle($content)
  {
        $content_html_pattern = '/
(.*?)<\/div>/s'; preg_match_all($content_html_pattern, $content, $html_matchs); $content_html = $html_matchs[0][0]; /** @var 带图片html文本 */ $content_html = preg_replace_callback('/data-src="(.*?)"/', function($matches){ return 'src='.$this->antiLeech.urlencode($matches[1]); }, $content_html); /** @var 无图html文本 */ $content_text = preg_replace('//s','',$content_html); return [$content_html,$content_text]; } /** * 获取文章的基本信息 * @author bignerd * @since 2016-08-16T17:16:32+0800 * @param $content 文章详情源码 * @return $basicInfo */ public function articleBasicInfo($content) { //待获取item $item = [ 'ct' => 'date',//发布时间 'msg_title' => 'title',//标题 'msg_desc' => 'digest',//描述 'msg_link' => 'content_url',//文章链接 'msg_cdn_url' => 'cover',//封面图片链接 'nickname' => 'wechatname',//公众号名称 ]; $basicInfo = [ 'author' => '', 'copyright_stat' => '', ]; foreach ($item as $k => $v) { $pattern = '/ var '.$k.' = "(.*?)";/s'; preg_match_all($pattern,$content,$matches); if(array_key_exists(1, $matches) && !empty($matches[1][0])){ $basicInfo[$v] = $this->htmlTransform($matches[1][0]); }else{ $basicInfo[$v] = ''; } } /** 获取作者 */ preg_match('/(.*?)<\/em>/s', $content, $matchAuthor); if(!empty($matchAuthor[1])) $basicInfo['author'] = $matchAuthor[1]; /** 文章类型 */ preg_match('/

 

你可能感兴趣的:(php)