抓取“维库电子市场”供应商程序

<?php
/** 
* 抓取“维库电子市场”供应商主程序 
* author Lee. 
* Last modify $Date: 2012-2-3 9:30:21 $ 
* 注:本程序按照编码 GB2312 执行,因为“维库电子市场”网站是GB2312编码,数据库也得保持一致
*/
class weiku {
	/**
	 * Safety cap on the number of result pages getPageNum() will probe, so that
	 * unexpected remote markup cannot turn the page-counting loop into an endless one.
	 */
	const MAX_PAGES = 1000;

	private $key; // part number used as the search keyword
	private $pageNum; // number of result pages found for $key
	private $db; // mysqli connection, opened on first use and reused afterwards

	/**
	 * Entry point: crawl every result page for one part number and store the suppliers found.
	 * @param string $key part number to search for
	 */
	public function go($key) {
		$this->key = $key;
		$this->pageNum = $this->getPageNum();
		$this->getInfo();
	}

	/**
	 * Walk all result pages, collect the supplier shop links on each page and
	 * try to store every supplier.
	 */
	private function getInfo() {
		// getContent() defaults to page 1, so one loop covers both the single-page and the multi-page case.
		for ($i = 1; $i <= $this->pageNum; $i++) {
			$arr = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent($i)));
			$this->isAddSuccess($arr);
		}
	}

	/**
	 * Scrape and store each supplier contact page, echoing the outcome per URL.
	 * Nothing is returned; "failed" is also printed when the company already exists
	 * or the page yielded no company name (execAdd() returns false in both cases).
	 * @param array $arr contact-page URLs
	 */
	private function isAddSuccess($arr) {
		foreach ($arr as $v) {
			if ($this->execAdd($this->getInfoByShopUrl($v))) {
				echo 'Add Success!!';
			} else {
				echo 'Add Faild!!';
			}
		}
	}

	/**
	 * Insert one supplier record into the `weiku` table.
	 * Values come from scraped HTML, so they are bound through a prepared
	 * statement instead of being interpolated into the SQL text.
	 * @param array $infoArr record as built by getInfoByShopUrl()
	 * @return bool true when a row was inserted; false when the company name is
	 *              empty, the company already exists, or the database call failed
	 */
	private function execAdd($infoArr) {
		if (empty($infoArr['company'])) {
			return false;
		}
		$mysqli = $this->getDb();
		if (!$mysqli) {
			return false;
		}
		if ($this->isExists($mysqli, $infoArr)) {
			return false; // record is already stored
		}
		$stmt = $mysqli->prepare('INSERT INTO weiku(company,person,phone,fax,mobile,qq,msn,email,address,shopUrl) VALUES (?,?,?,?,?,?,?,?,?,?)');
		if (!$stmt) {
			return false;
		}
		// Missing keys become empty strings, which is what the interpolated query used to store.
		$row = array();
		foreach (array('company', 'person', 'phone', 'fax', 'mobile', 'qq', 'msn', 'email', 'address', 'shopUrl') as $field) {
			$row[$field] = isset($infoArr[$field]) ? (string) $infoArr[$field] : '';
		}
		$stmt->bind_param(
			'ssssssssss',
			$row['company'], $row['person'], $row['phone'], $row['fax'], $row['mobile'],
			$row['qq'], $row['msn'], $row['email'], $row['address'], $row['shopUrl']
		);
		$inserted = $stmt->execute();
		$stmt->close();
		return (bool) $inserted;
	}

	/**
	 * Return the shared database connection, opening it on first use.
	 * NOTE(review): the credentials are hard-coded here; consider moving them to configuration.
	 * @return mysqli|null the connection, or null when connecting failed
	 */
	private function getDb() {
		if ($this->db instanceof mysqli) {
			return $this->db;
		}
		$mysqli = new mysqli('localhost', 'root', '1715544', 'weiku');
		if ($mysqli->connect_errno) {
			return null;
		}
		// set_charset() also informs the client library of the charset, unlike a raw "SET NAMES" query.
		$mysqli->set_charset('gb2312');
		$this->db = $mysqli;
		return $this->db;
	}

	/**
	 * Check whether a company with the same name is already stored.
	 * @param mysqli $mysqli open connection
	 * @param array $infoArr record holding the 'company' name to look up
	 * @return bool true when a matching row exists. Also true when the lookup
	 *              itself fails, so a possible duplicate is never inserted.
	 */
	private function isExists($mysqli, $infoArr) {
		$company = isset($infoArr['company']) ? (string) $infoArr['company'] : '';
		$stmt = $mysqli->prepare('SELECT company FROM weiku WHERE company = ? LIMIT 1');
		if (!$stmt) {
			return true;
		}
		$stmt->bind_param('s', $company);
		if (!$stmt->execute()) {
			$stmt->close();
			return true;
		}
		$stmt->store_result(); // buffer the result so num_rows is populated
		$exists = $stmt->num_rows > 0;
		$stmt->close();
		return $exists;
	}

	/**
	 * Download a supplier contact page and extract the contact fields from it.
	 * Fields that are not present on the page come back as empty strings.
	 * @param string $url contact-page URL
	 * @return array keys: company, person, phone, fax, mobile, qq, msn, email, address, shopUrl
	 */
	private function getInfoByShopUrl($url) {
		$re = $this->getUrlInfo($url);
		$infoArr = array(
			'company'=>$this->stripATags($this->matchField($re, '公司名称')),
			'person'=>trim($this->matchField($re, '联系人')),
			'phone'=>trim($this->matchField($re, '电话')),
			'fax'=>trim($this->matchField($re, '传真')),
			'mobile'=>trim($this->matchField($re, '手机')),
			'qq'=>$this->formatQqMsn($this->matchField($re, 'QQ')),
			'msn'=>$this->formatQqMsn($this->matchField($re, 'MSN'), 'MSN'),
			'email'=>$this->stripATags($this->matchField($re, 'E-Mail')),
			'address'=>trim($this->matchField($re, '公司地址')),
			'shopUrl'=>$url
		);
		return $infoArr;
	}

	/**
	 * Return the raw content of the first "<b>LABEL:</b><span>...</span>" pair in the page.
	 * @param string $html page content
	 * @param string $label text of the label, without the trailing colon
	 * @return string captured span content, or '' when the label is not found
	 */
	private function matchField($html, $label) {
		if (preg_match('/<b>' . $label . ':<\/b><span>(.*)<\/span>/Usi', (string) $html, $m)) {
			return $m[1];
		}
		return '';
	}

	/**
	 * Extract the supplier shop URLs from one search-result page.
	 * Links that are known not to be shop links are stripped first, then every
	 * remaining href is collected, de-duplicated and filtered by formatUrlArr().
	 * @param string $re page content
	 * @return array shop URLs (keys are the match positions that survived filtering)
	 */
	private function shopUrlMatchReArr($re) {
		// Applied in this order, each pattern working on the output of the previous one.
		$noise = array(
			'/<img.* [\/]>/',
			'/<img.*>/',
			'/<a href=\".+\" target=\"\_blank\">[A-Z]<\/a>/',
			'/<a href=\".+\" target=\"\_blank\">[0-9]<\/a>/',
			'/<a href=\".+\" target=\"\_blank\">.*<\/a>/',
			'/<a href="javascript.+">.*<\/a>/',
			'/<a href.+>营业执照<\/a>/',
			'/<a href.+>该企业更多资质>><\/a>/',
			'/<a href.+>点此反馈<\/a>/',
			'/<a href.+>首页<\/a>/',
			'/<a href.+>IC<\/a>/',
			'/<a href.+>简洁<\/a>/',
			'/<a href.+>信用<\/a>/',
			'/<a href.+>.*更多报价信息>><\/a>/',
			'/<a href=\".*\" target=\"\_blank\" rel=\"nofollow\">.*<\/a>/',
			'/<div class="kingbanan mb8">.*/'
		);
		// The casts turn a failed download (false) or a PCRE error (null) into an empty page.
		$re = (string) preg_replace($noise, '', (string) $re);
		preg_match_all('/<a href=\"(.+)\".*>.+<\/a>/Usi', $re, $arr);
		return $this->formatUrlArr(array_unique($arr[1]));
	}
	
	/**
	 * Keep only the entries that contain "http://"; array keys are preserved.
	 * @param array $arr candidate URLs
	 * @return array filtered URLs
	 */
	private function formatUrlArr($arr) {
		$newArr = array();
		foreach ($arr as $key=>$value) {
			if ($this->isExistsHttp($value)) {
				$newArr[$key] = $value;
			}
		}
		return $newArr;
	}
	
	/**
	 * Pull QQ or MSN account values out of alt="QQ:..." / alt="MSN:..." attributes.
	 * @param string $str HTML fragment holding the contact icons
	 * @param string $e prefix to look for inside the alt attribute ('QQ' or 'MSN')
	 * @return string the account, several accounts joined by single spaces,
	 *                or '' when the fragment is empty or nothing matched
	 */
	private function formatQqMsn($str, $e='QQ') {
		if (empty($str)) return '';
		// preg_quote() keeps a prefix with regex metacharacters from altering the pattern.
		preg_match_all('/alt="'.preg_quote($e, '/').'\:(.+)"/Usi', $str, $arr);
		$values = $arr[1];
		if (count($values)==1) return $values[0];
		return rtrim(implode(' ', $values), ' ');
	}

	/**
	 * Append "/contact.html" to every shop URL.
	 * @param array $arr shop URLs
	 * @return array contact-page URLs under the same keys
	 */
	private function shopAddContact($arr) {
		foreach ($arr as $k=>$v) {
			$arr[$k] = $v . '/contact.html';
		}
		return $arr;
	}

	/**
	 * Replace an <a ...>text</a> element with its inner text.
	 * @param string $site HTML fragment
	 * @return string fragment with the anchor markup removed
	 */
	private function stripATags($site) {
		return (string) preg_replace('/<a.+>(.+)<\/a>/', '\1', (string) $site);
	}

	/**
	 * Check whether a URL contains "http://" (case-insensitive).
	 * @param string $url
	 * @return bool
	 */
	private function isExistsHttp($url) {
		return stripos((string) $url, 'http://') !== false;
	}
	
	/**
	 * Download one search-result page for the current keyword.
	 * @param int $page page number
	 * @return string page content, '' when the download failed
	 */
	private function getContent($page=1) {
		return $this->getUrlInfo($this->getUrl($this->key, $page));
	}
	
	/**
	 * Determine how many result pages exist for the current keyword by fetching
	 * pages in turn until the "next page" control is absent or rendered as a
	 * plain <span>.
	 * @return int number of pages (at least 1, at most MAX_PAGES)
	 */
	private function getPageNum() {
		$i = 1;
		while ($i < self::MAX_PAGES) {
			$re = $this->getContent($i);
			// No "next page" text at all: single page (or failed download), stop here.
			if (strpos($re, '下一页') === false) {
				break;
			}
			// "Next page" rendered as a plain <span>: this is the last page.
			if (stripos($re, '<span>下一页</span></li>') !== false) {
				break;
			}
			$i++;
		}
		return $i;
	}

	/**
	 * Build the search URL for a keyword and page.
	 * The keyword is URL-encoded so that characters such as '&', '/', '#' or
	 * spaces cannot alter the query string.
	 * @param string $str keyword (part number)
	 * @param int $page page number
	 * @return string
	 */
	private function getUrl($str, $page=1) {
		$keyword = urlencode((string) $str);
		$page = (int) $page;
		return "http://www.dzsc.com/ic/sell_search.html?keyword={$keyword}&ic_sel=supplygoods&Submit=%26%23160%3B&page={$page}";
	}

	/**
	 * Download the content of a URL.
	 * @param string $url
	 * @return string content, '' when the download failed
	 */
	private function getUrlInfo($url) {
		$re = file_get_contents($url);
		return $re === false ? '' : $re;
	}
}

/*
程序运行思路:根据“维库电子市场”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息

数据库结构
CREATE TABLE `weiku` (
	`id` mediumint(8) unsigned NOT NULL auto_increment COMMENT 'ID',
	`company` varchar(300) default NULL COMMENT '公司名称',
	`person` varchar(200) default NULL COMMENT '联系人',
	`phone` varchar(300) default NULL COMMENT '电话',
	`fax` varchar(300) default NULL COMMENT '传真',
	`mobile` varchar(300) default NULL COMMENT '手机',
	`qq` varchar(200) default NULL COMMENT 'QQ',
	`msn` varchar(200) default NULL COMMENT 'MSN',
	`email` varchar(300) default NULL COMMENT '邮箱',
	`address` varchar(500) default NULL COMMENT '公司地址',
	`shopUrl` varchar(200) default NULL COMMENT '维库网店铺地址',
	PRIMARY KEY  (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=gb2312
*/

// Part numbers to crawl. The list contains repeats on purpose-free basis only by accident;
// array_unique() collapses them so each model is searched once, in first-seen order.
$arr = array_unique(array(
	'MAX3232',
	'AML8613',
	'MT6225A',
	'OM8373PS/N3/A',
	'PT7313',
	'MAX8212ESA',
	'TL431',
	'S3C2440',
	'TMS320F2812PGFA',
	'PCM1704',
	'AN6717',
	'CA3162E',
	'CA3161E',
	'LM393N',
	'DS18B20',
	'SHT10',
	'AML8613',
	'AN6717',
	'LM393N',
	'CA3161E',
	'CA3162E',
	'PCM1704',
	'STK392-040',
	'K1667',
	'MAX232',
	'STM32F103',
	'LM358'
));

// One crawler instance handles every model in turn.
$k = new weiku();
foreach ($arr as $v) {
	$k->go($v);
}
?>


你可能感兴趣的:(抓取“维库电子市场”供应商程序)