阿里巴巴英文供应商分类采集

最近的一个项目需要采集阿里巴巴所有的英文分类。经过分析,写了如下代码,可以一次性采集阿里巴巴的所有英文分类。
表结构代码:
-- Category table filled by CateCommand::save_cate(): one row per scraped category.
CREATE TABLE IF NOT EXISTS `cate` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- Category name taken from the link text on the scraped page.
  `name` varchar(255) NOT NULL,
  -- SeoTools::friendlyURL(name); the scraper looks rows up by this value to skip duplicates.
  `alias` varchar(255) NOT NULL,
  -- Id parsed from the category link URL.
  `alibaba_id` varchar(255) NOT NULL,
  -- alibaba_id of the parent category; the scraper writes 0 for top-level categories.
  `alibaba_pid` varchar(255) NOT NULL,
  -- Depth assigned by the scraper: 1 for top-level categories, up to 4 for nested ones.
  `level` int(2) NOT NULL,
  -- The scraper always inserts 0 here.
  `status` int(2) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`),
  UNIQUE KEY `alias` (`alias`)
) ENGINE=InnoDB  DEFAULT CHARSET=utf8 AUTO_INCREMENT=3870 ;

 
 
 
  
  
  
  
<?php 
/**
 * Console command that scrapes the English supplier category tree from
 * alibaba.com and stores it in the `cate` table.
 *
 * Top-level categories are read from the "companies" landing page; each
 * top-level category page is then fetched and its nested <ul> lists are
 * walked to collect levels 2 to 4.
 *
 * NOTE(review): this class relies on the legacy mysql_* extension, which was
 * deprecated in PHP 5.5 and removed in PHP 7.0. It is kept here so the public
 * $conn property stays a mysql link resource; migrating to PDO or mysqli is
 * recommended.
 */
class CateCommand extends CConsoleCommand
{
    /** Deepest category level that is collected. */
    const MAX_LEVEL = 4;

    /** @var resource|false MySQL link opened by run() and used by save_cate(). */
    public $conn;

    /**
     * Scrape every category level and persist the results.
     *
     * Prints progress and errors to standard output and terminates the
     * process when done (exit status 1 if the database or the landing page
     * is unavailable).
     */
    public function run()
    {
        $this->conn = mysql_connect('192.168.1.104', 'root', '');
        if (!$this->conn)
        {
            echo 'mysql connect failed: ' . mysql_error() . chr(10);
            exit(1);
        }
        if (!mysql_select_db('us', $this->conn))
        {
            echo 'mysql select db failed: ' . mysql_error($this->conn) . chr(10);
            exit(1);
        }
        mysql_query("set names utf8", $this->conn);

        $home_page = "http://www.alibaba.com/companies";

        Yii::import('application.vendors.*');
        require_once("Snoopy.class.php");
        require_once("simple.html.dom.php");

        $snoopy = new Snoopy();
        // Spoof a browser user agent.
        $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)";

        $html = $this->fetch_dom($snoopy, $home_page);
        if ($html === null)
        {
            echo 'fetch failed: ' . $home_page . chr(10);
            exit(1);
        }

        // Level 1: every <dt> on the landing page holds one top-level category link.
        $p_cate = array();
        foreach ($html->find('dt') as $element)
        {
            $link = $element->find('a', 0);
            if (!$link)
            {
                continue;
            }
            $id = $this->extract_id($link->href, "_s");
            if ($id === null)
            {
                // Not a category link (no "_s<id>" suffix); nothing to store.
                continue;
            }
            $p_cate[] = array(
                'pid'   => 0,
                'level' => 1,
                'id'    => $id,
                'name'  => htmlspecialchars_decode($link->innertext),
                'href'  => $link->href,
            );
        }
        // simple_html_dom trees hold circular references; release them explicitly.
        $html->clear();
        $this->save_cate($p_cate);

        // Levels 2-4: walk the nested lists on each top-level category page.
        foreach ($p_cate as $p)
        {
            $html = $this->fetch_dom($snoopy, $p['href']);
            if ($html === null)
            {
                echo 'fetch failed: ' . $p['href'] . chr(10);
                continue;
            }

            $main_cate = array();
            foreach ($html->find('div[class=g-float-left g-col-left-3]') as $element)
            {
                $list = $element->find('ul', 0);
                if ($list)
                {
                    $main_cate = array_merge($main_cate, $this->collect_list($list, $p['id'], 2));
                }
            }
            $html->clear();
            $this->save_cate($main_cate);
        }

        echo 'over';
        exit;
    }

    /**
     * Insert the given categories into `cate`, skipping aliases that already exist.
     *
     * @param array $array_cate List of category records, each with the keys
     *                          'name', 'id', 'pid' and 'level'.
     */
    public function save_cate($array_cate)
    {
        foreach ($array_cate as $array)
        {
            $alias = SeoTools::friendlyURL($array['name']);

            // Scraped text is untrusted and may contain quotes, so escape
            // every value before embedding it in SQL.
            $alias_sql = mysql_real_escape_string($alias, $this->conn);
            $strsql = "SELECT * FROM  `cate` where alias ='" . $alias_sql . "' limit 1";
            $result = mysql_query($strsql, $this->conn);
            if ($result === false)
            {
                echo $alias . ' lookup failed: ' . mysql_error($this->conn) . chr(10);
                continue;
            }
            $data2 = mysql_fetch_array($result);
            mysql_free_result($result);

            if ($data2)
            {
                echo $alias . "_" . $array['name'] . '_' . $array['id'] . '_' . $array['pid'] . ' existed' . chr(10);
                continue;
            }

            $insert_sql = sprintf(
                "insert into `cate`(`name`,`alias`,`alibaba_id`,`alibaba_pid`,`level`,`status`) values('%s','%s','%s','%s',%d,'0')",
                mysql_real_escape_string($array['name'], $this->conn),
                $alias_sql,
                mysql_real_escape_string((string) $array['id'], $this->conn),
                mysql_real_escape_string((string) $array['pid'], $this->conn),
                (int) $array['level']
            );
            if (!mysql_query($insert_sql, $this->conn))
            {
                echo $alias . ' insert failed: ' . mysql_error($this->conn) . chr(10);
            }
        }
    }

    /**
     * Download a page and parse it into a simple_html_dom tree.
     *
     * @param Snoopy $snoopy Configured HTTP client.
     * @param string $url    Page to download.
     * @return object|null Parsed DOM, or null when the download or the parse failed.
     */
    private function fetch_dom($snoopy, $url)
    {
        if (!$snoopy->fetch($url))
        {
            return null;
        }
        // str_get_html() returns false for empty or oversized input.
        $dom = str_get_html($snoopy->results);
        return $dom ? $dom : null;
    }

    /**
     * Return the text after the last occurrence of $separator in $href.
     *
     * @param mixed  $href      Link target (simple_html_dom yields false when the attribute is missing).
     * @param string $separator Marker that precedes the id, e.g. "_s" or "_sid".
     * @return string|null The id, or null when the marker is absent or nothing follows it.
     */
    private function extract_id($href, $separator)
    {
        $href = (string) $href;
        $pos = strrpos($href, $separator);
        if ($pos === false)
        {
            return null;
        }
        $id = substr($href, $pos + strlen($separator));
        // Older PHP versions return false instead of '' when nothing follows the marker.
        if ($id === false || $id === '')
        {
            return null;
        }
        return $id;
    }

    /**
     * Collect the categories of one <ul> and of the lists nested inside it.
     *
     * A child whose first element is an <a> is a category at $level. A child
     * whose first element is a <ul> holds the sub-categories of the category
     * that precedes it in the same list.
     *
     * @param object $list  The <ul> node to walk.
     * @param mixed  $pid   Id of the category that owns this list.
     * @param int    $level Level assigned to the direct entries of this list.
     * @return array Category records in document order, each with the keys
     *               'pid', 'level', 'id' and 'name'.
     */
    private function collect_list($list, $pid, $level)
    {
        $found = array();
        $last_id = null;

        foreach ($list->children as $item)
        {
            $first = $item->children(0);
            if (!$first)
            {
                continue;
            }

            if ($first->tag == "a")
            {
                $id = $this->extract_id($first->href, "_sid");
                if ($id === null)
                {
                    continue;
                }
                $found[] = array(
                    'pid'   => $pid,
                    'level' => $level,
                    'id'    => $id,
                    'name'  => htmlspecialchars_decode($first->innertext),
                );
                $last_id = $id;
            }
            elseif ($first->tag == "ul" && $level < self::MAX_LEVEL)
            {
                // With no preceding category in this list, attach the nested
                // entries to the owner of this list rather than to an
                // unrelated or undefined parent.
                $parent = ($last_id !== null) ? $last_id : $pid;
                $found = array_merge($found, $this->collect_list($first, $parent, $level + 1));
            }
        }

        return $found;
    }
}
来自  好商网

你可能感兴趣的:(阿里巴巴英文供应商分类采集)