<?php

/**
 *      [Discuz!] (C)2001-2099 Comsenz Inc.
 *      This is NOT a freeware, use is subject to license terms
 *
 *      $Id: class_chinese.php 6757 2010-03-25 09:01:29Z cnteacher $
 */

// Directory that holds the binary code tables read by the Chinese class.
// DISCUZ_ROOT is not defined in this file; it must be defined before inclusion.
define('CODETABLE_DIR', DISCUZ_ROOT.'./source/include/table/');

/**
 * Charset converter between GBK, BIG5, UTF-8 and "UNICODE" (HTML hexadecimal
 * character references such as &#x4e2d;).
 *
 * Conversion is delegated to iconv() when ICONV_ENABLE is true, the target is
 * not BIG5 and table mode is not forced; otherwise the lookup tables found in
 * CODETABLE_DIR are used.
 */
class Chinese {

	// Full path of the code table selected by OpenTable().
	var $table = '';
	// TRUE when Convert() should delegate to iconv().
	var $iconv_enabled = false;
	// TRUE when a BIG5 target was requested in table mode: text is converted
	// to GBK first and then mapped to BIG5 by GB2312toBIG5().
	var $convertbig5 = false;
	// Lookup table filled by OpenTable(); values are hexadecimal strings
	// (prefixed with '0x' unless the target is UNICODE).
	var $unicode_table = array();

	var $config = array (
		'SourceLang' => '',
		'TargetLang' => '',
		'GBtoUnicode_table' => 'gb-unicode.table',
		'BIG5toUnicode_table' => 'big5-unicode.table',
		'GBtoBIG5_table' => 'gb-big5.table',
	);

	/**
	 * Prepares a converter for one language pair.
	 *
	 * Declared before the legacy Chinese() alias so that it is the constructor
	 * on every PHP version that supports __construct (PHP 8 no longer treats a
	 * method named after the class as a constructor).
	 *
	 * @param string $SourceLang charset name of the input (see _lang())
	 * @param string $TargetLang charset name of the output (see _lang())
	 * @param bool   $ForceTable TRUE to use the code tables even if iconv is enabled
	 */
	function __construct($SourceLang, $TargetLang, $ForceTable = FALSE) {
		$this->config['SourceLang'] = $this->_lang($SourceLang);
		$this->config['TargetLang'] = $this->_lang($TargetLang);

		if(ICONV_ENABLE && $this->config['TargetLang'] != 'BIG5' && !$ForceTable) {
			$this->iconv_enabled = true;
		} else {
			$this->iconv_enabled = false;
			$this->OpenTable();
		}
	}

	/**
	 * Legacy (PHP 4 style) constructor name, kept so that existing code which
	 * calls it explicitly keeps working. Forwards to __construct().
	 */
	function Chinese($SourceLang, $TargetLang, $ForceTable = FALSE) {
		$this->__construct($SourceLang, $TargetLang, $ForceTable);
	}

	/**
	 * Normalises a charset name by its (case-insensitive) prefix:
	 * GB* => 'GBK', BIG* => 'BIG5', UTF* => 'UTF-8', UNI* => 'UNICODE'.
	 *
	 * @param  string $LangCode
	 * @return string|null normalised name, or NULL when no prefix matches
	 */
	function _lang($LangCode) {
		$LangCode = strtoupper($LangCode);

		if(substr($LangCode, 0, 2) == 'GB') {
			return 'GBK';
		} elseif(substr($LangCode, 0, 3) == 'BIG') {
			return 'BIG5';
		} elseif(substr($LangCode, 0, 3) == 'UTF') {
			return 'UTF-8';
		} elseif(substr($LangCode, 0, 3) == 'UNI') {
			return 'UNICODE';
		}
		return NULL;
	}

	/**
	 * Turns a string of hexadecimal digits into the bytes it describes,
	 * two digits per byte.
	 *
	 * @param  string $hexdata
	 * @return string raw bytes ('' for empty input)
	 */
	function _hex2bin($hexdata) {
		// Must be initialised: the loop body never runs for empty input.
		$bindata = '';
		$len = strlen($hexdata);
		for($i = 0; $i < $len; $i += 2) {
			$bindata .= chr(hexdec(substr($hexdata, $i, 2)));
		}
		return $bindata;
	}

	/**
	 * Returns the hexadecimal digits stored in $unicode_table for $key, with
	 * the '0x' prefix (if any) removed.
	 *
	 * @param  int    $key
	 * @return string hex digits, or '' when the key has no mapping
	 */
	function _tablehex($key) {
		if(!isset($this->unicode_table[$key])) {
			return '';
		}
		$hex = (string)$this->unicode_table[$key];
		return strncmp($hex, '0x', 2) === 0 ? (string)substr($hex, 2) : $hex;
	}

	/**
	 * Encodes one Unicode code point in the configured target charset using
	 * $unicode_table (only meaningful when the source charset is UTF-8).
	 *
	 * @param  int    $codepoint
	 * @return string encoded bytes, or '' when the target is neither GBK nor BIG5
	 */
	function _fromunicode($codepoint) {
		$hex = $this->_tablehex($codepoint);
		if($this->config['TargetLang'] == 'GBK') {
			// The table stores GB codes minus 0x8080. The value is a hex
			// *string*, so it has to go through hexdec(): PHP 7+ no longer
			// treats '0x...' strings as numbers in arithmetic.
			return $this->_hex2bin(dechex(hexdec($hex) + 0x8080));
		} elseif($this->config['TargetLang'] == 'BIG5') {
			return $this->_hex2bin($hex);
		}
		return '';
	}

	/**
	 * Selects the code table for the configured language pair and loads it
	 * into $unicode_table.
	 *
	 * The table file is read as consecutive 4-byte records, each holding two
	 * big-endian unsigned 16-bit integers (key, value). Which of the two
	 * becomes the array index depends on the conversion direction.
	 *
	 * When no table applies or the file cannot be read, $unicode_table is
	 * left empty.
	 */
	function OpenTable() {
		$this->unicode_table = array();

		// BIG5 output in table mode is produced as GBK first and mapped to
		// BIG5 afterwards (see GB2312toBIG5()).
		if(!$this->iconv_enabled && $this->config['TargetLang'] == 'BIG5') {
			$this->config['TargetLang'] = 'GBK';
			$this->convertbig5 = TRUE;
		}

		if($this->config['SourceLang'] == 'GBK' || $this->config['TargetLang'] == 'GBK') {
			$this->table = CODETABLE_DIR.$this->config['GBtoUnicode_table'];
		} elseif($this->config['SourceLang'] == 'BIG5' || $this->config['TargetLang'] == 'BIG5') {
			$this->table = CODETABLE_DIR.$this->config['BIG5toUnicode_table'];
		}

		if($this->table == '') {
			// No table exists for this language pair.
			return;
		}

		// file_get_contents() opens, reads and closes in one step, so no
		// handle is leaked; it reports its own warning on failure.
		$tabletmp = file_get_contents($this->table);
		if($tabletmp === false) {
			return;
		}

		// The direction cannot change while loading, so decide it once.
		$to_utf8 = $this->config['TargetLang'] == 'UTF-8';
		$from_utf8 = $this->config['SourceLang'] == 'UTF-8';
		$to_unicode = $this->config['TargetLang'] == 'UNICODE';

		$len = strlen($tabletmp);
		// "$i + 4 <= $len" skips a truncated trailing record.
		for($i = 0; $i + 4 <= $len; $i += 4) {
			$tmp = unpack('nkey/nvalue', substr($tabletmp, $i, 4));
			if($to_utf8) {
				$this->unicode_table[$tmp['key']] = '0x'.dechex($tmp['value']);
			} elseif($from_utf8) {
				$this->unicode_table[$tmp['value']] = '0x'.dechex($tmp['key']);
			} elseif($to_unicode) {
				$this->unicode_table[$tmp['key']] = dechex($tmp['value']);
			}
		}
	}

	/**
	 * Computes the UTF-8 encoding of a code point.
	 *
	 * Note the unusual return format: the byte values are appended as decimal
	 * numbers, not as raw bytes (e.g. 0x4E2D yields "228184173"). For code
	 * points >= 0x80 every byte value has three digits, which is what
	 * Convert() relies on when it splits the result.
	 *
	 * @param  int    $c code point
	 * @return string concatenated decimal byte values ('' for $c >= 0x200000)
	 */
	function CHSUtoUTF8($c) {
		$str = '';
		if($c < 0x80) {
			$str .= $c;
		} elseif($c < 0x800) {
			$str .= (0xC0 | $c >> 6);
			$str .= (0x80 | $c & 0x3F);
		} elseif($c < 0x10000) {
			$str .= (0xE0 | $c >> 12);
			$str .= (0x80 | $c >> 6 & 0x3F);
			$str .= (0x80 | $c & 0x3F);
		} elseif($c < 0x200000) {
			$str .= (0xF0 | $c >> 18);
			$str .= (0x80 | $c >> 12 & 0x3F);
			$str .= (0x80 | $c >> 6 & 0x3F);
			$str .= (0x80 | $c & 0x3F);
		}
		return $str;
	}

	/**
	 * Maps double-byte characters (lead byte >= 160) to the two bytes stored
	 * for them in the GB-to-BIG5 table file.
	 *
	 * The entry for lead byte $h and trail byte $l is read from file offset
	 * ($h - 160) * 510 + ($l - 1) * 2. Pairs for which no two-byte entry can
	 * be read are left unchanged. If the table cannot be opened the input is
	 * returned as is.
	 *
	 * @param  string $c input text
	 * @return string converted text (same byte length as the input)
	 */
	function GB2312toBIG5($c) {
		$c = (string)$c;
		$f = fopen(CODETABLE_DIR.$this->config['GBtoBIG5_table'], 'rb');
		if(!$f) {
			return $c;
		}

		// Stop one byte early so that $c[$i + 1] always exists.
		$max = strlen($c) - 1;
		for($i = 0; $i < $max; $i++) {
			$h = ord($c[$i]);
			if($h >= 160) {
				$l = ord($c[$i + 1]);
				if($h == 161 && $l == 64) {
					// Two blanks: one for each of the two bytes replaced below.
					$gb = '  ';
				} else {
					fseek($f, ($h - 160) * 510 + ($l - 1) * 2);
					$gb = fread($f, 2);
				}
				if(is_string($gb) && strlen($gb) == 2) {
					$c[$i] = $gb[0];
					$c[$i + 1] = $gb[1];
				}
				// Skip the trail byte that was just handled.
				$i++;
			}
		}

		fclose($f);
		return $c;
	}

	/**
	 * Converts $SourceText from the configured source charset to the
	 * configured target charset.
	 *
	 * Bytes <= 127 are copied unchanged; in the table based branches every
	 * byte > 127 is taken as the first byte of a two-byte character (except
	 * for UTF-8 input, which is decoded as 1-3 byte sequences; other lead
	 * bytes are dropped).
	 *
	 * @param  string $SourceText
	 * @return string converted text
	 */
	function Convert($SourceText) {
		$source = $this->config['SourceLang'];
		$target = $this->config['TargetLang'];

		if($source == $target) {
			// OpenTable() rewrites a BIG5 target to GBK, so GBK => BIG5 ends
			// up here; the pending BIG5 mapping still has to be applied.
			return $this->convertbig5 ? $this->GB2312toBIG5($SourceText) : $SourceText;
		}

		$SourceText = (string)$SourceText;
		$len = strlen($SourceText);

		if($this->iconv_enabled) {
			if($target <> 'UNICODE') {
				return iconv($source, $target, $SourceText);
			}

			// iconv has no "HTML entity" target: convert each two-byte
			// character to UTF-8 and print its code point as &#x...;
			$return = '';
			$i = 0;
			while($i < $len) {
				if(ord($SourceText[$i]) > 127) {
					$utf8char = (string)iconv($source, "UTF-8", substr($SourceText, $i, 2));
					$return .= "&#x".dechex((int)$this->Utf8_Unicode($utf8char)).";";
					$i += 2;
				} else {
					$return .= $SourceText[$i];
					$i++;
				}
			}
			return $return;
		}

		if($target == 'UNICODE') {
			$utf = '';
			$i = 0;
			while($i < $len) {
				if(ord($SourceText[$i]) > 127) {
					$pair = hexdec(bin2hex(substr($SourceText, $i, 2)));
					if($source == 'GBK') {
						// GB table keys are stored with 0x8080 subtracted.
						$utf .= '&#x'.$this->_tablehex($pair - 0x8080).';';
					} elseif($source == 'BIG5') {
						$utf .= '&#x'.$this->_tablehex($pair).';';
					}
					$i += 2;
				} else {
					$utf .= $SourceText[$i];
					$i++;
				}
			}
			return $utf;
		}

		if($source == 'UTF-8') {
			$out = '';
			$i = 0;
			while($i < $len) {
				$c = ord($SourceText[$i++]);
				// The high nibble of the lead byte tells the sequence length.
				switch($c >> 4) {
					case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
						$out .= chr($c);
						break;
					case 12: case 13:
						// substr() (not an offset) so that a truncated
						// sequence at the end of the text reads as byte 0.
						$char2 = ord(substr($SourceText, $i++, 1));
						$out .= $this->_fromunicode((($c & 0x1F) << 6) | ($char2 & 0x3F));
						break;
					case 14:
						$char2 = ord(substr($SourceText, $i++, 1));
						$char3 = ord(substr($SourceText, $i++, 1));
						$out .= $this->_fromunicode((($c & 0x0F) << 12) | (($char2 & 0x3F) << 6) | ($char3 & 0x3F));
						break;
				}
			}
			return !$this->convertbig5 ? $out : $this->GB2312toBIG5($out);
		}

		$ret = '';
		$i = 0;
		while($i < $len) {
			if(ord($SourceText[$i]) > 127) {
				$pair = hexdec(bin2hex(substr($SourceText, $i, 2)));
				$utf8 = '';
				if($source == 'BIG5') {
					$utf8 = $this->CHSUtoUTF8(hexdec($this->_tablehex($pair)));
				} elseif($source == 'GBK') {
					$utf8 = $this->CHSUtoUTF8(hexdec($this->_tablehex($pair - 0x8080)));
				}
				// CHSUtoUTF8() returns decimal byte values; take them three
				// digits at a time and turn each into one byte.
				$digits = strlen($utf8);
				for($j = 0; $j < $digits; $j += 3) {
					$ret .= chr((int)substr($utf8, $j, 3));
				}
				$i += 2;
			} else {
				$ret .= $SourceText[$i];
				$i++;
			}
		}
		return $ret;
	}

	/**
	 * Decodes a single UTF-8 encoded character (1 to 4 bytes) to its code
	 * point.
	 *
	 * @param  string   $char
	 * @return int|null code point; NULL when $char is empty or longer than 4 bytes
	 */
	function Utf8_Unicode($char) {
		switch(strlen($char)) {
			case 1:
				return ord($char);
			case 2:
				$n = (ord($char[0]) & 0x3f) << 6;
				$n += ord($char[1]) & 0x3f;
				return $n;
			case 3:
				$n = (ord($char[0]) & 0x1f) << 12;
				$n += (ord($char[1]) & 0x3f) << 6;
				$n += ord($char[2]) & 0x3f;
				return $n;
			case 4:
				$n = (ord($char[0]) & 0x0f) << 18;
				$n += (ord($char[1]) & 0x3f) << 12;
				$n += (ord($char[2]) & 0x3f) << 6;
				$n += ord($char[3]) & 0x3f;
				return $n;
		}
	}

}

?>
转换类所使用的码表文件位于 source/include/table/ 目录下,例如 gb-unicode.table。
// Usage example: build a converter whose source charset is UTF-8 ('utf8' is
// normalised by the class) and whose target charset is named by CHARSET.
// The third argument is $ForceTable: TRUE makes the class use its code tables
// instead of iconv.
// NOTE(review): CHARSET is not defined in this snippet — presumably the site's
// configured charset constant; confirm against the including script.
$c = new Chinese('utf8', CHARSET, TRUE);
// Convert() returns the converted text; $data is overwritten with the result.
$data = $c->Convert($data);