PHP截取中文字符串,UTF-8、GBK
function substring($str, $start, $len) {
$tmpstr = "";
$strlen = $start + $len;
for($i = 0; $i < $strlen; $i++) {
if(ord(substr($str, $i, 1)) > 0xa0) {
$tmpstr .= substr($str, $i, 2);
$i++;
} else
$tmpstr .= substr($str, $i, 1);
}
return $tmpstr;
}
====================================
===========================================
/**
***@Author:LAD
***@URL :http://www.cnpik.com/" target="_blank">http://www.cnpik.com/
***@E_mail:[email protected]
***@随便用,不收钱 :-)
*/
function cnSubStr($string,$sublen)
{
if($sublen>=strlen($string))
{
return $string;
}
$s="";
for($i=0;$i<$sublen;$i++)
{
if(ord($string{$i})>127)
{
$s.=$string{$i}.$string{++$i};
continue;
}else{
$s.=$string{$i};
continue;
}
}
return $s;
}// End Function cnSubStr($string,$sublen)
/// Use like this :
echo "
__________________________
";
$string="242432反对感是456犯得上广泛大使馆地方7890";
$sublen=strlen($string);
$len=20;
echo $string."
";
echo "总长为:".($sublen+1)."
";
echo "截取数:".$len."
";
for($i=1;$i<=$sublen+1;$i++){
if($i>$len){
echo $i." → ".cnSubStr($string,$i)."…
";
continue;
}
echo $i." → ".cnSubStr($string,$i)."
";
}
?>
=======================================
因为证书中有中文,所以需要在PHP中进行GB2312与UTF-8的互换。
网上搜索一下这方面相关资料,说是需要php_iconv.dll的支持,可是我在PHP5文件夹中根本找不到这个文件,但是奇怪的是在PHP4中有这个,然后我将PHP4中的php_iconv.dll文件,复制到system32下,却提示出现错误,我想应该也不行,毕竟PHP4和PHP5里面的文件应该不兼容。到这里我就想删除了PHP5,装一个PHP4算了,后来发现一段话:iconv and libxml are compiled into php5ts.dll so you don't need the dll's in version 5.所以只要转换如下:
GB2312 -- UTF-8
iconv("GB2312","UTF-8",$text)
UTF-8 -- GB2312
iconv("UTF-8","GB2312",$text)
===========================
php5內建支援iconv,因此僅有php4的使用者需要安裝
1.首先必須確定你的php資料夾中有extensions,dlls這兩個資料夾
若沒有,請到 http://www.php.net/downloads.php 下載PHP 4.4.x zip package的版本(約7MB)
2.將extensions/php_iconv.dll , dlls/iconv.dll 這兩個檔案複製到windows的目錄下方
3.開啟php.ini(一般是在windows目錄下),將 ;extension=php_iconv.dll 前面的分號去除
4.重新啟動Apache即可
==========================================
【摘 要】 不管是uft-8编码转换为gb2312,还是将 gb2312 转换为 uft-8 ,都是一样道理,php4.3.1以后的iconv函数很好用的,只是需要自己写一个uft8到unicode的转换函数查表(gb2312.txt)也行
不管是uft-8编码转换为gb2312,还是将 gb2312 转换为 uft-8 ,都是一样道理,php4.3.1以后的iconv函数很好用的,只是需要自己写一个uft8到unicode的转换函数查表(gb2312.txt)也行。
$text = "电子书库";
preg_match_all("/[/x80-/xff]?./",$text,$ar);
foreach($ar[0] as $v)
echo "".utf8_unicode(iconv("GB2312","UTF-8",$v)).";";
?>
// utf8 -> unicode
function utf8_unicode($c) {
switch(strlen($c)) {
case 1:
return ord($c);
case 2:
$n = (ord($c[0]) & 0x3f) << 6;
$n += ord($c[1]) & 0x3f;
return $n;
case 3:
$n = (ord($c[0]) & 0x1f) << 12;
$n += (ord($c[1]) & 0x3f) << 6;
$n += ord($c[2]) & 0x3f;
return $n;
case 4:
$n = (ord($c[0]) & 0x0f) << 18;
$n += (ord($c[1]) & 0x3f) << 12;
$n += (ord($c[2]) & 0x3f) << 6;
$n += ord($c[3]) & 0x3f;
return $n;
}
}
?>
下面的例子是利用php将uft-8这中编码转换为gb2312.
$str = "TTL全天候自动聚焦";
$str = preg_replace("|([0-9]{1,5});|", "/".u2utf82gb(//1)./"", $str);
$str = "/$str=/"$str/";";
eval($str);
echo $str;
function u2utf82gb($c){
$str="";
if ($c < 0x80) {
$str.=$c;
} else if ($c < 0x800) {
$str.=chr(0xC0 | $c>>6);
$str.=chr(0x80 | $c & 0x3F);
} else if ($c < 0x10000) {
$str.=chr(0xE0 | $c>>12);
$str.=chr(0x80 | $c>>6 & 0x3F);
$str.=chr(0x80 | $c & 0x3F);
} else if ($c < 0x200000) {
$str.=chr(0xF0 | $c>>18);
$str.=chr(0x80 | $c>>12 & 0x3F);
$str.=chr(0x80 | $c>>6 & 0x3F);
$str.=chr(0x80 | $c & 0x3F);
}
return iconv('UTF-8', 'GB2312', $str);
}
?>
或者是
function unescape($str) {
$str = rawurldecode($str);
preg_match_all("/(?:%u.{4})|.{4};|/d+;|.+/U",$str,$r);
$ar = $r[0];
print_r($ar);
foreach($ar as $k=>$v) {
if(substr($v,0,2) == "%u")
$ar[$k] = iconv("UCS-2","GB2312",pack("H4",substr($v,-4)));
elseif(substr($v,0,3) == "")
$ar[$k] = iconv("UCS-2","GB2312",pack("H4",substr($v,3,-1)));
elseif(substr($v,0,2) == "") {
echo substr($v,2,-1)."
";
$ar[$k] = iconv("UCS-2","GB2312",pack("n",substr($v,2,-1)));
}
}
return join("",$ar);
}
$str = "TTL全天候自动聚焦";
echo unescape($str); file://out TTL全天候自动聚焦
利用javascript来转换
下面是一个显示所有全角半角的字体的查看例子
自定义: -
下面是一个查表(gb2312),转换gb2312到utf8的例子, 现在有iconv函数,这个已经没有太大的意义了,
function gb2utf8($gb){
if(!trim($gb)) return $gb;
$filename="gb2312.txt";
$tmp=file($filename);
$codetable=array();
while(list($key,$value)=each($tmp))
$codetable[hexdec(substr($value,0,6))]=substr($value,7,6);
$utf8="";
while($gb) {
if (ord(substr($gb,0,1))>127) {
$this=substr($gb,0,2);
$gb=substr($gb,2,strlen($gb)-2);
$utf8.=u2utf8(hexdec($codetable[hexdec(bin2hex($this))-0x8080]));
}else{
$this=substr($gb,0,1);
$gb=substr($gb,1,strlen($gb)-1);
$utf8.=u2utf8($this);
}
}
return $utf8;
}
function u2utf8($c){
$str="";
if ($c < 0x80) {
$str.=$c;
} else if ($c < 0x800) {
$str.=chr(0xC0 | $c>>6);
$str.=chr(0x80 | $c & 0x3F);
} else if ($c < 0x10000) {
$str.=chr(0xE0 | $c>>12);
$str.=chr(0x80 | $c>>6 & 0x3F);
$str.=chr(0x80 | $c & 0x3F);
} else if ($c < 0x200000) {
$str.=chr(0xF0 | $c>>18);
$str.=chr(0x80 | $c>>12 & 0x3F);
$str.=chr(0x80 | $c>>6 & 0x3F);
$str.=chr(0x80 | $c & 0x3F);
}
return $str;
}
?>
====================================
==============================================
以下代码试用于GB2312编码,截取中文字符串是PHP中一个头疼的问题,解决方法是根据值是否大于等于128来判断是否是双字节字符,以避免出现乱码的情况。但中英文混合、特殊符号等问题总是存在,现在写一个比较全面的,仅供参考:
程序说明:
1. len 参数以中文字符为标准,1len等于2个英文字符,为了形式上好看些
2. 如果将magic参数设为false,则中文和英文同等看待,取绝对的字符数
3. 特别适用于用htmlspecialchars()进行过编码的字符串
4. 能正确处理GB2312中实体字符模式()
程序代码:
function FSubstr($title,$start,$len="",$magic=true)
{
/**
* powered by Smartpig
* mailto:[email protected]
*/
$length = 0;
if($len == "") $len = strlen($title);
//判断起始为不正确位置
if($start > 0)
{
$cnum = 0;
for($i=0;$i<$start;$i++)
{
if(ord(substr($title,$i,1)) >= 128) $cnum ++;
}
if($cnum%2 != 0) $start--;
unset($cnum);
}
if(strlen($title)<=$len) return substr($title,$start,$len);
$alen = 0;
$blen = 0;
$realnum = 0;
for($i=$start;$i
$ctype = 0;
$cstep = 0;
$cur = substr($title,$i,1);
if($cur == "&")
{
if(substr($title,$i,4) == "<")
{
$cstep = 4;
$length += 4;
$i += 3;
$realnum ++;
if($magic)
{
$alen ++;
}
}
else if(substr($title,$i,4) == ">")
{
$cstep = 4;
$length += 4;
$i += 3;
$realnum ++;
if($magic)
{
$alen ++;
}
}
else if(substr($title,$i,5) == "&")
{
$cstep = 5;
$length += 5;
$i += 4;
$realnum ++;
if($magic)
{
$alen ++;
}
}
else if(substr($title,$i,6) == """)
{
$cstep = 6;
$length += 6;
$i += 5;
$realnum ++;
if($magic)
{
$alen ++;
}
}
else if(substr($title,$i,6) == "'")
{
$cstep = 6;
$length += 6;
$i += 5;
$realnum ++;
if($magic)
{
$alen ++;
}
}
else if(preg_match("/(/d+);/i",substr($title,$i,8),$match))
{
$cstep = strlen($match[0]);
$length += strlen($match[0]);
$i += strlen($match[0])-1;
$realnum ++;
if($magic)
{
$blen ++;
$ctype = 1;
}
}
}else{
if(ord($cur)>=128)
{
$cstep = 2;
$length += 2;
$i += 1;
$realnum ++;
if($magic)
{
$blen ++;
$ctype = 1;
}
}else{
$cstep = 1;
$length +=1;
$realnum ++;
if($magic)
{
$alen++;
}
}
}
if($magic)
{
if(($blen*2+$alen) == ($len*2)) break;
if(($blen*2+$alen) == ($len*2+1))
{
if($ctype == 1)
{
$length -= $cstep;
break;
}else{
break;
}
}
}else{
if($realnum == $len) break;
}
}
unset($cur);
unset($alen);
unset($blen);
unset($realnum);
unset($ctype);
unset($cstep);
return substr($title,$start,$length);
}