java字符串编码类型获取

源码下载地址:http://download.csdn.net/source/414086

汉字编码是一项较为麻烦的事情,弄不好就会造出些谁都看不懂的乱码。比如我想做个针对汉字网站的爬虫系统,需要对非特定的页面进行数据解析处理,而此时我所访问的页面编码格式未知,如果不能正确处理页面编码,则很难获得我们理想中的数据。

通常这时候可能有几种选择:

一是根据response的ContentType获得,如果服务器支持的话此项中会返回charset数值,解析即可。但对不返回或者不支持的服务器则无能为力。

二是使用正则或自定义解析函数截取页面中“charset=”后的数据,采取死盯战术,但万一采集的页面中没有此项或者此项有错,也就回天乏术。

三就是老老实实的解析全文,最后返回一个符合的编码格式。

此例中我演示了几种较常见编码的识别方法:先统计内容属于各指定编码的或然率,而后返回可能性最高的编码方式。在无法获得确切编码之时,这可以说是唯一的选择。

这种识别方式主要是针对汉字编码而来,所以对应页面中的汉字数目越多,统计结果就越准确,反之则很难识别出正确结果。

Encoding.java
package org.loon.test.encoding;

/**
 * <p>
 * Title: LoonFramework
 * </p>
 * <p>
 * Description: Collection of the basic encoding types. Each encoding is an
 * {@code int} constant that doubles as the index into three parallel name
 * tables ({@link #javaname}, {@link #nicename}, {@link #htmlname}).
 * </p>
 * <p>
 * Copyright: Copyright (c) 2008
 * </p>
 * <p>
 * Company: LoonFramework
 * </p>
 * <p>
 * License: http://www.apache.org/licenses/LICENSE-2.0
 * </p>
 * <p>
 * Email: [email protected]
 * </p>
 *
 * @author chenpeng
 * @version 0.1
 */
public class Encoding {

    // Supported character encodings. The values are used as array indices,
    // so they must stay dense and below TOTALT.
    public static final int GB2312 = 0;

    public static final int GBK = 1;

    public static final int BIG5 = 2;

    public static final int UTF8 = 3;

    public static final int UNICODE = 4;

    public static final int EUC_KR = 5;

    public static final int SJIS = 6;

    public static final int EUC_JP = 7;

    public static final int ASCII = 8;

    public static final int UNKNOWN = 9;

    /** Number of encoding constants, i.e. the length of every name table. */
    public static final int TOTALT = 10;

    // Not referenced in this class; presumably "simplified" / "traditional"
    // script markers for subclasses - TODO confirm against the callers.
    public final static int SIMP = 0;

    public final static int TRAD = 1;

    /** Names used when resolving an encoding by name (e.g. {@code "UTF8"}, {@code "SJIS"}). */
    public static String[] javaname = new String[TOTALT];

    /** Human-readable names (e.g. {@code "Shift-JIS"}). */
    public static String[] nicename = new String[TOTALT];

    /** Character-set names as written in HTML (e.g. {@code "Shift_JIS"}). */
    public static String[] htmlname = new String[TOTALT];

    // The tables are class-level state, so they are filled exactly once when
    // the class is loaded. Previously they were allocated inside the instance
    // constructor, which left them null until the first instance was created
    // and silently replaced them on every later construction.
    static {
        javaname[GB2312] = "GB2312";
        javaname[GBK] = "GBK";
        javaname[BIG5] = "BIG5";
        javaname[UTF8] = "UTF8";
        javaname[UNICODE] = "Unicode";
        javaname[EUC_KR] = "EUC_KR";
        javaname[SJIS] = "SJIS";
        javaname[EUC_JP] = "EUC_JP";
        javaname[ASCII] = "ASCII";
        javaname[UNKNOWN] = "ISO8859_1";

        // HTML charset names
        htmlname[GB2312] = "GB2312";
        htmlname[GBK] = "GBK";
        htmlname[BIG5] = "BIG5";
        htmlname[UTF8] = "UTF-8";
        htmlname[UNICODE] = "UTF-16";
        htmlname[EUC_KR] = "EUC-KR";
        htmlname[SJIS] = "Shift_JIS";
        htmlname[EUC_JP] = "EUC-JP";
        htmlname[ASCII] = "ASCII";
        htmlname[UNKNOWN] = "ISO8859-1";

        // Human-readable names
        nicename[GB2312] = "GB-2312";
        nicename[GBK] = "GBK";
        nicename[BIG5] = "Big5";
        nicename[UTF8] = "UTF-8";
        nicename[UNICODE] = "Unicode";
        nicename[EUC_KR] = "EUC-KR";
        nicename[SJIS] = "Shift-JIS";
        nicename[EUC_JP] = "EUC-JP";
        nicename[ASCII] = "ASCII";
        nicename[UNKNOWN] = "UNKNOWN";
    }

    /**
     * Creates an instance. The name tables are already populated by the static
     * initializer; the constructor is kept so existing callers and subclasses
     * that invoke it continue to compile.
     */
    public Encoding() {
    }

    /**
     * Returns the three names of an encoding joined by commas, in the order
     * {@code javaname,nicename,htmlname}.
     *
     * @param type one of the encoding constants of this class
     * @return the interned, comma-separated name triple
     * @throws ArrayIndexOutOfBoundsException if {@code type} is not in
     *         {@code [0, TOTALT)}
     */
    public String toEncoding(final int type) {
        return (javaname[type] + "," + nicename[type] + "," + htmlname[type]).intern();
    }

}


Encode.java(省略,见源码)

ParseEncoding.java
package org.loon.test.encoding;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;

/***/ /**
*<p>
*Title:LoonFramework
*</p>
*<p>
*Description:
*</p>
*<p>
*Copyright:Copyright(c)2008
*</p>
*<p>
*Company:LoonFramework
*</p>
*<p>
*License:
http://www.apache.org/licenses/LICENSE-2.0
*</p>
*
*
@authorchenpeng
*@email:[email protected]
*
@version0.1
*/

public class ParseEncoding extends Encode ... {

publicParseEncoding()...{
super();
GB2312format
=newint[94][94];
GBKformat
=newint[126][191];
Big5format
=newint[94][158];
EUC_KRformat
=newint[94][94];
JPformat
=newint[94][94];

//初始化编码格式
init();
}


publicStringgetEncoding(finalStringpath)...{
returncheck(getEncodeValue(path));
}


publicStringgetEncoding(finalInputStreamin)...{
returncheck(getEncodeValue(in));
}


publicStringgetEncoding(finalbyte[]buffer)...{
returncheck(getEncodeValue(buffer));
}


publicStringgetEncoding(finalURLurl)...{
returncheck(getEncodeValue(url));
}


privateStringcheck(finalintresult)...{
if(result==-1)...{
returnnicename[UNKNOWN];
}

returnnicename[result];
}


/***//**
*解析指定字符串路径编码所用格式
*
*
@parampath
*
@return
*/

privateintgetEncodeValue(Stringpath)...{
intexpress=UNKNOWN;
if(path.startsWith("http://"))...{
try...{
express
=getEncodeValue(newURL(path));
}
catch(MalformedURLExceptione)...{
express
=-1;
}

}
else...{
express
=getEncodeValue(newFile(path));
}

returnexpress;
}


/***//**
*
*解析指定InputStream所用编码,返回或然率最高的编码类型数值
*
*
@paramin
*
@return
*/

publicintgetEncodeValue(InputStreamin)...{
byte[]rawtext=newbyte[8192];
intbytesread=0,byteoffset=0;
intexpress=UNKNOWN;
InputStreamstream
=in;
try...{
while((bytesread=stream.read(rawtext,byteoffset,rawtext.length
-byteoffset))>0)...{
byteoffset
+=bytesread;
}

;
stream.close();
express
=getEncodeValue(rawtext);
}
catch(Exceptione)...{
express
=-1;
}

returnexpress;
}


/***//**
*解析指定url下数据所用编码,返回或然率最高的编码类型数值
*
*
@paramurl
*
@return
*/

publicintgetEncodeValue(URLurl)...{

InputStreamstream;
try...{
stream
=url.openStream();
}
catch(IOExceptione)...{
stream
=null;
}


returngetEncodeValue(stream);
}


/***//**
*解析指定file所用编码,返回或然率最高的编码类型数值
*
*
@paramfile
*
@return
*/

publicintgetEncodeValue(Filefile)...{
byte[]buffer;
try...{
buffer
=read(newFileInputStream(file));
}
catch(FileNotFoundExceptione)...{
buffer
=null;
}

returngetEncodeValue(buffer);
}


/***//**
*将inputstream转为byte[]
*
*
@paraminputStream
*
@return
*/

privatefinalbyte[]read(finalInputStreaminputStream)...{
byte[]arrayByte=null;
ByteArrayOutputStreambyteArrayOutputStream
=newByteArrayOutputStream();
byte[]bytes=newbyte[8192];
try...{
bytes
=newbyte[inputStream.available()];
intread;
while((read=inputStream.read(bytes))>=0)...{
byteArrayOutputStream.write(bytes,
0,read);
}

arrayByte
=byteArrayOutputStream.toByteArray();
}
catch(IOExceptione)...{
returnnull;
}

returnarrayByte;
}


/***//**
*解析指定byte[]所用编码,返回或然率最高的数值类型
*
*
@paramcontent
*
@return
*/

publicintgetEncodeValue(byte[]content)...{
if(content==null)
return-1;
int[]scores;
intindex,maxscore=0;
intencoding=UNKNOWN;
scores
=newint[TOTALT];
//分配或然率
scores[GB2312]=gb2312probability(content);
scores[GBK]
=gbkprobability(content);
scores[BIG5]
=big5probability(content);
scores[UTF8]
=utf8probability(content);
scores[UNICODE]
=utf16probability(content);
scores[EUC_KR]
=euc_krprobability(content);
scores[ASCII]
=asciiprobability(content);
scores[SJIS]
=sjisprobability(content);
scores[EUC_JP]
=euc_jpprobability(content);
scores[UNKNOWN]
=0;

//概率比较
for(index=0;index<TOTALT;index++)...{
if(scores[index]>maxscore)...{
//索引
encoding=index;
//最大几率
maxscore=scores[index];
}

}

//返回或然率大于50%的数据
if(maxscore<=50)...{
encoding
=UNKNOWN;
}

returnencoding;
}


/***//**
*gb2312数据或然率计算
*
*
@paramcontent
*
@return
*/

privateintgb2312probability(byte[]content)...{
inti,rawtextlen=0;

intdbchars=1,gbchars=1;
longgbformat=0,totalformat=1;
floatrangeval=0,formatval=0;
introw,column;

//检查是否在亚洲汉字范围内
rawtextlen=content.length;
for(i=0;i<rawtextlen-1;i++)...{
if(content[i]>=0)...{
}
else...{
dbchars
++;
//汉字GB码由两个字节组成,每个字节的范围是0xA1~0xFE
if((byte)0xA1<=content[i]&&content[i]<=(byte)0xF7
&&(byte)0xA1<=content[i+1]
&&content[i+1]<=(byte)0xFE)...{
gbchars
++;
totalformat
+=500;
row
=content[i]+256-0xA1;
column
=content[i+1]+256-0xA1;
if(GB2312format[row][column]!=0)...{
gbformat
+=GB2312format[row][column];
}
elseif(15<=row&&row<55)...{
//在gb编码范围
gbformat+=200;
}


}

i
++;
}

}

rangeval
=50*((float)gbchars/(float)dbchars);
formatval
=50*((float)gbformat/(float)totalformat);

return(int)(rangeval+formatval);
}


/***//**
*gb2312或然率计算
*
*
@paramcontent
*
@return
*/

privateintgbkprobability(byte[]content)...{
inti,rawtextlen=0;

intdbchars=1,gbchars=1;
longgbformat=0,totalformat=1;
floatrangeval=0,formatval=0;
introw,column;
rawtextlen
=content.length;
for(i=0;i<rawtextlen-1;i++)...{
if(content[i]>=0)...{
}
else...{
dbchars
++;
if((byte)0xA1<=content[i]&&content[i]<=(byte)0xF7
&&//gb范围
(byte)0xA1<=content[i+1]
&&content[i+1]<=(byte)0xFE)...{
gbchars
++;
totalformat
+=500;
row
=content[i]+256-0xA1;
column
=content[i+1]+256-0xA1;
if(GB2312format[row][column]!=0)...{
gbformat
+=GB2312format[row][column];
}
elseif(15<=row&&row<55)...{
gbformat
+=200;
}


}
elseif((byte)0x81<=content[i]
&&content[i]<=(byte)0xFE&&//gb扩展区域
(((byte)0x80<=content[i+1]&&content[i+1]<=(byte)0xFE)||((byte)0x40<=content[i+1]&&content[i+1]<=(byte)0x7E)))...{
gbchars
++;
totalformat
+=500;
row
=content[i]+256-0x81;
if(0x40<=content[i+1]&&content[i+1]<=0x7E)...{
column
=content[i+1]-0x40;
}
else...{
column
=content[i+1]+256-0x40;
}

if(GBKformat[row][column]!=0)...{
gbformat
+=GBKformat[row][column];
}

}

i
++;
}

}

rangeval
=50*((float)gbchars/(float)dbchars);
formatval
=50*((float)gbformat/(float)totalformat);
return(int)(rangeval+formatval)-1;
}


/***//**
*解析为big5的或然率
*
*
@paramcontent
*
@return
*/

privateintbig5probability(byte[]content)...{
inti,rawtextlen=0;
intdbchars=1,bfchars=1;
floatrangeval=0,formatval=0;
longbfformat=0,totalformat=1;
introw,column;
rawtextlen
=content.length;
for(i=0;i<rawtextlen-1;i++)...{
if(content[i]>=0)...{
}
else...{
dbchars
++;
if((byte)0xA1<=content[i]
&&content[i]<=(byte)0xF9
&&(((byte)0x40<=content[i+1]&&content[i+1]<=(byte)0x7E)||((byte)0xA1<=content[i+1]&&content[i+1]<=(byte)0xFE)))...{
bfchars
++;
totalformat
+=500;
row
=content[i]+256-0xA1;
if(0x40<=content[i+1]&&content[i+1]<=0x7E)...{
column
=content[i+1]-0x40;
}
else...{
column
=content[i+1]+256-0x61;
}

if(Big5format[row][column]!=0)...{
bfformat
+=Big5format[row][column];
}
elseif(3<=row&&row<=37)...{
bfformat
+=200;
}

}

i
++;
}

}

rangeval
=50*((float)bfchars/(float)dbchars);
formatval
=50*((float)bfformat/(float)totalformat);

return(int)(rangeval+formatval);
}


/***//**
*在utf-8中的或然率
*
*
@paramcontent
*
@return
*/

privateintutf8probability(byte[]content)...{
intscore=0;
inti,rawtextlen=0;
intgoodbytes=0,asciibytes=0;
//检查是否为汉字可接受范围
rawtextlen=content.length;
for(i=0;i<rawtextlen;i++)...{
if((content[i]&(byte)0x7F)==content[i])...{
asciibytes
++;
}
elseif(-64<=content[i]&&content[i]<=-33
&&i+1<rawtextlen&&-128<=content[i+1]
&&content[i+1]<=-65)...{
goodbytes
+=2;
i
++;
}
elseif(-32<=content[i]&&content[i]<=-17
&&i+2<rawtextlen&&-128<=content[i+1]
&&content[i+1]<=-65&&-128<=content[i+2]
&&content[i+2]<=-65)...{
goodbytes
+=3;
i
+=2;
}

}


if(asciibytes==rawtextlen)...{
return0;
}


score
=(int)(100*((float)goodbytes/(float)(rawtextlen-asciibytes)));
//如果不高于98则减少到零
if(score>98)...{
returnscore;
}
elseif(score>95&&goodbytes>30)...{
returnscore;
}
else...{
return0;
}


}


/***//**
*检查为utf-16的或然率
*
*
@paramcontent
*
@return
*/

privateintutf16probability(byte[]content)...{

if(content.length>1
&&((byte)0xFE==content[0]&&(byte)0xFF==content[1])
||((byte)0xFF==content[0]&&(byte)0xFE==content[1]))...{
return100;
}

return0;
}


/***//**
*检查为ascii的或然率
*
*
@paramcontent
*
@return
*/

privateintasciiprobability(byte[]content)...{
intscore=75;
inti,rawtextlen;

rawtextlen
=content.length;

for(i=0;i<rawtextlen;i++)...{
if(content[i]<0)...{
score
=score-5;
}
elseif(content[i]==(byte)0x1B)...{//ESC(usedbyISO2022)
score=score-5;
}

if(score<=0)...{
return0;
}

}

returnscore;
}


/***//**
*检查为euc_kr的或然率
*
*
@paramcontent
*
@return
*/

privateinteuc_krprobability(byte[]content)...{
inti,rawtextlen=0;

intdbchars=1,krchars=1;
longkrformat=0,totalformat=1;
floatrangeval=0,formatval=0;
introw,column;
rawtextlen
=content.length;
for(i=0;i<rawtextlen-1;i++)...{
if(content[i]>=0)...{
}
else...{
dbchars
++;
if((byte)0xA1<=content[i]&&content[i]<=(byte)0xFE
&&(byte)0xA1<=content[i+1]
&&content[i+1]<=(byte)0xFE)...{
krchars
++;
totalformat
+=500;
row
=content[i]+256-0xA1;
column
=content[i+1]+256-0xA1;
if(EUC_KRformat[row][column]!=0)...{
krformat
+=EUC_KRformat[row][column];
}
elseif(15<=row&&row<55)...{
krformat
+=0;
}


}

i
++;
}

}

rangeval
=50*((float)krchars/(float)dbchars);
formatval
=50*((float)krformat/(float)totalformat);

return(int)(rangeval+formatval);
}


privateinteuc_jpprobability(byte[]content)...{
inti,rawtextlen=0;

intdbchars=1,jpchars=1;
longjpformat=0,totalformat=1;
floatrangeval=0,formatval=0;
introw,column;

rawtextlen
=content.length;
for(i=0;i<rawtextlen-1;i++)...{
if(content[i]>=0)...{
}
else...{
dbchars
++;
if((byte)0xA1<=content[i]&&content[i]<=(byte)0xFE
&&(byte)0xA1<=content[i+1]
&&content[i+1]<=(byte)0xFE)...{
jpchars
++;
totalformat
+=500;
row
=content[i]+256-0xA1;
column
=content[i+1]+256-0xA1;
if(JPformat[row][column]!=0)...{
jpformat
+=JPformat[row][column];
}
elseif(15<=row&&row<55)...{
jpformat
+=0;
}


}

i
++;
}

}

rangeval
=50*((float)jpchars/(float)dbchars);
formatval
=50*((float)jpformat/(float)totalformat);

return(int)(rangeval+formatval);
}


privateintsjisprobability(byte[]content)...{
inti,rawtextlen=0;

intdbchars=1,jpchars=1;
longjpformat=0,totalformat=1;
floatrangeval=0,formatval=0;
introw,column,adjust;

rawtextlen
=content.length;
for(i=0;i<rawtextlen-1;i++)...{
if(content[i]>=0)...{
}
else...{
dbchars
++;
if(i+1<content.length
&&(((byte)0x81<=content[i]&&content[i]<=(byte)0x9F)||((byte)0xE0<=content[i]&&content[i]<=(byte)0xEF))
&&(((byte)0x40<=content[i+1]&&content[i+1]<=(byte)0x7E)||((byte)0x80<=content[i+1]&&content[i+1]<=(byte)0xFC)))...{
jpchars
++;
totalformat
+=500;
row
=content[i]+256;
column
=content[i+1]+256;
if(column<0x9f)...{
adjust
=1;
if(column>0x7f)...{
column
-=0x20;
}
else...{
column
-=0x19;
}

}
else...{
adjust
=0;
column
-=0x7e;
}

if(row<0xa0)...{
row
=((row-0x70)<<1)-adjust;
}
else...{
row
=((row-0xb0)<<1)-adjust;
}


row
-=0x20;
column
=0x20;
if(row<JPformat.length&&column<JPformat[row].length
&&JPformat[row][column]!=0)...{
jpformat
+=JPformat[row][column];
}

i
++;
}
elseif((byte)0xA1<=content[i]
&&content[i]<=(byte)0xDF)...{
}


}

}

rangeval
=50*((float)jpchars/(float)dbchars);
formatval
=50*((float)jpformat/(float)totalformat);

return(int)(rangeval+formatval)-1;
}


}



EncodingTest.java
package org.loon.test.encoding;
/***/ /**
*<p>Title:LoonFramework</p>
*<p>Description:</p>
*<p>Copyright:Copyright(c)2008</p>
*<p>Company:LoonFramework</p>
*<p>License:
http://www.apache.org/licenses/LICENSE-2.0</p>
*
@authorchenpeng
*@email:[email protected]
*
@version0.1
*/

public class EncodingTest ... {
publicstaticvoidmain(Stringargc[])...{
ParseEncodingparse;

parse
=newParseEncoding();

System.out.println(
"中国大陆:");
System.out.println(
"测试字符串,编码格式="+parse.getEncoding("百度".getBytes()));
System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://www.baidu.com"));
System.out.println();
System.out.println(
"中国台湾:");
System.out.println(
"测试字符串,编码格式="+parse.getEncoding("い地チ瓣".getBytes()));
System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://tw.yahoo.com/"));
System.out.println(
"测试站点(繁体字,UTF编码),编码格式="+parse.getEncoding("http://www.javaworld.com.tw/jute"));
System.out.println();
System.out.println(
"日本:");
System.out.println(
"测试字符串,编码格式="+parse.getEncoding("その機能".getBytes()));
System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://www.4gamer.net"));
System.out.println();
System.out.println(
"自称蚩尤后代那群……:");
System.out.println(
"测试站点,编码格式="+parse.getEncoding("http://www.easyjava.co.kr/"));

}

}



输出结果:
java字符串编码类型获取


你可能感兴趣的:(java)