HTML源代码字符转换

   在浏览器中,我们打开一个网页,比如http://www.google.cn/music/album?id=B32ac5ed507934bbf,

然后点击菜单中的“查看”按钮,我们就可以看到网页的源代码(当然这个源代码与通过HttpURLConnection

拿到的没那么完全),我们取一小段来看:

   </span> &#21333;&#26354; - &#22240;&#20026;&#29233;&#24773; - &#35895;&#27468;&#38899;&#20048;&#25628;&#32034;<span class="webkit-html-tag">

这些“&#21333;”之类的字符是什么意思?它们都是汉字的Unicode编码(HTML数字字符引用),我们需要将其解析出来才知道它代表的

含义。下面是解析它的Java代码:

    /** * @author tinyfun * @function 将网页源代码中的汉字Unicode码转换成汉字 */ public class HtmlUnicode { private static HtmlUnicode m_instance = null; private HtmlUnicode(){ } public static HtmlUnicode getInstance(){ if(m_instance == null){ m_instance = new HtmlUnicode(); } return m_instance; } //除了处理汉字外,还加上了处理空格,&,","等符号 public String decodeUnicode(String strData) { int start = 0; int end = 0; final StringBuffer buffer = new StringBuffer(); strData = replace(strData,"&","&"); strData = replace(strData," "," "); strData = replace(strData,"“","”"); strData = replace(strData,"”","“"); while (start > -1) { int system = 10;//进制 if(start==0){ int t = strData.indexOf("&#"); if(start!=t){ start = t; } if(t != -1){ buffer.append(strData.substring(0,t)); } } end = strData.indexOf(";", start + 2); String charStr = ""; if (end != -1 && start != -1) { charStr = strData.substring(start + 2, end); //判断进制 char s = charStr.charAt(0); if(s=='x' || s=='X'){ system = 16; charStr = charStr.substring(1); } //转换 try{ char letter = (char) Integer.parseInt(charStr,system); buffer.append(new Character(letter).toString()); }catch(NumberFormatException e){ e.printStackTrace(); } } //处理当前unicode字符到下一个unicode字符之间的非unicode字符 start = strData.indexOf("&#",end); if(start-end>1){ buffer.append(strData.substring(end+1, start)); } //处理最后面的非unicode字符 if(start==-1){ int length = strData.length(); if(end+1!=length){ buffer.append(strData.substring(end+1,length)); } } } return buffer.toString(); } public static String replace(String strSource, String strFrom, String strTo) { if (strSource == null) { return null; } int i =0; if((i = strSource.indexOf(strFrom, i))>= 0) { char[] cSrc = strSource.toCharArray(); char[] cTo = strTo.toCharArray(); int len = strFrom.length(); StringBuffer buf = new StringBuffer(cSrc.length); buf.append(cSrc,0,i).append(cTo); i +=len; int j= i; while((i = strSource.indexOf(strFrom,i))> 0){ buf.append(cSrc,j, i-j).append(cTo); i += len; j = i; } buf.append(cSrc,j,cSrc.length-j); return 
buf.toString(); } return strSource; }

你可能感兴趣的:(Java)