1、调用方法removeInvalidXMLCharacter过滤xmltype字段类型不兼容的特殊字符
unicode增补区字符 [#x10000-#x10FFFF],这区间的字符需过滤掉,不保存到xmltype类型的字段中
/** * <pre> * 删掉Oracle xmltype字段类型不兼容的特殊字符 * 参考资料: * <a href="http://stackoverflow.com/questions/4237625/removing-invalid-xml-characters-from-a-string-in-java">http://stackoverflow.com/questions/4237625/removing-invalid-xml-characters-from-a-string-in-java</a> * <a href="http://www.oracle.com/technetwork/articles/javase/supplementary-142654.html">http://www.oracle.com/technetwork/articles/javase/supplementary-142654.html</a> * <a href="http://www.w3.org/TR/REC-xml/#charsets">http://www.w3.org/TR/REC-xml/#charsets</a> * <a href="http://en.wikipedia.org/wiki/Unicode">http://en.wikipedia.org/wiki/Unicode</a> * <a href="http://www.unicode.org/Public/UCD/latest/charts/CodeCharts.pdf">http://www.unicode.org/Public/UCD/latest/charts/CodeCharts.pdf</a> * </pre> * @param text * @return * @author wdd 2014-12-8 */ public static String removeInvalidXMLCharacter(String text) { if (null == text || text.isEmpty()) { return ""; } // output("text", text); // outputBytes(text); // outputCharSequence(text); try {//删除UTF-8 BOM(字节顺序标记(ByteOrderMark) 0xEFBBBF) byte[] srcBytes = text.getBytes(); //(byte)0xef, (byte)0xbb, (byte)0xbf ---> -17 -69 -65 if (srcBytes.length>=3 && srcBytes[0] == -17 && srcBytes[1] == -69 && srcBytes[2] == -65) { byte[] destBytes = new byte[srcBytes.length - 3]; System.arraycopy(srcBytes, 3, destBytes, 0, destBytes.length); text = new String(destBytes, "UTF-8"); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } final int len = text.length(); // output("length", len); char current = 0; int codePoint = 0; StringBuilder sb = new StringBuilder(); for (int i = 0; i < len; i++) { current = text.charAt(i); // output("current", current); if(escapeXmlEntity(text, len, i, current, sb)){ // output("continue",""); continue; } /*boolean surrogate = false;//codePoint范围在[#x10000-#x10FFFF]则为true if (Character.isHighSurrogate(current) && i + 1 < len && Character.isLowSurrogate(text.charAt(i + 1))) { surrogate = true; codePoint = text.codePointAt(i++); output("LowSurrogate:", text.charAt(i)); } else { codePoint = current; } if (isValidXMLCharacter(codePoint)) { sb.append(current); if (surrogate) { sb.append(text.charAt(i)); } }*/ if(Character.isHighSurrogate(current) || Character.isLowSurrogate(current)){ // output("Surrogate", current); continue; }else{ codePoint = current; } if(isValidXMLCharacter(codePoint)){//判断是否有效字符 sb.append(current); } // output("codePoint", codePoint); } // output("text", sb.toString()); // outputBytes(sb.toString()); // outputCharSequence(sb.toString()); return sb.toString(); } /** * @param text * @param len = text.length() * @param i * @param current = text.charAt(i) * @param sb * @return 返回ture表示:text中当前索引i所表示的字符“&”在sb中存储为“&amp;”,否则返回false * @author wdd 2014-12-8 */ private static boolean escapeXmlEntity(String text, final int len, int i, char current, StringBuilder sb) { if (current == '&') { boolean isAllow = false; for (String str : ALLOW_STR) { if (i + str.length() > len) { continue; } int j = 0; for (; j < str.length(); j++) { if (text.charAt(i + j) != str.charAt(j)) { break; } } if (j == str.length()) { isAllow = true; break; } } if (!isAllow) { sb.append("&"); return true; } } return false; } /**<pre> *http://www.w3.org/TR/REC-xml/#charsets *http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf *http://www.unicode.org/reports/tr36/ Character Range: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | <del>[#x10000-#x10FFFF]</del> Document authors are encouraged to avoid "compatibility characters", as defined in section 2.3 of [Unicode]. The characters defined in the following ranges are also discouraged. They are either control characters or permanently undefined Unicode characters: [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDEF], <del>[#x1FFFE-#x1FFFF], [#x2FFFE-#x2FFFF], [#x3FFFE-#x3FFFF], [#x4FFFE-#x4FFFF], [#x5FFFE-#x5FFFF], [#x6FFFE-#x6FFFF], [#x7FFFE-#x7FFFF], [#x8FFFE-#x8FFFF], [#x9FFFE-#x9FFFF], [#xAFFFE-#xAFFFF], [#xBFFFE-#xBFFFF], [#xCFFFE-#xCFFFF], [#xDFFFE-#xDFFFF], [#xEFFFE-#xEFFFF], [#xFFFFE-#xFFFFF], [#x10FFFE-#x10FFFF]</del> "[\u007f-\u0084\u0086-\u009f\ufdd0-\ufdef" + "\ud83f\udffe-\ud83f\udfff" + "\ud87f\udffe-\ud87f\udfff" + "\ud8bf\udffe-\ud8bf\udfff" + "\ud8ff\udffe-\ud8ff\udfff" + "\ud93f\udffe-\ud93f\udfff" + "\ud97f\udffe-\ud97f\udfff" + "\ud9bf\udffe-\ud9bf\udfff" + "\ud9ff\udffe-\ud9ff\udfff" + "\uda3f\udffe-\uda3f\udfff" + "\uda7f\udffe-\uda7f\udfff" + "\udabf\udffe-\udabf\udfff" + "\udaff\udffe-\udaff\udfff" + "\udb3f\udffe-\udb3f\udfff" + "\udb7f\udffe-\udb7f\udfff" + "\udbbf\udffe-\udbbf\udfff" + "\udbff\udffe-\udbff\udfff]"; </pre> * @param codePoint * @return * @author wdd 2014-12-8 */ public static boolean isValidXMLCharacter(int codePoint) { boolean a = (codePoint == 0x9) || (codePoint == 0xA) || (codePoint == 0xD) || ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) /*|| ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))*/; if(false == a){ return false; }else{ boolean b = ((codePoint >= 0x7F) && (codePoint <= 0x84)) || ((codePoint >= 0x86) && (codePoint <= 0x9F)) ||((codePoint >= 0xFDD0) && (codePoint <= 0xFDEF)) /*|| codePoint == 0x1FFFE || codePoint == 0x1FFFF || codePoint == 0x2FFFE || codePoint == 0x2FFFF || codePoint == 0x3FFFE || codePoint == 0x3FFFF || codePoint == 0x4FFFE || codePoint == 0x4FFFF || codePoint == 0x5FFFE || codePoint == 0x5FFFF || codePoint == 0x6FFFE || codePoint == 0x6FFFF || codePoint == 0x7FFFE || codePoint == 0x7FFFF || codePoint == 0x8FFFE || codePoint == 0x8FFFF || codePoint == 0x9FFFE || codePoint == 0x9FFFF || codePoint == 0xAFFFE || codePoint == 0xAFFFF || codePoint == 0xBFFFE || codePoint == 0xBFFFF || codePoint == 0xCFFFE || codePoint == 0xCFFFF || codePoint == 0xDFFFE || codePoint == 0xDFFFF || codePoint == 0xEFFFE || codePoint == 0xEFFFF || codePoint == 0xFFFFE || codePoint == 0xFFFFF || codePoint ==0x10FFFE || codePoint ==0x10FFFF*/; return !b; } } private static void output(String logtype, Object s){ int codePoint = 0; if(Character.class.equals(s.getClass())){ codePoint = ((Character)s).charValue(); System.out.println(logtype+":\t"+codePoint+"\t"+Integer.toHexString(codePoint)+"\t"+Integer.toBinaryString(codePoint)); }else if(Integer.class.equals(s.getClass())){ codePoint = (Integer)s; System.out.println(logtype+":\t"+codePoint+"\t"+Integer.toHexString(codePoint)+"\t"+Integer.toBinaryString(codePoint)); }else{ System.out.println(logtype+":\t"+s); } } private static void outputBytes(byte[] nb) { for(int i=0;i<nb.length;i++){ System.out.println("outputBytes:"+nb[i]+"\t"+Integer.toHexString(nb[i])+"\t"+Integer.toBinaryString(nb[i])); } } private static void outputBytes(String utf) { byte[] nb = utf.getBytes(); for(int i=0;i<nb.length;i++){ System.out.println("outputBytes:"+nb[i]+"\t"+Integer.toHexString(nb[i])+"\t"+Integer.toBinaryString(nb[i])); } } private static void outputCharSequence(String utf) { CharSequence charSeq = utf; for(int i=0;i<charSeq.length();i++){ System.out.println("outputCharSequence:"+Integer.toHexString(charSeq.charAt(i))+"\t"+Integer.toBinaryString(charSeq.charAt(i))); } }
2、通过存储过程TEST_ERROR_PRC查找xmltype中造成无法解析的字符
create table TEST_TABLE ( id NUMBER, c_xml xmltype ); create table TEST_ERROR ( id NUMBER, code NUMBER, errm VARCHAR2(4000) ); create or replace procedure TEST_ERROR_PRC is begin declare /* ORA-29913: error in executing ODCIEXTTABLEPOPULATE callout ORA-31061: XDB error: XML event error ORA-19202: Error occurred in XML processing ORA-31011: XML 语法分析失败 */ e_29913 exception; pragma exception_init(e_29913, -29913); e_31061 exception; pragma exception_init(e_31061, -31061); e_19202 exception; pragma exception_init(e_19202, -19202); e_31011 exception; pragma exception_init(e_31011, -31011); x xmltype; v_code number; v_sqlerrm varchar2(4000); v_count number :=0; begin for v in (select id from TEST_TABLE where c_xml is not null order by id) loop begin select xmltype(t.c_xml.getclobval()) into x from TEST_TABLE t where t.id = v.id; --dbms_output.put_line('--'||v.id); v_count := v_count+1; exception when others then begin --dbms_output.put_line(v.id|| ':' ||sqlcode||':'||sqlerrm); v_code := sqlcode; v_sqlerrm := sqlerrm; insert into TEST_ERROR (id, code, errm) values (v.id, v_code, v_sqlerrm); commit; end; end; end loop; end; end;
更多技术文章,欢迎访问我的个人博客 http://www.wangdandong.com/