处理Oracle xmltype字段类型不兼容的特殊字符

1、调用方法removeInvalidXMLCharacter过滤xmltype字段类型不兼容的特殊字符

unicode增补区字符 [#x10000-#x10FFFF],这区间的字符需过滤掉,不保存到xmltype类型的字段中

       /**
       * <pre>
       * 删掉Oracle xmltype字段类型不兼容的特殊字符
       * 参考资料:
       * <a href="http://stackoverflow.com/questions/4237625/removing-invalid-xml-characters-from-a-string-in-java">http://stackoverflow.com/questions/4237625/removing-invalid-xml-characters-from-a-string-in-java</a>
       * <a href="http://www.oracle.com/technetwork/articles/javase/supplementary-142654.html">http://www.oracle.com/technetwork/articles/javase/supplementary-142654.html</a>
       * <a href="http://www.w3.org/TR/REC-xml/#charsets">http://www.w3.org/TR/REC-xml/#charsets</a>
       * <a href="http://en.wikipedia.org/wiki/Unicode">http://en.wikipedia.org/wiki/Unicode</a>
       * <a href="http://www.unicode.org/Public/UCD/latest/charts/CodeCharts.pdf">http://www.unicode.org/Public/UCD/latest/charts/CodeCharts.pdf</a>
       * </pre>
       * @param text
       * @return
       * @author wdd 2014-12-8
       */
    public static String removeInvalidXMLCharacter(String text) {
          
          if (null == text || text.isEmpty()) {
              return "";
          }
        
//          output("text", text);
//          outputBytes(text);
//          outputCharSequence(text);

          try {//删除UTF-8 BOM(字节顺序标记(ByteOrderMark) 0xEFBBBF)
            byte[] srcBytes = text.getBytes();
            //(byte)0xef, (byte)0xbb, (byte)0xbf  --->  -17 -69 -65 
            if (srcBytes.length>=3 && srcBytes[0] == -17 && srcBytes[1] == -69 && srcBytes[2] == -65) {
                byte[] destBytes = new byte[srcBytes.length - 3];
                System.arraycopy(srcBytes, 3, destBytes, 0, destBytes.length);
                text = new String(destBytes, "UTF-8");
            }
          } catch (UnsupportedEncodingException e) {
             e.printStackTrace();
          }
          
          final int len = text.length();
//          output("length", len);
          
          char current = 0;
          int codePoint = 0;
          StringBuilder sb = new StringBuilder();
          for (int i = 0; i < len; i++) {
              current = text.charAt(i);
//              output("current", current);
              
              if(escapeXmlEntity(text, len, i, current, sb)){
//                  output("continue","");
                  continue;
              }
              
              /*boolean surrogate = false;//codePoint范围在[#x10000-#x10FFFF]则为true
              if (Character.isHighSurrogate(current)
                      && i + 1 < len && Character.isLowSurrogate(text.charAt(i + 1))) {
                  surrogate = true;
                  codePoint = text.codePointAt(i++);
                  output("LowSurrogate:", text.charAt(i));
              } else {
                  codePoint = current;
              }

              if (isValidXMLCharacter(codePoint)) {
                  sb.append(current);
                  if (surrogate) {
                      sb.append(text.charAt(i));
                  }
              }*/
              
              if(Character.isHighSurrogate(current) || Character.isLowSurrogate(current)){
//                  output("Surrogate", current);
                  continue;
              }else{
                  codePoint = current;
              }
             
              if(isValidXMLCharacter(codePoint)){//判断是否有效字符
                  sb.append(current);
              }
              
//              output("codePoint", codePoint);
          }
          
//          output("text", sb.toString());
//          outputBytes(sb.toString());
//          outputCharSequence(sb.toString());

          return sb.toString();
        }

    /**
     * @param text
     * @param len = text.length()
     * @param i
     * @param current = text.charAt(i)
     * @param sb
     * @return 返回ture表示:text中当前索引i所表示的字符“&”在sb中存储为“&amp;amp;”,否则返回false
     * @author wdd 2014-12-8
     */
    private static boolean escapeXmlEntity(String text, final int len, int i,
            char current, StringBuilder sb) {
        if (current == '&') {
            boolean isAllow = false;
            for (String str : ALLOW_STR) {
                if (i + str.length() > len) {
                    continue;
                }

                int j = 0;
                for (; j < str.length(); j++) {
                    if (text.charAt(i + j) != str.charAt(j)) {
                        break;
                    }
                }

                if (j == str.length()) {
                    isAllow = true;
                    break;
                }
            }
            if (!isAllow) {
                sb.append("&amp;");
                return true;
            }
        }
        
        return false;
    }

    
    /**<pre>
     *http://www.w3.org/TR/REC-xml/#charsets
     *http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf
     *http://www.unicode.org/reports/tr36/
Character Range: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | <del>[#x10000-#x10FFFF]</del>

Document authors are encouraged to avoid "compatibility characters", as defined in section 2.3 of [Unicode]. The characters defined in the following ranges are also discouraged. They are either control characters or permanently undefined Unicode characters:
[#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDEF],
<del>[#x1FFFE-#x1FFFF], [#x2FFFE-#x2FFFF], [#x3FFFE-#x3FFFF],
[#x4FFFE-#x4FFFF], [#x5FFFE-#x5FFFF], [#x6FFFE-#x6FFFF],
[#x7FFFE-#x7FFFF], [#x8FFFE-#x8FFFF], [#x9FFFE-#x9FFFF],
[#xAFFFE-#xAFFFF], [#xBFFFE-#xBFFFF], [#xCFFFE-#xCFFFF],
[#xDFFFE-#xDFFFF], [#xEFFFE-#xEFFFF], [#xFFFFE-#xFFFFF],
[#x10FFFE-#x10FFFF]</del>

"[\u007f-\u0084\u0086-\u009f\ufdd0-\ufdef" +
"\ud83f\udffe-\ud83f\udfff" + "\ud87f\udffe-\ud87f\udfff" + "\ud8bf\udffe-\ud8bf\udfff" + "\ud8ff\udffe-\ud8ff\udfff" +
"\ud93f\udffe-\ud93f\udfff" + "\ud97f\udffe-\ud97f\udfff" + "\ud9bf\udffe-\ud9bf\udfff" + "\ud9ff\udffe-\ud9ff\udfff" +
"\uda3f\udffe-\uda3f\udfff" + "\uda7f\udffe-\uda7f\udfff" + "\udabf\udffe-\udabf\udfff" + "\udaff\udffe-\udaff\udfff" +
"\udb3f\udffe-\udb3f\udfff" + "\udb7f\udffe-\udb7f\udfff" + "\udbbf\udffe-\udbbf\udfff" + "\udbff\udffe-\udbff\udfff]";

</pre>
     * @param codePoint
     * @return
     * @author wdd 2014-12-8
     */
    public static boolean isValidXMLCharacter(int codePoint) {
        boolean a = (codePoint == 0x9) || (codePoint == 0xA) || (codePoint == 0xD)
                  || ((codePoint >= 0x20) && (codePoint <= 0xD7FF))
                  || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD))
                  /*|| ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))*/;
        if(false == a){
            return false;
        }else{
            boolean b = ((codePoint >= 0x7F) && (codePoint <= 0x84)) || ((codePoint >= 0x86) && (codePoint <= 0x9F)) ||((codePoint >= 0xFDD0) && (codePoint <= 0xFDEF))
                    /*|| codePoint == 0x1FFFE || codePoint == 0x1FFFF
                    || codePoint == 0x2FFFE || codePoint == 0x2FFFF
                    || codePoint == 0x3FFFE || codePoint == 0x3FFFF
                    || codePoint == 0x4FFFE || codePoint == 0x4FFFF
                    || codePoint == 0x5FFFE || codePoint == 0x5FFFF
                    || codePoint == 0x6FFFE || codePoint == 0x6FFFF
                    || codePoint == 0x7FFFE || codePoint == 0x7FFFF
                    || codePoint == 0x8FFFE || codePoint == 0x8FFFF
                    || codePoint == 0x9FFFE || codePoint == 0x9FFFF
                    || codePoint == 0xAFFFE || codePoint == 0xAFFFF
                    || codePoint == 0xBFFFE || codePoint == 0xBFFFF
                    || codePoint == 0xCFFFE || codePoint == 0xCFFFF
                    || codePoint == 0xDFFFE || codePoint == 0xDFFFF
                    || codePoint == 0xEFFFE || codePoint == 0xEFFFF
                    || codePoint == 0xFFFFE || codePoint == 0xFFFFF
                    || codePoint ==0x10FFFE || codePoint ==0x10FFFF*/;
            return !b;
        }
    }
          
        private static void output(String logtype, Object s){
            int codePoint = 0;
            if(Character.class.equals(s.getClass())){
                codePoint = ((Character)s).charValue();
                System.out.println(logtype+":\t"+codePoint+"\t"+Integer.toHexString(codePoint)+"\t"+Integer.toBinaryString(codePoint));
            }else if(Integer.class.equals(s.getClass())){
                codePoint = (Integer)s;
                System.out.println(logtype+":\t"+codePoint+"\t"+Integer.toHexString(codePoint)+"\t"+Integer.toBinaryString(codePoint));
            }else{
                System.out.println(logtype+":\t"+s);
            }
        }
        
        private static void outputBytes(byte[] nb) {
            for(int i=0;i<nb.length;i++){
                System.out.println("outputBytes:"+nb[i]+"\t"+Integer.toHexString(nb[i])+"\t"+Integer.toBinaryString(nb[i]));
            }
        }

        private static void outputBytes(String utf) {
            byte[] nb = utf.getBytes();
            for(int i=0;i<nb.length;i++){
                System.out.println("outputBytes:"+nb[i]+"\t"+Integer.toHexString(nb[i])+"\t"+Integer.toBinaryString(nb[i]));
            }
        }

        private static void outputCharSequence(String utf) {
            CharSequence charSeq = utf;
            for(int i=0;i<charSeq.length();i++){
                System.out.println("outputCharSequence:"+Integer.toHexString(charSeq.charAt(i))+"\t"+Integer.toBinaryString(charSeq.charAt(i)));
            }
        }


2、通过存储过程TEST_ERROR_PRC查找xmltype中造成无法解析的字符

create table TEST_TABLE
(
  id NUMBER,
  c_xml xmltype
);

create table TEST_ERROR
(
  id NUMBER,
  code     NUMBER,
  errm     VARCHAR2(4000)
);
 
create or replace procedure TEST_ERROR_PRC is
begin
  declare
    /*
    ORA-29913: error in executing ODCIEXTTABLEPOPULATE callout
    ORA-31061: XDB error: XML event error
    ORA-19202: Error occurred in XML processing
    ORA-31011: XML 语法分析失败
    */
    e_29913 exception;
    pragma exception_init(e_29913, -29913);
    e_31061 exception;
    pragma exception_init(e_31061, -31061);
    e_19202 exception;
    pragma exception_init(e_19202, -19202);
    e_31011 exception;
    pragma exception_init(e_31011, -31011);
  
    x         xmltype;
    v_code    number;
    v_sqlerrm varchar2(4000);
    v_count number :=0;
  begin
    for v in (select id from TEST_TABLE where c_xml is not null order by id)
       loop
      begin
        select xmltype(t.c_xml.getclobval())
          into x
          from TEST_TABLE t
         where t.id = v.id;
        --dbms_output.put_line('--'||v.id);
        v_count := v_count+1;
      exception
        when others then
          begin
            --dbms_output.put_line(v.id|| ':' ||sqlcode||':'||sqlerrm);
            v_code    := sqlcode;
            v_sqlerrm := sqlerrm;
            insert into TEST_ERROR
              (id, code, errm)
            values
              (v.id, v_code, v_sqlerrm);
            commit;
          end;
      end;
    end loop;
    
  end;
end;


更多技术文章,欢迎访问我的个人博客 http://www.wangdandong.com/

你可能感兴趣的:(java,oracle,unicode,xmltype)