今天在项目中需要对AS400(IBM商用小型主机)主机档案进行转档操作,涉及到的主要是UTF-8(本地文件编码格式)与代码页(CodePage 937)之间的转换。
1. 读取本地文件srcFile的内容,每行补齐2048位(每行不会超过2048位,不足的以空格补齐)
2. 如果行内包含中文,每个中文字符算作两位;另外在AS400主机上,每段连续中文的前后各有一个控制字符0x0E、0x0F(shift out、shift in),因此每段中文需再加两位。例:“中文”算作6位(2×2+2)
以下是JAVA代码:
/**
 * $Revision: 1.0 $
 * $Date: Oct 14, 2010 $
 *
 * Author: Ian Chan
 * Date  : Oct 14, 2010
 */
package com.test.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.test.convert.Converter;

/**
 * Builds fixed-width records from a local text file and hands them to
 * {@link Converter} for encoding before writing the result to disk.
 *
 * <p>Every input line is padded with spaces to {@code LINE_LEN} positions.
 * Positions are counted the way the host stores them, not as Java characters:
 * each character matched by {@code DOUBLE_BYTE_RUN} counts as two positions,
 * and every contiguous run of such characters costs two extra positions for
 * the surrounding shift-out (0x0E) / shift-in (0x0F) control bytes.
 *
 * @author Ian Chan
 * @version 1.0
 */
public class ConverterHelper {

    private static final Log logger = LogFactory.getLog(ConverterHelper.class);

    /** Width of one output record, in host positions. */
    private static final int LINE_LEN = 2048;

    private static final String SPACE = " ";

    /** Encoding of the local source file; named explicitly instead of relying on the platform default. */
    private static final String SOURCE_ENCODING = "UTF-8";

    /**
     * One or more consecutive CJK ideographs (U+4E00..U+9FA5) or ideographic
     * (full-width) spaces (U+3000). The space is written as an escape so it
     * cannot be confused with an ASCII space. Compiled once and reused.
     */
    private static final Pattern DOUBLE_BYTE_RUN =
            Pattern.compile("([\\u4e00-\\u9fa5]|[\\u3000])+");

    /**
     * Reads {@code file} line by line and concatenates the space-padded lines.
     *
     * <p>Lines whose character count is {@code LINE_LEN} or more are skipped
     * (a warning is logged). Line terminators are not included in the result.
     * If an {@link IOException} occurs it is logged and whatever was read so
     * far is returned.
     *
     * @param file the source text file, read as UTF-8
     * @return the concatenation of all padded lines; empty if nothing could be read
     */
    public static String buildContent(File file) {
        StringBuilder content = new StringBuilder();
        InputStream is = null;
        try {
            is = new FileInputStream(file);
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(is, SOURCE_ENCODING));
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() >= LINE_LEN) {
                    logger.warn("Skipping line of " + line.length()
                            + " characters (must be shorter than " + LINE_LEN + ")");
                    continue;
                }
                StrLength sl = new StrLength(line, line.length());
                handleLine(sl);

                // Pad up to LINE_LEN host positions. If the computed length
                // already exceeds LINE_LEN, no padding is added.
                StringBuilder padded = new StringBuilder(LINE_LEN).append(sl.line);
                for (int i = sl.length; i < LINE_LEN; i++) {
                    padded.append(SPACE);
                }
                content.append(padded);
                System.out.println("len:" + padded.length() + ",line:" + padded);
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    logger.error(e.getMessage(), e);
                }
            }
        }
        return content.toString();
    }

    /**
     * Computes the host-side length of {@code strLength.line} and stores it in
     * {@code strLength.length}. The line text itself is left unchanged.
     *
     * <p>Starting from the character count, every run matched by
     * {@code DOUBLE_BYTE_RUN} adds its own character count again (two positions
     * per character instead of one) plus two for the shift-out/shift-in bytes.
     *
     * @param strLength holder whose {@code length} field is updated in place
     */
    private static void handleLine(StrLength strLength) {
        String line = strLength.line;
        Matcher matcher = DOUBLE_BYTE_RUN.matcher(line);
        int hostLength = line.length();
        // find() resumes right after the previous match, so a run that ends the
        // line terminates the loop instead of searching past the end of input.
        while (matcher.find()) {
            int runChars = matcher.end() - matcher.start();
            hostLength += runChars + 2;
        }
        strLength.length = hostLength;
    }

    /**
     * Builds the padded content of {@code srcFileName}, passes it through
     * {@code Converter.as400Encode}, writes the result to {@code destFileName}
     * using {@code Converter.ENCODING}, and prints it to standard output.
     * An {@link IOException} is logged rather than propagated.
     *
     * @param srcFileName  path of the local source file
     * @param destFileName path of the file to write
     */
    public static void convert(String srcFileName, String destFileName) {
        try {
            String as400content = Converter.as400Encode(buildContent(new File(srcFileName)));
            FileUtils.writeStringToFile(new File(destFileName), as400content, Converter.ENCODING);
            System.out.println(as400content);
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Mutable pair of a line and its computed host-side length.
     * Static so that no enclosing {@code ConverterHelper} instance is required.
     */
    static class StrLength {
        private String line;
        private int length;

        private StrLength(String line, int length) {
            this.line = line;
            this.length = length;
        }

        public String getLine() {
            return line;
        }

        public void setLine(String line) {
            this.line = line;
        }

        public int getLength() {
            return length;
        }

        public void setLength(int length) {
            this.length = length;
        }
    }

    /** Converts the hard-coded sample file and reports completion on standard output. */
    public static void main(String[] args) throws Exception {
        String srcFileName = "F:\\AS400_CN";
        String destFileName = "F:\\AS400_CN_CP937";
        convert(srcFileName, destFileName);
        System.out.println("Convert Finished");
    }
}
对于代码页(CodePage)的内容,参考如下:
Conversion between any of the following codepages is provided. 37 (=x0025) EBCDIC US English 273 (=x0111) EBCDIC German 277 (=x0115) EBCDIC Danish/Norwegian 278 (=x0116) EBCDIC Finnish/Swedish 280 (=x0118) EBCDIC Italian 284 (=x011C) EBCDIC Spanish 285 (=x011D) EBCDIC UK English 297 (=x0129) EBCDIC French 300 (=x012C) EBCDIC Japanese DBCS 301 (=x012D) Japanese PC DBCS 420 (=x01A4) EBCDIC Arabic 424 (=x01A8) EBCDIC Arabic 437 (=x01B5) PC-ASCII US 500 (=x01F4) EBCDIC International 803 (=x0323) Hebrew Set A 813 (=x032D) ISO8859-7 Greek 819 (=x0333) ISO8859-1 Western European 833 (=x0341) IBM-833: Korean 834 (=x0342) IBM-834: Korean Host DBCS 835 (=x0343) EBCDIC Traditional Chinese DBCS 836 (=x0344) EBCDIC Simplified Chinese SBCS 838 (=x0346) EBCDIC Thai SBCS 850 (=x0352) ISO8859-1 Western European 852 (=x0354) PC-ASCII Eastern European 855 (=x0357) PC-ASCII Cyrillic 856 (=x0358) PC-ASCII Hebrew 857 (=x0359) PC-ASCII Turkish 858 (=x035A) PC-ASCII Western European with Euro 860 (=x035C) PC-ASCII Portuguese 861 (=x035D) PC-ASCII Icelandic 862 (=x035E) PC-ASCII Hebrew 863 (=x035F) PC-ASCII Canadian French 864 (=x0360) PC-ASCII Arabic 865 (=x0361) PC-ASCII Scandinavian 866 (=x0362) PC-ASCII Cyrillic #2 868 (=x0364) PC-ASCII Urdu 869 (=x0365) PC-ASCII Greek 870 (=x0366) EBCDIC Eastern Europe 871 (=x0367) EBCDIC Icelandic 872 (=x0368) PC-ASCII Cyrillic with Euro 874 (=x036A) PC-ASCII Thai SBCS 875 (=x036B) EBCDIC Greek 880 (=x0370) EBCDIC Cyrillic 891 (=x037B) IBM-891: Korean 897 (=x0381) PC-ASCII Japan Data SBCS 903 (=x0387) PC Simplified Chinese SBCS 904 (=x0388) PC Traditional Chinese Data - SBCS 912 (=x0390) ISO8859-2 Eastern European 915 (=x0393) ISO8859-5 Cyrillic 916 (=x0394) ISO8859-8 Hebrew 918 (=x0396) EBCDIC Urdu 920 (=x0398) ISO8859-9 Turkish 921 (=x0399) ISO Baltic 922 (=x039A) ISO Estonian 923 (=x039B) ISO8859-15 Western Europe with euro (Latin 9) 924 (=x039C) EBCDIC Western Europe with euro 927 (=x039F) PC Traditional Chinese DBCS 928 (=x03A0) PC Simplified 
Chinese DBCS 930 (=x03A2) EBCDIC Japanese Katakana/Kanji mixed 932 (=x03A4) Japanese OS/2 933 (=x03A5) EBCDIC Korean Mixed 935 (=x03A7) EBCDIC Simplified Chinese Mixed 937 (=x03A9) EBCDIC Traditional Chinese Mixed 939 (=x03AB) EBCDIC Japanese Latin/Kanji mixed 941 (=x03AD) Japanese PC DBCS - for open systems 942 (=x03AE) Japanese PC Data Mixed - extended SBCS 943 (=x03AF) Japanese PC Mixed - for open systems 944 (=x03B0) Korean PC data Mixed - extended SBCS 946 (=x03B2) Simplified Chinese PC data Mixed - extended SBCS 947 (=x03B3) PC Traditional Chinese DBCS 948 (=x03B4) PC Traditional Chinese Mixed - extended SBCS 949 (=x03B5) PC Korean Mixed - KS code 950 (=x03B6) PC Traditional Chinese Mixed - big5 951 (=x03B7) PC Korean DBCS - KS code 970 (=x03CA) euc Korean 1004 (=x03EC) PC Data Latin1 1006 (=x03EE) ISO Urdu 1008 (=x03F0) ASCII Arabic 8-bit ISO 1025 (=x0401) EBCDIC Cyrillic 1026 (=x0402) EBCDIC Turkish 1027 (=x0403) EBCDIC Japanese Latin 1040 (=x0410) IBM-1040: Korean 1041 (=x0411) Japanese PC - extended SBCS 1042 (=x0412) PC Simplified Chinese - extended SBCS 1043 (=x0413) PC Traditional Chinese - extended SBCS 1046 (=x0416) PC-ASCII Arabic 1047 (=x0417) IBM-1047: Western European 1051 (=x041B) ASCII roman8 for HP Western European 1088 (=x0440) PC Korean SBCS - KS code 1089 (=x0441) ISO8859-6 Arabic 1097 (=x0449) EBCDIC Farsi 1098 (=x044A) PC-ASCII Farsi 1112 (=x0458) EBCDIC Baltic (Latvian/Lithuanian) 1114 (=x045A) PC Traditional Chinese - big 5 SBCS 1115 (=x045B) PC Simplified Chinese SBCS 1122 (=x0462) EBCDIC Estonian 1123 (=x0463) EBCDIC Ukrainian 1124 (=x0464) UNIX-ASCII Ukrainian 1131 (=x046B) PC-ASCII Belarus 1140 (=x0474) EBCDIC USA, with euro (like 037) 1141 (=x0475) EBCDIC Austria, Germany, with euro (like 273) 1142 (=x0476) EBCDIC Denmark, Norway, with euro (like 277) 1143 (=x0477) EBCDIC Finland, Sweden, with euro (like 278) 1144 (=x0478) EBCDIC Italy, with euro (like 280) 1145 (=x0479) EBCDIC Spain, with euro (like 284) 1146 (=x047A) EBCDIC UK, 
with euro (like 285) 1147 (=x047B) EBCDIC France, with euro (like 297) 1148 (=x047C) EBCDIC International, with euro (like 500) 1149 (=x047D) EBCDIC Iceland, with euro (like 871) 1200 (=x04B0) Unicode - UCS-2 1208 (=x04B8) Unicode - UTF-8 1250 (=x04E2) Windows - Eastern European 1251 (=x04E3) Windows - Cyrillic 1252 (=x04E4) Windows - Western European 1253 (=x04E5) Windows - Greek 1254 (=x04E6) Windows - Turkish 1255 (=x04E7) Windows - Hebrew 1256 (=x04E8) Windows - Arabic 1257 (=x04E9) Windows - Baltic Rim 1275 (=x04FB) Apple - Western European 1280 (=x0500) Apple - Greek 1281 (=x0501) Apple - Turkish 1282 (=x0502) Apple - Eastern European 1283 (=x0503) Apple - Cyrillic 1284 (=x0504) IBM-504: Eastern European 1285 (=x0505) IBM-505: Eastern European 1363 (=x0553) Windows Korean PC Mixed including 11,172 full hangul 1364 (=x0554) Korean Host Mixed extended including 11,172 full hangul 1380 (=x0564) PC Simplified Chinese DBCS 1381 (=x0565) PC Simplified Chinese Mixed 1383 (=x0567) euc Simplified Chinese Mixed 1386 (=x056A) PC Simplified Chinese Data GBK Mixed 1388 (=x056C) DBCS Host Simplified Chinese Data GBK Mixed 5346 (=x14E2) Windows-Eastern European with Euro (like 1250) 5347 (=x14E3) Windows - Cyrillic with Euro (like 1251) 5348 (=x14E4) Windows-Western European with Euro (like 1252) 5349 (=x14E5) Windows - Greek with Euro (like 1253) 5350 (=x14E6) Windows - Turkish with Euro (like 1254) 5351 (=x14E7) Windows - Hebrew with Euro (like 1255) 5352 (=x14E8) Windows - Arabic with Euro (like 1256) 5353 (=x14E9) Windows - Baltic Rim with Euro (like 1257) 5354 (=x14EA) Windows - Vietnamese with Euro (like 1258)
-- 2010-10-22更新
今天测试时又发现了一个问题:中文全角空格(U+3000)不在"\\u4e00-\\u9fa5"范围内,所以把检验中文的正则表达式改成了下面的形式(方括号中是全角空格,也可以写成转义形式 \\u3000,以免与半角空格混淆):
([\\u4e00-\\u9fa5]|[ ])+