Java处理UTF-8带BOM的文本的读写

什么是BOM

BOM(byte-order mark),即字节顺序标记,它是插入到以UTF-8、UTF-16或UTF-32编码的Unicode文件开头的特殊标记,用来识别Unicode文件的编码类型。对于UTF-8来说,BOM并不是必须的,因为BOM主要用来标记多字节编码文件的编码类型和字节顺序(big-endian或little-endian),而UTF-8以单字节为编码单元,不存在字节顺序问题。

BOMs 文件头:
   00 00 FE FF    = UTF-32, big-endian
   FF FE 00 00    = UTF-32, little-endian
   EF BB BF       = UTF-8
   FE FF          = UTF-16, big-endian
   FF FE          = UTF-16, little-endian



下面举个例子,针对UTF-8的文件BOM做个处理:

String xmla = StringFileToolkit.file2String(new File(“D:\\projects\\mailpost\\src\\a.xml”),“UTF-8”);

byte[] b = xmla.getBytes(“UTF-8”);

String xml = new String(b,3,b.length-3,“UTF-8”);

..............

思路是:先按照UTF-8编码读取文件,再把内容转回字节数组,跳过开头的三个字节(即UTF-8的BOM:EF BB BF),重新构建一个新的字符串,然后用Dom4j解析处理,这样就不会报错了。

其他编码方式的处理思路类似,其实可以写一个通用的、能自动识别BOM的工具,去掉BOM信息后返回字符串。

不过这个处理过程已经有牛人解决过了:http://koti.mbnet.fi/akini/java/unicodereader/

Example code using UnicodeReader class
Here is an example method that reads a text file. It recognizes the BOM marker and skips it while reading.

//import ‍http://koti.mbnet.fi/akini/java/unicodereader/UnicodeReader.java.txt
   /**
    * Reads a whole text file into a character array.
    * <p>
    * Decoding is delegated to {@code UnicodeReader}; {@code null} is passed as
    * its default encoding so that the BOM marker is auto-recognized, or the
    * system default is used when no marker is found.
    *
    * @param file path of the file to read
    * @return all characters produced by the reader
    * @throws IOException if the file cannot be opened, read or closed
    */
   public static char[] loadFile(String file) throws IOException {
      char[] buffer = new char[16 * 1024];   // 16k buffer
      // In-memory sink: holds no system resource, so it needs no close().
      CharArrayWriter writer = new CharArrayWriter();

      // try-with-resources closes every resource (in reverse order) even when
      // reading fails, instead of chaining close() calls behind an empty catch.
      try (FileInputStream in = new FileInputStream(file);
           UnicodeReader r = new UnicodeReader(in, null);
           BufferedReader reader = new BufferedReader(r)) {
         int read;
         while ((read = reader.read(buffer)) != -1) {
            writer.write(buffer, 0, read);
         }
      }
      return writer.toCharArray();
   }


Example code to write UTF-8 with bom marker
Write the BOM marker bytes at the start of an empty file, and all proper text editors will have no problem choosing the correct charset when reading it. Java's OutputStreamWriter does not write the UTF-8 BOM marker bytes by itself.


   /**
    * Writes text to a file as UTF-8, putting a UTF-8 BOM in front when the
    * file is empty at the time of writing.
    *
    * @param file   path of the file to write
    * @param data   text to write; {@code null} writes no text (the BOM is still
    *               written to an empty file)
    * @param append {@code true} to append to existing content, {@code false}
    *               to overwrite it
    * @throws IOException if the file cannot be opened, written, flushed or closed
    */
   public static void saveFile(String file, String data, boolean append) throws IOException {
      File f = new File(file);

      // try-with-resources closes both the writer and the stream on every path
      // and reports a failed flush/close instead of silently dropping it.
      try (FileOutputStream fos = new FileOutputStream(f, append);
           BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"))) {
         // write UTF8 BOM mark if file is empty; it goes straight to the stream
         // before anything is written through the (still empty) buffered writer
         if (f.length() < 1) {
            final byte[] bom = new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
            fos.write(bom);
         }

         if (data != null) {
            bw.write(data);
         }
      }
   }
 



实际应用:
package com.dayo.gerber;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.util.Properties;

/**
 * Generates a QTP script file from a template file.
 * <p>
 * Placeholders of the form <code>{#name}</code> found in the template are
 * replaced with values from the gerber configuration (loaded from
 * {@code config/gerber4yy.properties} through {@code ConfigHelper}) and,
 * when supplied, from the "big board" properties handed to the constructor.
 * The result is written as UTF-8 preceded by a UTF-8 BOM.
 * <p>
 * Note: the loaded configuration and the derived dialog title live in static
 * fields that every constructor call overwrites.
 *
 * @author liufei (刘飞)
 */
public class Generate4YYQTPScript {

	private static final String ENCODING = "UTF-8";
	/* UTF-8 byte-order mark written in front of the generated script */
	private static final byte[] UTF8_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
	private static final String GERBER_CONFIG = "config/gerber4yy.properties";

	private static Properties GERBER_CONFIG_PROPS = null;
	private static final String GERBER_FORMAT_DIALOG_TITLE_SCRIPT = "{#GERBER_FORMAT_DIALOG_TITLE}";
	/* the dialog title is cut to at most this many characters (limit taken over from the original code) */
	private static final int GERBER_FORMAT_DIALOG_TITLE_MAX_LENGTH = 17;
	private static String GERBER_FORMAT_DIALOG_TITLE = "";

	/* gerber properties parameter keys config */
	private static final String QTP_SCRIPT_IN = "script.in";

	private static final String QTP_SCRIPT_OUT = "script.out";

	private static final String QTP_SYSTEM_PATH = "QTP.system.path";
	private static final String QTP_SYSTEM_PATH_SCRIPT = "{#QTPSYSTEMPATH}";

	private static final String GERBER_FILE_DRIVER_PATH = "gerber.file.driver.path";
	private static final String GERBER_FILE_DRIVER_PATH_SCRIPT = "{#driver}";

	private static final String GERBER_FILE_DRIVER = "gerber.file.driver";
	private static final String GERBER_FILE_DRIVER_SCRIPT = "{#dr}";

	private static final String GERBER_FILE_DIR = "gerber.file.dir";
	private static final String GERBER_FILE_DIR_SCRIPT = "{#dirName}";

	private static final String GERBER_FILE = "gerber.file";
	private static final String GERBER_FILE_SCRIPT = "{#fileName}";

	private static final String GERBER_OUT = "gerber.out";
	private static final String GERBER_OUT_SCRIPT = "{#gerberout}";

	private static final String VB_EXE_PATH = "vb.exe.path";

	/* bigBoard props */
	private static final String LEAGUE_BOARD_NUM_SCRIPT = "{#LEAGUE_BOARD_NUM}";
	private static final String WIDTH_SCRIPT = "{#WIDTH}";
	private static final String P_SCRIPT = "{#P}";
	private static final String DY_SCRIPT = "{#DY}";

	private final Properties BIGBOARD_PROPS;

	/**
	 * Loads the gerber configuration and derives the dialog title from the
	 * upper-cased, trimmed driver, directory and file entries joined by
	 * backslashes.
	 * <p>
	 * An {@link IOException} raised while loading the configuration is only
	 * printed, as before; the configuration then stays unset and later calls
	 * that need it will fail.
	 *
	 * @param bigboard_props
	 *            values keyed by the big-board placeholders; may be
	 *            {@code null}, in which case those placeholders are left
	 *            untouched
	 */
	public Generate4YYQTPScript(Properties bigboard_props) {
		super();
		BIGBOARD_PROPS = bigboard_props;

		try {
			GERBER_CONFIG_PROPS = ConfigHelper
					.getConfigProperties(GERBER_CONFIG);
			String title = upperCaseConfigValue(GERBER_FILE_DRIVER) + "\\"
					+ upperCaseConfigValue(GERBER_FILE_DIR) + "\\"
					+ upperCaseConfigValue(GERBER_FILE);
			// Cap the length instead of calling substring(0, 17) blindly, which
			// threw StringIndexOutOfBoundsException for shorter titles.
			GERBER_FORMAT_DIALOG_TITLE = title.substring(0, Math.min(
					title.length(), GERBER_FORMAT_DIALOG_TITLE_MAX_LENGTH));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the trimmed, upper-cased configuration value stored under
	 * {@code key}.
	 */
	private static String upperCaseConfigValue(String key) {
		return GERBER_CONFIG_PROPS.getProperty(key).trim().toUpperCase();
	}

	/**
	 * Sample run: generates the script with a fixed set of big-board values.
	 */
	public static void main(String[] args) throws IOException {
		Properties bigboard_props = new Properties();
		bigboard_props.setProperty(LEAGUE_BOARD_NUM_SCRIPT, String.valueOf(4));
		// Same stored texts as before ("54.0", "2.0", "0.0") without the
		// deprecated new Double(String) constructor.
		bigboard_props.setProperty(WIDTH_SCRIPT, String.valueOf(54.0d));
		bigboard_props.setProperty(P_SCRIPT, String.valueOf(2.0d));
		bigboard_props.setProperty(DY_SCRIPT, String.valueOf(0.0d));

		Generate4YYQTPScript generateQTPScript = new Generate4YYQTPScript(bigboard_props);
		generateQTPScript.generateQTPScript();
//		RuntimeUtil.getInstance().run(generateQTPScript.getVBEXE(), 1, 50000);
	}

	/**
	 * @return the configured driver and directory entries, each trimmed,
	 *         joined by "/"
	 */
	public String getCheckOutFilePath() {
		return GERBER_CONFIG_PROPS.getProperty(GERBER_FILE_DRIVER).trim() + "/"
				+ GERBER_CONFIG_PROPS.getProperty(GERBER_FILE_DIR).trim();
	}

	/**
	 * @return the configured {@code gerber.out} value
	 */
	public String getSavePath() {
		return GERBER_CONFIG_PROPS.getProperty(GERBER_OUT);
	}

	/**
	 * @return the configured {@code vb.exe.path} value
	 */
	public String getVBEXE() {
		return GERBER_CONFIG_PROPS.getProperty(VB_EXE_PATH);
	}

	/**
	 * Generates the QTP script using the {@code script.in} and
	 * {@code script.out} paths from the configuration.
	 * 
	 * @return the generated file
	 * @throws IOException
	 *             if the template cannot be read or the script cannot be
	 *             written
	 */
	public File generateQTPScript() throws IOException {
		return generateQTPScript(GERBER_CONFIG_PROPS
				.getProperty(QTP_SCRIPT_OUT), GERBER_CONFIG_PROPS
				.getProperty(QTP_SCRIPT_IN));
	}

	/**
	 * Substitutes every known placeholder in the template text.
	 * <p>
	 * The substitutions run one after another in a fixed order, so a value
	 * inserted by an earlier step is itself subject to the later ones. A
	 * placeholder whose value is missing is left unchanged.
	 * 
	 * @param source
	 *            template text
	 * @return the text with the placeholders replaced
	 */
	private String scriptConvey(String source) {
		String script = source;
		script = replace(script, GERBER_FORMAT_DIALOG_TITLE_SCRIPT,
				GERBER_FORMAT_DIALOG_TITLE);
		script = replace(script, GERBER_FILE_SCRIPT, GERBER_CONFIG_PROPS
				.getProperty(GERBER_FILE));
		script = replace(script, GERBER_FILE_DRIVER_SCRIPT, GERBER_CONFIG_PROPS
				.getProperty(GERBER_FILE_DRIVER));
		script = replace(script, GERBER_OUT_SCRIPT, GERBER_CONFIG_PROPS
				.getProperty(GERBER_OUT));
		script = replace(script, GERBER_FILE_DIR_SCRIPT, GERBER_CONFIG_PROPS
				.getProperty(GERBER_FILE_DIR));
		script = replace(script, GERBER_FILE_DRIVER_PATH_SCRIPT,
				GERBER_CONFIG_PROPS.getProperty(GERBER_FILE_DRIVER_PATH));
		script = replace(script, QTP_SYSTEM_PATH_SCRIPT, GERBER_CONFIG_PROPS
				.getProperty(QTP_SYSTEM_PATH));

		if (this.BIGBOARD_PROPS != null) {
			script = replace(script, DY_SCRIPT, this.BIGBOARD_PROPS
					.getProperty(DY_SCRIPT));
			script = replace(script, WIDTH_SCRIPT, this.BIGBOARD_PROPS
					.getProperty(WIDTH_SCRIPT));
			script = replace(script, LEAGUE_BOARD_NUM_SCRIPT,
					this.BIGBOARD_PROPS.getProperty(LEAGUE_BOARD_NUM_SCRIPT));
			script = replace(script, P_SCRIPT, this.BIGBOARD_PROPS
					.getProperty(P_SCRIPT));
		}

		return script;
	}

	/**
	 * Generates the QTP script.
	 * 
	 * @param target
	 *            target file
	 * @param source
	 *            source (template) file
	 * @return the generated file
	 * @throws IOException
	 *             if the template cannot be read or the script cannot be
	 *             written
	 */
	public File generateQTPScript(File target, File source) throws IOException {
		return generateQTPScript(target.getAbsolutePath(), source
				.getAbsolutePath());
	}

	/**
	 * Generates the QTP script: reads the template, substitutes the
	 * placeholders and writes the result as UTF-8 with a leading BOM,
	 * replacing any previous content of the target.
	 * 
	 * @param target
	 *            target file path; missing parent directories are created
	 * @param source
	 *            source (template) file path; an empty path yields an empty
	 *            script
	 * @return the generated file
	 * @throws IOException
	 *             if the template cannot be read or the script cannot be
	 *             written
	 */
	public File generateQTPScript(String target, String source)
			throws IOException {
		File f = new File(target);
		File parent = f.getParentFile();
		// getParentFile() is null for a bare file name; FileOutputStream
		// creates the file itself, so only the directories need preparing.
		if (!f.exists() && parent != null) {
			parent.mkdirs();
		}

		// Build the content before opening the target so that a failure while
		// reading the template does not leave a truncated script behind.
		String script = scriptConvey(getSourceFileContentReader(source));

		// try-with-resources flushes and closes on every path, including errors.
		try (FileOutputStream fos = new FileOutputStream(f);
				BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
						fos, ENCODING))) {
			// The BOM goes straight to the stream before any text reaches the
			// buffered writer, so it is guaranteed to come first in the file.
			fos.write(UTF8_BOM);
			bw.write(script);
		}
		return f;
	}

	/**
	 * Reads everything from the reader into a string. Lines are read one by
	 * one and each is terminated with {@code "\n"}, so the original line
	 * separators are not preserved and the result ends with a newline whenever
	 * at least one line was read.
	 * 
	 * @param source
	 *            reader to drain; not closed by this method
	 * @return the text that was read
	 * @throws IOException
	 *             if reading fails
	 */
	private String reader2String(Reader source) throws IOException {
		BufferedReader bufferedReader = new BufferedReader(source);
		StringBuilder result = new StringBuilder();
		String line;
		while ((line = bufferedReader.readLine()) != null) {
			result.append(line).append('\n');
		}
		return result.toString();
	}

	/**
	 * Opens a reader on the given file.
	 * 
	 * @param source
	 *            file path
	 * @return a reader on the file, or {@code null} for an empty path
	 * @throws IOException
	 *             if the file cannot be opened
	 */
	private Reader getReader(String source) throws IOException {
		InputStream in = getInputStream(source);
		// NOTE(review): the template is decoded with the platform default
		// charset although the output is written as UTF-8; kept as is because
		// the template's real encoding cannot be determined from this file.
		return in == null ? null : new InputStreamReader(in);
	}

	/**
	 * Gets the script template content as a string.
	 * 
	 * @param source
	 *            file path
	 * @return the file content, or {@code ""} for an empty path
	 * @throws IOException
	 *             if the file cannot be opened or read
	 */
	private String getSourceFileContentReader(String source) throws IOException {
		// isEmpty() compares content; the former source == "" compared
		// references and missed empty strings created at run time.
		if (source.isEmpty()) {
			return "";
		}
		// Close the underlying file once the content has been read.
		try (Reader reader = getReader(source)) {
			return reader2String(reader);
		}
	}

	/**
	 * Opens an input stream on the given file.
	 * 
	 * @param source
	 *            file path
	 * @return a stream on the file, or {@code null} for an empty path
	 * @throws IOException
	 *             if the file cannot be opened
	 */
	private InputStream getInputStream(String source) throws IOException {
		return source.isEmpty() ? null : new FileInputStream(new File(source));
	}

	/**
	 * Replaces all occurrences of a substring within a string with another
	 * string. Both patterns are taken literally (no regular expressions).
	 * 
	 * @param inString
	 *            String to examine
	 * @param oldPattern
	 *            String to replace
	 * @param newPattern
	 *            String to insert
	 * @return a String with the replacements; {@code inString} itself when it
	 *         or {@code oldPattern} is null or empty, or when
	 *         {@code newPattern} is null
	 */
	private String replace(String inString, String oldPattern, String newPattern) {
		if (!hasLength(inString) || !hasLength(oldPattern)
				|| newPattern == null) {
			return inString;
		}
		return inString.replace(oldPattern, newPattern);
	}

	/**
	 * @return {@code true} when the string is neither null nor empty
	 */
	private boolean hasLength(String str) {
		return str != null && str.length() > 0;
	}
}

你可能感兴趣的:(utf-8)