Java实现简体中文转繁体中文的工具(包括编码转换和语义转换)

简体中文转繁体中文的工具,包括:1、编码转换(GBK -> Big5);2、语义转换(根据词库进行词组替换,需要词库的请通过 Email 联系我)

package i18n.converter;



import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;



/**
 * Converts Simplified Chinese text to Traditional Chinese.
 *
 * <p>Two things happen during a conversion: the wording is rewritten using a
 * dictionary of single characters and phrases (longest match first), and, for
 * the file-based entry points, the bytes are re-encoded from GBK to Big5.
 *
 * <p>The dictionary is read once, in the constructor, from the classpath
 * resources named in {@link #dictFiles}.
 *
 * Author: [email protected]
 */
public class Gbk2Big5Converter {

	/** Simplified-to-Traditional mapping, keyed by the source character or phrase. */
	protected Hashtable<String, String> s2thash = new Hashtable<String, String>();

	/**
	 * Resource names of the mapping files, resolved relative to this class.
	 * They are loaded in array order, so an entry in a later file replaces
	 * the entry for the same key from an earlier file.
	 */
	static String[] dictFiles = new String[] { "mappings_gbk2big5_1-1.txt",
			"mappings_gbk2big5_phrase.txt" };

	/**
	 * Unused by this class; retained only so that existing same-package code
	 * referring to the field keeps compiling.
	 */
	String dataline;

	/** Creates a converter and loads the dictionary via {@link #getHashDict()}. */
	public Gbk2Big5Converter() {
		s2thash = getHashDict();
	}

	/**
	 * Builds the Simplified-to-Traditional mapping from the GBK-encoded
	 * dictionary resources listed in {@link #dictFiles}.
	 *
	 * <p>Each useful line has the form {@code source,target}. Lines shorter
	 * than three characters, lines starting with {@code #} and lines whose
	 * first comma is missing or at position 0 are skipped. Both sides are
	 * trimmed before being stored.
	 *
	 * <p>Loading is best-effort: a resource that is missing or unreadable is
	 * reported on standard error and skipped, and the remaining resources are
	 * still loaded.
	 *
	 * @return a new table holding every mapping that could be read; never {@code null}
	 */
	public Hashtable<String, String> getHashDict() {
		Hashtable<String, String> hashDict = new Hashtable<String, String>();

		for (String filename : dictFiles) {
			Closeable toClose = null;
			try {
				InputStream dictStream = getClass().getResourceAsStream(filename);
				if (dictStream == null) {
					// getResourceAsStream signals "not found" with null, not an exception.
					System.err.println("Dictionary resource not found: " + filename);
					continue;
				}
				toClose = dictStream;

				BufferedReader br = new BufferedReader(
						new InputStreamReader(dictStream, "gbk"));
				toClose = br;

				String line;
				while ((line = br.readLine()) != null) {
					if (line.length() < 3 || line.charAt(0) == '#') {
						continue;
					}
					int idx = line.indexOf(",");
					if (idx > 0) {
						String src = line.substring(0, idx).trim();
						String tgt = line.substring(idx + 1).trim();
						// put() replaces any earlier mapping for the same key.
						hashDict.put(src, tgt);
					}
				}
			} catch (Exception ex) {
				// Best-effort: report and move on to the next dictionary file.
				System.err.println("Failed to load dictionary " + filename + ": " + ex);
			} finally {
				closeQuietly(toClose);
			}
		}

		return hashDict;
	}

	/**
	 * Returns a copy of {@code inline} with every dictionary match replaced.
	 *
	 * @param inline the text to convert; must not be {@code null}
	 * @return the converted text
	 * @see #convertStringBuffer(StringBuffer)
	 */
	public String convertString(String inline) {
		StringBuffer outline = new StringBuffer(inline);
		convertStringBuffer(outline);
		return outline.toString();
	}

	/**
	 * Replaces dictionary matches inside {@code dataline}, in place.
	 *
	 * <p>The buffer is scanned from left to right. ASCII characters are never
	 * looked up. At any other position the longest substring starting there
	 * that has a dictionary entry is replaced, and scanning resumes right
	 * after the replacement, so replaced text is not converted again. Every
	 * candidate length from "rest of the buffer" down to one character is
	 * tried, so the cost grows quickly with the length of the buffer.
	 *
	 * @param dataline the buffer to convert; must not be {@code null}
	 */
	public void convertStringBuffer(StringBuffer dataline) {
		int position = 0;
		while (position < dataline.length()) {
			// ASCII text cannot start a dictionary match; decide once per position.
			if (dataline.charAt(position) < 0x80) {
				position++;
				continue;
			}

			String replacement = null;
			int matchedLength = 0;
			for (int length = dataline.length() - position; length >= 1; length--) {
				String candidate = s2thash.get(
						dataline.substring(position, position + length));
				if (candidate != null) {
					// Longest match found; shorter candidates are not considered.
					replacement = candidate;
					matchedLength = length;
					break;
				}
			}

			if (replacement == null) {
				position++;
			} else {
				dataline.replace(position, position + matchedLength, replacement);
				// Continue after the inserted text (which may be empty).
				position += replacement.length();
			}
		}
	}

	/**
	 * Converts a GBK-encoded file, or every file below a directory, and writes
	 * the result Big5-encoded to the corresponding target path.
	 *
	 * <p>Directories are walked breadth-first and mirrored under
	 * {@code targetdir}; missing target directories are created. A source
	 * that does not exist, a target directory that cannot be created, or a
	 * file that fails to convert is reported and skipped; the remaining
	 * entries are still processed.
	 *
	 * @param sourcedir path of the source file or directory (GBK)
	 * @param targetdir path of the target file or directory (Big5)
	 */
	public void convertFile(String sourcedir, String targetdir) {
		// Legacy numeric encoding id; it is only echoed in the progress message.
		final int sourceEncoding = 0;

		// The two vectors act as parallel work queues: index i of one
		// corresponds to index i of the other, and directories append their
		// children while the loop is running.
		Vector<String> inputfiles = new Vector<String>();
		Vector<String> outputfiles = new Vector<String>();
		inputfiles.add(sourcedir);
		outputfiles.add(targetdir);

		for (int i = 0; i < inputfiles.size(); i++) {
			String inputPath = inputfiles.get(i);
			String outputPath = outputfiles.get(i);

			File source = new File(inputPath);
			if (!source.exists()) {
				System.out.println("ERROR: Source file " + inputPath
						+ " does not exist.");
				continue;
			}

			if (source.isDirectory()) {
				File targetDirectory = new File(outputPath);
				// mkdirs() also creates missing parents, which mkdir() would not.
				if (!targetDirectory.exists() && !targetDirectory.mkdirs()) {
					System.out.println("ERROR: Cannot create target directory "
							+ outputPath);
					continue;
				}
				String[] children = source.list();
				if (children != null) {
					for (String child : children) {
						inputfiles.add(inputPath + File.separator + child);
						outputfiles.add(outputPath + File.separator + child);
					}
				}
				continue;
			}

			System.out.println("Converting " + inputPath + " to " + outputPath
					+ " with encoding " + sourceEncoding);

			BufferedReader srcbuffer = null;
			BufferedWriter outbuffer = null;
			try {
				srcbuffer = new BufferedReader(new InputStreamReader(
						new FileInputStream(inputPath), "gbk"));
				outbuffer = new BufferedWriter(new OutputStreamWriter(
						new FileOutputStream(outputPath), "big5"));
				convertLines(srcbuffer, outbuffer);
			} catch (Exception ex) {
				// Best-effort: one bad file must not stop the rest of the batch.
				System.err.println(ex);
			} finally {
				closeQuietly(srcbuffer);
				closeQuietly(outbuffer);
			}
		}
	}

	/**
	 * Converts {@code inputString} and writes the result, Big5-encoded, to the
	 * file {@code temp.txt} in the current working directory.
	 *
	 * <p>The string is read line by line directly from memory, so the result
	 * does not depend on the platform default charset. A failure while
	 * writing is reported on standard error; the file reference is returned
	 * either way.
	 *
	 * @param inputString the text to convert; must not be {@code null}
	 * @return the file the converted text was written to
	 */
	public File convertSimpleString(String inputString) {
		File result = new File("temp.txt");

		// Created outside the try block so a null argument still fails fast.
		BufferedReader srcbuffer = new BufferedReader(new StringReader(inputString));
		BufferedWriter outbuffer = null;
		try {
			outbuffer = new BufferedWriter(
					new OutputStreamWriter(new FileOutputStream(result), "big5"));
			convertLines(srcbuffer, outbuffer);
		} catch (Exception e) {
			System.err.println("Failed to write " + result + ": " + e);
		} finally {
			closeQuietly(srcbuffer);
			closeQuietly(outbuffer);
		}
		return result;
	}

	/**
	 * Tells whether {@code inStr} consists only of ASCII characters, i.e.
	 * characters that occupy a single byte. Chinese characters never qualify.
	 *
	 * <p>The check looks at the character values themselves, so the answer
	 * does not depend on the platform default charset.
	 *
	 * @param inStr the text to inspect; must not be {@code null}
	 * @return {@code true} if every character is below U+0080 (also for the
	 *         empty string), {@code false} otherwise
	 */
	public static boolean isSingleByte(String inStr) {
		for (int i = 0; i < inStr.length(); i++) {
			if (inStr.charAt(i) >= 0x80) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Prints this converter's dictionary to standard output, one
	 * {@code source,target} pair per line, longest source keys first.
	 */
	public void printDict() {
		int longestKey = 0;
		for (String key : s2thash.keySet()) {
			longestKey = Math.max(longestKey, key.length());
		}

		for (int length = longestKey; length > 0; length--) {
			for (String key : s2thash.keySet()) {
				if (key.length() == length) {
					System.out.print(key);
					System.out.println("," + s2thash.get(key));
				}
			}
		}
	}

	/**
	 * Sample entry point: converts one properties file under {@code src}.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		Gbk2Big5Converter aGbk2Big5Converter = new Gbk2Big5Converter();
		String src = "src//resource_zh_CN.properties.org";
		String tgt = "src//resource_zh_TW.properties.org";

		System.out.println(new File(src).getAbsolutePath());
		aGbk2Big5Converter.convertFile(src, tgt);
	}

	/** Copies {@code in} to {@code out} line by line, converting each line. */
	private void convertLines(BufferedReader in, BufferedWriter out)
			throws IOException {
		String line;
		while ((line = in.readLine()) != null) {
			out.write(convertString(line));
			out.newLine();
		}
		// Flush here so a failed write surfaces as an exception instead of
		// being lost when the writer is later closed quietly.
		out.flush();
	}

	/** Closes {@code resource} if it is non-null, ignoring any close failure. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done about a failed close at this point.
		}
	}

}

你可能感兴趣的:(Java,Utils,Encoding)