1问题描述
最近有人问我问题,说在Java中以UTF-8编码读UTF-8编码的文件test.htm,存入oracle数据库后再读出,一切正常。在Java中UTF-8编码读UTF-8编码的文件test.htm,存入kingbase数据库后再读出,文件中出现乱码,不知为何。
2具体现象
具体现象如下:
Java中读UTF-8编码的文件test.htm,存入数据库后再读出,文件中出现乱码
原文件图如下:
3 存入数据库再读出后出现乱码如下:
注:前面两幅图中“合资公司双方出资比例的问题”前面的那个“・”变成了“ ”。
4 相关代码
下面是模拟这个过程的读写文件和写读数据库的java代码:
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.sql.Clob;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
public class TestEoding {
public static void main(String[] args) throws Exception {
//create table and insert 1 recode
//oracle,kingbase
// create table t_test (id int,txt clob);
// insert into t_test (id,txt) values(1,'');
//pg
// create table t_test (id int,txt text);
// insert into t_test (id,txt) values(1,'');
//kingbase
String driver = "com.kingbase.Driver";
String url = "jdbc:kingbase://127.0.0.1:54321/BEIGANG?ClientEncoding=UNICODE";
String usr = "BEIGANG";
String passwd = "beigang";
String fn = "E:/test.htm";
/*
//pg
String driver = "org.postgresql.Driver";
String url = "jdbc:postgresql://127.0.0.1:5432/beigang";
String usr = "beigang";
String passwd = "beigang";
String fn = "E:/test.htm";
*/
if (args.length > 0) {
fn = args[0];
}
String value = getValue(fn);
testDB(driver, url, usr, passwd, fn + ".kb", value);
/*
//sqlserver
driver = "net.sourceforge.jtds.jdbc.Driver";
url = "jdbc:jtds:sqlserver://127.0.0.1:1433/beigang";
usr = "beigang";
passwd = "beigang";
testDB2(driver, url, usr, passwd, fn + ".mssql", value);
driver = "oracle.jdbc.driver.OracleDriver";
url = "jdbc:oracle:thin:@127.0.0.1:1521:beigang";
usr = "beigang";
passwd = "beigang";
testDB2(driver, url, usr, passwd, fn + ".oracle", value);
*/
}
protected static void testDB(String driver, String url, String usr,
String passwd, String fn, String value) throws SQLException,
Exception {
Class.forName(driver);
Connection conn = DriverManager.getConnection(url, usr, passwd);
String sql = "update t_test set txt = ? where id=1";
PreparedStatement stmt = conn.prepareStatement(sql);
stmt.setString(1, value);
stmt.executeUpdate();
stmt.close();
sql = "select txt from t_test where id=1";
stmt = conn.prepareStatement(sql);
ResultSet rs = stmt.executeQuery();
if (rs.next()) {
writeFile(fn, rs.getString(1));
}
/*
* pg
String sql = "update t_test set txt = '" +value +"' where id=1";
ResultSet rs;
stmt.executeUpdate(sql);
sql = "select txt from t_test where id=1";
rs = stmt.executeQuery(sql);
if (rs.next()) {
writeFile(fn, rs.getString(1));
}
*/
}
protected static void testDB2(String driver, String url, String usr,
String passwd, String fn, String value) throws SQLException,
Exception {
Class.forName(driver);
Connection conn = DriverManager.getConnection(url, usr, passwd);
String sql = "update t_test set txt = ? where id=1";
PreparedStatement stmt = conn.prepareStatement(sql);
stmt.setString(1, value);
stmt.executeUpdate();
stmt.close();
sql = "select txt from t_test where id=1";
stmt = conn.prepareStatement(sql);
ResultSet rs = stmt.executeQuery();
if (rs.next()) {
writeFile(fn, rs.getString(1));
}
}
private static void writeFile(String fn, String string) throws Exception {
FileOutputStream fos = new FileOutputStream(fn);
byte[] content = string.getBytes("UTF-8");
fos.write(content);
fos.flush();
fos.close();
}
private static String getValue(String fn) throws Exception {
ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
FileInputStream fins = new FileInputStream(fn);
int b = -1;
while ((b = fins.read()) != -1) {
baos.write(b);
}
return baos.toString("UTF-8");
//return baos.toString("GBK");
}
}
4 结论
结果测试确认,发现oracle、kingbase等数据库中存在同样问题,问题原因是编码"UTF8"的字符0xe383bb(就是“・”)在"GBK"没有相对应物。刚开始说的oracle不出现乱码,而kingbase出现乱码是他搞错了数据库服务器端编码,oracle用来UTF-8的编码,kingbase用了GBK的编码导致。
根本原因是"UTF8"编码的字符0xe383bb(就是" ")在"GBK"没有相对应物。
5 数据库字符集和编码转换
参见我的博客http://blog.csdn.net/beiigang/article/details/7051605
6
把test.htm传上来,压缩文件使test.rar,有兴趣可以实践下
看了一下,不知道怎么把附件传上来,不知道这儿有这个功能没