Shapefile 是一种常用的矢量数据格式,保存了地理数据的坐标和属性信息。
ArcGIS 10.2.1之前,Shapefile的编码默认为本地编码,国内默认为GBK,这导致在导入国外提供的shp数据时会出现乱码。
Shapefile 在 .dbf 和 .cpg 中有存储编码页信息,可以通过解析这两个文件自动识别编码页,以修正乱码问题。
Shapefile 由多个文件构成,最基本的是 .shp 、.shx、.dbf 文件。
可选的文件如下:
.dbf 文件头结构如下:
.dbf 文件头中偏移量为 29 的字节(从 0 开始计数)是 Language Driver ID(LDID),它与编码页的对应关系参见以下链接:
http://shapelib.maptools.org/codepage.html?d=1563413688103
节选部分如下:
.cpg中明文存储文件编码,例如:windows-1251。
解析编码页时,优先使用.dbf中的Language Driver ID,若无则取.cpg文件中的编码。
/// <summary>
/// Resolves the text encoding of the .dbf attribute table from its Language Driver ID (LDID).
/// For LDID 0x00 and 0x57 the encoding is read from the .cpg file that sits next to
/// <c>_fileName</c>; every other known LDID maps to a fixed code page.
/// Code page table: http://shapelib.maptools.org/codepage.html?d=1563413688103
/// </summary>
/// <param name="languageDriverID">Language Driver ID byte taken from the .dbf header.</param>
/// <returns>
/// The resolved encoding. UTF-8 is returned for unknown IDs and whenever resolution throws.
/// When the .cpg file is missing, empty or unusable, UTF-8 is returned for 0x00 and
/// <see cref="Encoding.Default"/> for 0x57.
/// </returns>
private System.Text.Encoding GetEncoding(byte languageDriverID)
{
    try
    {
        switch (languageDriverID)
        {
            case 0x00:
            case 0x57:
                {
                    // 0x00: use the .cpg encoding, otherwise UTF-8.
                    // 0x57: use the .cpg encoding, otherwise the platform default encoding.
                    // The path is built here (outside the helper's try) so that a failure while
                    // building it still ends up in the outer catch, exactly as before.
                    string cpgPath = Path.Combine(
                        Path.GetDirectoryName(_fileName),
                        Path.GetFileNameWithoutExtension(_fileName) + ".cpg");
                    Encoding fallback = languageDriverID == 0x00 ? Encoding.UTF8 : Encoding.Default;
                    return ReadCpgEncoding(cpgPath, fallback);
                }
            case 0x01:
            case 0x09:
            case 0x0B:
            case 0x0D:
            case 0x0F:
            case 0x11:
            case 0x15:
            case 0x18:
            case 0x19:
            case 0x1B:
                return Encoding.GetEncoding(437); // IBM437
            case 0x02:
            case 0x0A:
            case 0x0E:
            case 0x10:
            case 0x12:
            case 0x14:
            case 0x16:
            case 0x1A:
            case 0x1D:
            case 0x25:
            case 0x37:
                return Encoding.GetEncoding(850); // ibm850
            case 0x03:
            case 0x58:
            case 0x59:
                return Encoding.GetEncoding(1252); // Windows-1252
            case 0x04:
                return Encoding.GetEncoding(10000); // macintosh
            case 0x08:
            case 0x17:
            case 0x66:
                return Encoding.GetEncoding(865); // IBM865
            case 0x13:
            case 0x7B:
                return Encoding.GetEncoding(932); // shift_jis (code page 932 is not iso-2022-jp)
            case 0x1C:
            case 0x6C:
                return Encoding.GetEncoding(863); // IBM863
            case 0x1F:
            case 0x22:
            case 0x23:
            case 0x40:
            case 0x64:
            case 0x87:
                return Encoding.GetEncoding(852); // ibm852
            case 0x24:
                return Encoding.GetEncoding(860); // IBM860
            case 0x26:
            case 0x65:
                return Encoding.GetEncoding(866); // cp866
            case 0x4D:
            case 0x7A:
                return Encoding.GetEncoding(936); // gb2312
            case 0x4E:
            case 0x79:
                return Encoding.GetEncoding(949); // ks_c_5601-1987
            case 0x4F:
            case 0x78:
                return Encoding.GetEncoding(950); // big5
            case 0x50:
            case 0x7C:
                return Encoding.GetEncoding(874); // windows-874
            case 0x67:
                return Encoding.GetEncoding(861); // ibm861
            case 0x68:
                // Noted by the original author as not available on the system; if the lookup
                // throws, the outer catch below returns UTF-8.
                return Encoding.GetEncoding(895);
            case 0x69:
                // Same situation as 0x68: not available on the system, outer catch yields UTF-8.
                return Encoding.GetEncoding(620);
            case 0x6A:
            case 0x86:
                return Encoding.GetEncoding(737); // ibm737
            case 0x6B:
            case 0x88:
                return Encoding.GetEncoding(857); // ibm857
            case 0x96:
                return Encoding.GetEncoding(10007); // x-mac-cyrillic
            case 0x97:
                return Encoding.GetEncoding(10029); // x-mac-ce
            case 0x98:
                return Encoding.GetEncoding(10006); // x-mac-greek
            case 0xC8:
                return Encoding.GetEncoding(1250); // windows-1250
            case 0xC9:
                return Encoding.GetEncoding(1251); // windows-1251
            case 0xCA:
                return Encoding.GetEncoding(1254); // windows-1254
            case 0xCB:
                return Encoding.GetEncoding(1253); // windows-1253
            case 0xCC:
                return Encoding.GetEncoding(1257); // windows-1257
            default:
                return Encoding.UTF8;
        }
    }
    catch (Exception)
    {
        // Any failure (unsupported code page, invalid path, ...) degrades to UTF-8.
        return Encoding.UTF8;
    }
}

/// <summary>
/// Reads the encoding declared on the first line of a .cpg file.
/// </summary>
/// <param name="cpgPath">Path of the .cpg file; it does not have to exist.</param>
/// <param name="fallback">Encoding returned when the file is missing, empty or unusable.</param>
/// <returns>The declared encoding, or <paramref name="fallback"/>.</returns>
private static Encoding ReadCpgEncoding(string cpgPath, Encoding fallback)
{
    if (!File.Exists(cpgPath))
    {
        return fallback;
    }

    try
    {
        string declared;
        using (StreamReader reader = new StreamReader(cpgPath))
        {
            // Trim so that trailing spaces or a stray '\r' do not make the name lookup fail.
            declared = (reader.ReadLine() ?? string.Empty).Trim();
        }

        if (declared.Length == 0)
        {
            return fallback;
        }

        // A .cpg file may presumably hold a bare code page number (e.g. "936") instead of
        // a name (e.g. "windows-1251"); resolve digits-only content by number.
        int codePage;
        if (int.TryParse(
                declared,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out codePage))
        {
            return Encoding.GetEncoding(codePage);
        }

        return Encoding.GetEncoding(declared);
    }
    catch (Exception)
    {
        // Unreadable file or unknown encoding name/number: keep the caller's default.
        return fallback;
    }
}
/**
 * Resolves the charset name of the .dbf attribute table from its Language Driver ID (LDID).
 * For LDID 0x00 and 0x57 the name is read from the .cpg file next to {@code _fileName};
 * every other known LDID maps to a fixed charset name.
 * Code page table: http://shapelib.maptools.org/codepage.html?d=1563413688103
 *
 * @param languageDriverID Language Driver ID byte taken from the .dbf header
 * @return the charset name; {@code ExportParameters.ENCODING_UTF8} for unknown IDs,
 *         for a missing/empty/unreadable .cpg file, and whenever resolution throws
 */
private String GetEncoding(byte languageDriverID) {
    try {
        switch (languageDriverID) {
            case 0x00:
            case 0x57:
                // Both IDs defer to the .cpg file and fall back to UTF-8 in this implementation.
                return readCpgEncoding();
            case 0x01:
            case 0x09:
            case 0x0B:
            case 0x0D:
            case 0x0F:
            case 0x11:
            case 0x15:
            case 0x18:
            case 0x19:
            case 0x1B:
                return "IBM437";
            case 0x02:
            case 0x0A:
            case 0x0E:
            case 0x10:
            case 0x12:
            case 0x14:
            case 0x16:
            case 0x1A:
            case 0x1D:
            case 0x25:
            case 0x37:
                return "ibm850";
            case 0x03:
            case 0x58:
            case 0x59:
                return "Windows-1252";
            case 0x04:
                return "macintosh";
            case 0x08:
            case 0x17:
            case 0x66:
                return "IBM865";
            case 0x13:
            case 0x7B:
                // Code page 932 is Shift-JIS; ISO-2022-JP is a different, escape-based encoding.
                return "Shift_JIS";
            case 0x1C:
            case 0x6C:
                return "IBM863";
            case 0x1F:
            case 0x22:
            case 0x23:
            case 0x40:
            case 0x64:
            case (byte) 0x87:
                return "ibm852";
            case 0x24:
                return "IBM860";
            case 0x26:
            case 0x65:
                return "cp866";
            case 0x4D:
            case 0x7A:
                // Code page 936 is GBK, a superset of GB2312; a strict GB2312 decoder
                // cannot decode GBK-only characters.
                return "GBK";
            case 0x4E:
            case 0x79:
                return "ks_c_5601-1987";
            case 0x4F:
            case 0x78:
                return "big5";
            case 0x50:
            case 0x7C:
                return "windows-874";
            case 0x67:
                return "ibm861";
            case 0x68:
                // Code page 895: noted as not available on the system, so use UTF-8.
                return ExportParameters.ENCODING_UTF8;
            case 0x69:
                // Code page 620: noted as not available on the system, so use UTF-8.
                return ExportParameters.ENCODING_UTF8;
            case 0x6A:
            case (byte) 0x86:
                return "ibm737";
            case 0x6B:
            case (byte) 0x88:
                return "ibm857";
            case (byte) 0x96:
                return "x-mac-cyrillic";
            case (byte) 0x97:
                return "x-mac-ce";
            case (byte) 0x98:
                return "x-mac-greek";
            case (byte) 0xC8:
                return "windows-1250";
            case (byte) 0xC9:
                return "windows-1251";
            case (byte) 0xCA:
                return "windows-1254";
            case (byte) 0xCB:
                return "windows-1253";
            case (byte) 0xCC:
                return "windows-1257";
            default:
                return ExportParameters.ENCODING_UTF8;
        }
    } catch (Exception ex) {
        // Any failure (e.g. a file name too short to carry an extension) degrades to UTF-8.
        return ExportParameters.ENCODING_UTF8;
    }
}

/**
 * Reads the charset name declared on the first line of the .cpg file that shares its base
 * name with {@code _fileName} (the last three characters of the path are replaced by "cpg").
 *
 * @return the trimmed first line, or {@code ExportParameters.ENCODING_UTF8} when the file
 *         is missing, blank or cannot be read
 */
private String readCpgEncoding() {
    File cpgFile = new File(_fileName.substring(0, _fileName.length() - 3) + "cpg");
    if (!cpgFile.exists()) {
        return ExportParameters.ENCODING_UTF8;
    }
    // try-with-resources closes the streams on every path. ISO-8859-1 maps each byte to one
    // char, which matches what the deprecated DataInputStream.readLine() used to do.
    try (FileInputStream in = new FileInputStream(cpgFile);
         java.io.BufferedReader reader = new java.io.BufferedReader(
                 new java.io.InputStreamReader(in, "ISO-8859-1"))) {
        String firstLine = reader.readLine();
        // Trim before the emptiness check so a whitespace-only line is treated as "no value".
        String declared = firstLine == null ? "" : firstLine.trim();
        if (declared.length() == 0) {
            return ExportParameters.ENCODING_UTF8;
        }
        return declared;
    } catch (Exception ex) {
        return ExportParameters.ENCODING_UTF8;
    }
}
/*
 * Maps a .dbf Language Driver ID to an encoding name and copies that name into result.
 *
 * encodingCode: Language Driver ID byte from the .dbf header.
 * result:       output buffer that receives the NUL-terminated encoding name.
 * cpgPath:      path of the .cpg file, consulted only for IDs 0x00 and 0x57.
 *
 * NOTE(review): the names written here look like iconv encoding names - confirm against
 * the consumer of result.
 * NOTE(review): no buffer size is passed in. The longest literal written below is
 * "MacCentralEurope" (16 chars + NUL); names coming from getCpgEncode/getSystemEncode go
 * through a 20-byte local buffer, and whether those helpers respect that size cannot be
 * verified from here - confirm.
 */
void getEncoding(unsigned char encodingCode, char *result, const char * cpgPath){
    switch (encodingCode)
    {
    /* 0x00: use the encoding named in the .cpg file, otherwise UTF-8. */
    /* 0x57: use the encoding named in the .cpg file, otherwise the system encoding. */
    case 0x00:
    case 0x57:
    {
        char encode[20] = { '\0' };
        /* A return value of 1 is treated as "encode now holds the .cpg encoding name". */
        int cpgResult = getCpgEncode(cpgPath, encode);
        if (cpgResult == 1)
        {
            strcpy(result, encode);
        }
        else if (encodingCode == 0x00)
        {
            strcpy(result, "UTF-8");
        }
        else
        {
            getSystemEncode(encode);
            strcpy(result, encode);
        }
        break;
    }
    case 0x01:
    case 0x09:
    case 0x0B:
    case 0x0D:
    case 0x0F:
    case 0x11:
    case 0x15:
    case 0x18:
    case 0x19:
    case 0x1B:
        strcpy(result, "CP437");
        break;
    case 0x02:
    case 0x0A:
    case 0x0E:
    case 0x10:
    case 0x12:
    case 0x14:
    case 0x16:
    case 0x1A:
    case 0x1D:
    case 0x25:
    case 0x37:
        strcpy(result, "CP850");
        break;
    case 0x03:
    case 0x58:
    case 0x59:
        strcpy(result, "CP1252");
        break;
    case 0x04:
        strcpy(result, "Macintosh");
        break;
    case 0x08:
    case 0x17:
    case 0x66:
        strcpy(result, "CP865"); /* IBM865 */
        break;
    case 0x13:
    case 0x7B:
        /* Code page 932 is Shift-JIS; ISO-2022-JP is a different, escape-based encoding. */
        strcpy(result, "CP932");
        break;
    case 0x1C:
    case 0x6C:
        strcpy(result, "CP863"); /* IBM863 */
        break;
    case 0x1F:
    case 0x22:
    case 0x23:
    case 0x40:
    case 0x64:
    case 0x87:
        strcpy(result, "CP852"); /* ibm852 */
        break;
    case 0x24:
        strcpy(result, "CP860");
        break;
    case 0x26:
    case 0x65:
        strcpy(result, "CP866"); /* cp866 */
        break;
    case 0x4D:
    case 0x7A:
        strcpy(result, "GB18030"); /* code page 936 (gb2312 / GBK) */
        break;
    case 0x4E:
    case 0x79:
        /* Code page 949 (ks_c_5601-1987); ISO-2022-KR is a different, escape-based encoding. */
        strcpy(result, "CP949");
        break;
    case 0x4F:
    case 0x78:
        strcpy(result, "BIG5"); /* big5 */
        break;
    case 0x50:
    case 0x7C:
        strcpy(result, "CP874"); /* windows-874 */
        break;
    case 0x67:
        strcpy(result, "CP861"); /* ibm861 */
        break;
    case 0x68:
        strcpy(result, "UTF-8"); /* code page not available on the system */
        break;
    case 0x69:
        strcpy(result, "UTF-8"); /* code page not available on the system */
        break;
    case 0x6A:
    case 0x86:
        strcpy(result, "CP737"); /* ibm737 */
        break;
    case 0x6B:
    case 0x88:
        strcpy(result, "CP857"); /* ibm857 */
        break;
    case 0x96:
        strcpy(result, "MacCyrillic"); /* x-mac-cyrillic */
        break;
    case 0x97:
        strcpy(result, "MacCentralEurope"); /* x-mac-ce */
        break;
    case 0x98:
        strcpy(result, "MacGreek"); /* x-mac-greek */
        break;
    case 0xC8:
        strcpy(result, "CP1250"); /* windows-1250 */
        break;
    case 0xC9:
        strcpy(result, "CP1251"); /* windows-1251 */
        break;
    case 0xCA:
        strcpy(result, "CP1254"); /* windows-1254 */
        break;
    case 0xCB:
        strcpy(result, "CP1253"); /* windows-1253 */
        break;
    case 0xCC:
        strcpy(result, "CP1257"); /* windows-1257 */
        break;
    default:
        /* Unknown Language Driver ID. */
        strcpy(result, "UTF-8");
        break;
    }
}