Shapefile 属性编码自动识别(C#,Java,C)

Shapefile 是一种常用的矢量数据格式,保存了地理数据的坐标和属性信息。

ArcGIS 10.2.1之前,Shapefile的编码默认为本地编码,国内默认为GBK,这导致在导入国外提供的shp数据时会出现乱码。

Shapefile 在 .dbf 和 .cpg 中有存储编码页信息,可以通过解析这两个文件自动识别编码页,以修正乱码问题。

Shapefile 构成

Shapefile 由多个文件构成,最基本的是 .shp、.shx、.dbf 三个文件。

  • .shp:存储地理数据的坐标信息。
  • .shx:存储地理数据的位置索引,记录每个地理数据在shp文件中的位置,能够快速定位数据。
  • .dbf:存储地理数据的属性信息,以dBase IV的数据表格式存储。

可选的文件如下:

  • .shp.xml:以xml格式保存元数据。
  • .prj:存储地理坐标系统和投影信息。
  • .cpg:指定.dbf文件的字符编码。
  • .sbn、.sbx:空间索引文件。
  • .ixs:地理编码索引。
  • .mxs:地理编码索引(ODB格式)。
  • .atx:.dbf文件的属性索引。

dbf编码自动识别

.dbf 文件头结构如下:
Shpfile属性编码自动识别(C#,Java,C)_第1张图片
.dbf 文件头中第29个字节(从0开始)表示Language driver ID,其代表的编码页参照以下链接:

http://shapelib.maptools.org/codepage.html?d=1563413688103

节选部分如下:

Shpfile属性编码自动识别(C#,Java,C)_第2张图片
.cpg中明文存储文件编码,例如:windows-1251。

解析编码页时,优先使用 .dbf 中的 Language Driver ID;若其值为 0x00 或 0x57(即未指定具体编码页),则取 .cpg 文件中的编码。

C# 源码:

/// <summary>
/// Resolves the text encoding of the .dbf attribute table from its Language Driver ID (LDID).
/// When the LDID names no code page (0x00 or 0x57), the encoding written in the sibling
/// .cpg file is used instead.
/// Code page table: http://shapelib.maptools.org/codepage.html?d=1563413688103
/// </summary>
/// <param name="languageDriverID">Language Driver ID byte read from the .dbf header.</param>
/// <returns>
/// The resolved encoding. Falls back to UTF-8 (or the system default encoding for 0x57)
/// when no usable encoding can be determined.
/// </returns>
private System.Text.Encoding GetEncoding(byte languageDriverID)
{
    try
    {
        switch (languageDriverID)
        {
            case 0x00:
            case 0x57:
                // 0x00: use the encoding from the .cpg file; default to UTF-8 when unavailable.
                // 0x57: use the encoding from the .cpg file; default to the system encoding when unavailable.
                Encoding fallback = languageDriverID == 0x00 ? Encoding.UTF8 : Encoding.Default;

                string cpgPath = Path.ChangeExtension(_fileName, ".cpg");
                if (!File.Exists(cpgPath))
                {
                    return fallback;
                }

                try
                {
                    using (StreamReader sr = new StreamReader(cpgPath))
                    {
                        string firstLine = sr.ReadLine();
                        // Trim so trailing spaces or stray whitespace do not make a valid name unresolvable.
                        string encodingName = firstLine == null ? string.Empty : firstLine.Trim();
                        if (encodingName.Length == 0)
                        {
                            return fallback;
                        }

                        // A .cpg file may hold a bare code page number (e.g. "936") instead of a name;
                        // resolve that through the numeric overload.
                        int codePage;
                        if (int.TryParse(encodingName, out codePage) && codePage > 0)
                        {
                            return Encoding.GetEncoding(codePage);
                        }

                        return Encoding.GetEncoding(encodingName);
                    }
                }
                catch (Exception)
                {
                    // Unreadable file or unknown encoding name: deliberately best-effort.
                    return fallback;
                }
            case 0x01:
            case 0x09:
            case 0x0B:
            case 0x0D:
            case 0x0F:
            case 0x11:
            case 0x15:
            case 0x18:
            case 0x19:
            case 0x1B:
                return Encoding.GetEncoding(437);   // IBM437
            case 0x02:
            case 0x0A:
            case 0x0E:
            case 0x10:
            case 0x12:
            case 0x14:
            case 0x16:
            case 0x1A:
            case 0x1D:
            case 0x25:
            case 0x37:
                return Encoding.GetEncoding(850);   // ibm850
            case 0x03:
            case 0x58:
            case 0x59:
                return Encoding.GetEncoding(1252);  // Windows-1252
            case 0x04:
                return Encoding.GetEncoding(10000); // macintosh
            case 0x08:
            case 0x17:
            case 0x66:
                return Encoding.GetEncoding(865);   // IBM865
            case 0x13:
            case 0x7B:
                return Encoding.GetEncoding(932);   // shift_jis
            case 0x1C:
            case 0x6C:
                return Encoding.GetEncoding(863);   // IBM863
            case 0x1F:
            case 0x22:
            case 0x23:
            case 0x40:
            case 0x64:
            case 0x87:
                return Encoding.GetEncoding(852);   // ibm852
            case 0x24:
                return Encoding.GetEncoding(860);   // IBM860
            case 0x26:
            case 0x65:
                return Encoding.GetEncoding(866);   // cp866
            case 0x4D:
            case 0x7A:
                return Encoding.GetEncoding(936);   // gb2312
            case 0x4E:
            case 0x79:
                return Encoding.GetEncoding(949);   // ks_c_5601-1987
            case 0x4F:
            case 0x78:
                return Encoding.GetEncoding(950);   // big5
            case 0x50:
            case 0x7C:
                return Encoding.GetEncoding(874);   // windows-874
            case 0x67:
                return Encoding.GetEncoding(861);   // ibm861
            case 0x68:
                // Code page 895 is noted as not available on the system; when GetEncoding
                // throws, the outer catch below returns UTF-8.
                return Encoding.GetEncoding(895);
            case 0x69:
                // Code page 620 is noted as not available on the system; when GetEncoding
                // throws, the outer catch below returns UTF-8.
                return Encoding.GetEncoding(620);
            case 0x6A:
            case 0x86:
                return Encoding.GetEncoding(737);   // ibm737
            case 0x6B:
            case 0x88:
                return Encoding.GetEncoding(857);   // ibm857
            case 0x96:
                return Encoding.GetEncoding(10007); // x-mac-cyrillic
            case 0x97:
                return Encoding.GetEncoding(10029); // x-mac-ce
            case 0x98:
                return Encoding.GetEncoding(10006); // x-mac-greek
            case 0xC8:
                return Encoding.GetEncoding(1250);  // windows-1250
            case 0xC9:
                return Encoding.GetEncoding(1251);  // windows-1251
            case 0xCA:
                return Encoding.GetEncoding(1254);  // windows-1254
            case 0xCB:
                return Encoding.GetEncoding(1253);  // windows-1253
            case 0xCC:
                return Encoding.GetEncoding(1257);  // windows-1257
            default:
                return Encoding.UTF8;
        }
    }
    catch (Exception)
    {
        // Any code page the runtime cannot provide degrades to UTF-8 instead of failing the read.
        return Encoding.UTF8;
    }
}

Java 源码:

/**
 * Resolves the charset name of the .dbf attribute table from its Language Driver ID (LDID).
 * When the LDID names no code page (0x00 or 0x57), the encoding written in the sibling
 * .cpg file is used instead.
 * Code page table: http://shapelib.maptools.org/codepage.html?d=1563413688103
 *
 * @param languageDriverID Language Driver ID byte read from the .dbf header
 * @return the charset name; {@code ExportParameters.ENCODING_UTF8} when none can be determined
 */
private String GetEncoding(byte languageDriverID) {
    try
    {
        switch (languageDriverID)
        {
            case 0x00:
            case 0x57:
                // 0x00 / 0x57: use the encoding from the .cpg file; default to UTF-8 when unavailable.
                // The path is derived by swapping the last three characters of _fileName for "cpg".
                File cpgPath = new File(_fileName.substring(0, _fileName.length() - 3) + "cpg");
                if (!cpgPath.exists())
                {
                    return ExportParameters.ENCODING_UTF8;
                }
                // try-with-resources closes the underlying file stream on every path.
                // Fully-qualified names are used so no additional imports are required.
                try (java.io.BufferedReader reader = new java.io.BufferedReader(
                        new java.io.InputStreamReader(new FileInputStream(cpgPath), "ISO-8859-1")))
                {
                    String firstLine = reader.readLine();
                    // Trim before the emptiness check so a whitespace-only file is not
                    // returned as an (invalid) empty charset name.
                    String encodingName = (firstLine == null) ? "" : firstLine.trim();
                    if (encodingName.length() == 0)
                    {
                        return ExportParameters.ENCODING_UTF8;
                    }
                    return encodingName;
                }
                catch (Exception ex)
                {
                    // Unreadable .cpg file: deliberately best-effort.
                    return ExportParameters.ENCODING_UTF8;
                }
            case 0x01:
            case 0x09:
            case 0x0B:
            case 0x0D:
            case 0x0F:
            case 0x11:
            case 0x15:
            case 0x18:
            case 0x19:
            case 0x1B:
                return "IBM437";
            case 0x02:
            case 0x0A:
            case 0x0E:
            case 0x10:
            case 0x12:
            case 0x14:
            case 0x16:
            case 0x1A:
            case 0x1D:
            case 0x25:
            case 0x37:
                return "ibm850";
            case 0x03:
            case 0x58:
            case 0x59:
                return "Windows-1252";
            case 0x04:
                return "macintosh";
            case 0x08:
            case 0x17:
            case 0x66:
                return "IBM865";
            case 0x13:
            case 0x7B:
                // Code page 932 is Shift-JIS; ISO-2022-JP is a different, 7-bit stateful encoding.
                return "Shift_JIS";
            case 0x1C:
            case 0x6C:
                return "IBM863";
            case 0x1F:
            case 0x22:
            case 0x23:
            case 0x40:
            case 0x64:
            case (byte) 0x87:
                return "ibm852";
            case 0x24:
                return "IBM860";
            case 0x26:
            case 0x65:
                return "cp866";
            case 0x4D:
            case 0x7A:
                return "gb2312";
            case 0x4E:
            case 0x79:
                return "ks_c_5601-1987";
            case 0x4F:
            case 0x78:
                return "big5";
            case 0x50:
            case 0x7C:
                return "windows-874";
            case 0x67:
                return "ibm861";
            case 0x68:
                // Code page 895 is not available on the system; fall back to UTF-8.
                return ExportParameters.ENCODING_UTF8;
            case 0x69:
                // Code page 620 is not available on the system; fall back to UTF-8.
                return ExportParameters.ENCODING_UTF8;
            case 0x6A:
            case (byte) 0x86:
                return "ibm737";
            case 0x6B:
            case (byte) 0x88:
                return "ibm857";
            case (byte) 0x96:
                return "x-mac-cyrillic";
            case (byte) 0x97:
                return "x-mac-ce";
            case (byte) 0x98:
                return "x-mac-greek";
            case (byte) 0xC8:
                return "windows-1250";
            case (byte) 0xC9:
                return "windows-1251";
            case (byte) 0xCA:
                return "windows-1254";
            case (byte) 0xCB:
                return "windows-1253";
            case (byte) 0xCC:
                return "windows-1257";
            default:
                return ExportParameters.ENCODING_UTF8;
        }
    }
    catch (Exception ex)
    {
        return ExportParameters.ENCODING_UTF8;
    }
}

C 源码

/*
 * Writes into `result` the encoding name that corresponds to the .dbf Language
 * Driver ID `encodingCode`. For 0x00 and 0x57 the name is taken from the .cpg
 * file at `cpgPath` via getCpgEncode(); when that does not return 1, 0x00 yields
 * "UTF-8" and 0x57 yields whatever getSystemEncode() reports.
 *
 * NOTE(review): `result` is filled with strcpy and the interface carries no
 * buffer size, so the caller must supply a buffer large enough for the longest
 * name written here ("MacCentralEurope", 17 bytes with the terminator) and for
 * anything the .cpg file may contain - confirm against callers.
 */
void  getEncoding(unsigned char encodingCode, char *result, const char * cpgPath){
	switch (encodingCode)
	{
		/* 0x00: use the encoding from the .cpg file; default to UTF-8 when unavailable. */
		/* 0x57: use the encoding from the .cpg file; default to the system encoding when unavailable. */
	case 0x00:
	case 0x57:
	{
		/* NOTE(review): assumes getCpgEncode/getSystemEncode never write more than 20 bytes - confirm. */
		char encode[20] = { '\0' };
		int cpgResult = getCpgEncode(cpgPath, encode);
		if (cpgResult == 1)
		{
			strcpy(result, encode);
		}
		else
		{
			if (encodingCode == 0x00)
			{
				strcpy(result, "UTF-8");
			}
			else
			{
				getSystemEncode(encode);
				strcpy(result, encode);
			}
		}
		break;
	}
	case 0x01:
	case 0x09:
	case 0x0B:
	case 0x0D:
	case 0x0F:
	case 0x11:
	case 0x15:
	case 0x18:
	case 0x19:
	case 0x1B:
		strcpy(result, "CP437");
		break;
	case 0x02:
	case 0x0A:
	case 0x0E:
	case 0x10:
	case 0x12:
	case 0x14:
	case 0x16:
	case 0x1A:
	case 0x1D:
	case 0x25:
	case 0x37:
		strcpy(result, "CP850");
		break;
	case 0x03:
	case 0x58:
	case 0x59:
		strcpy(result, "CP1252");
		break;
	case 0x04:
		strcpy(result, "Macintosh");
		break;
	case 0x08:
	case 0x17:
	case 0x66:
		strcpy(result, "CP865"); /* IBM865 */
		break;
	case 0x13:
	case 0x7B:
		/* Code page 932 is Shift-JIS; ISO-2022-JP is a different, 7-bit stateful encoding. */
		strcpy(result, "CP932");
		break;
	case 0x1C:
	case 0x6C:
		strcpy(result, "CP863"); /* IBM863 */
		break;
	case 0x1F:
	case 0x22:
	case 0x23:
	case 0x40:
	case 0x64:
	case 0x87:
		strcpy(result, "CP852"); /* ibm852 */
		break;
	case 0x24:
		strcpy(result, "CP860");
		break;
	case 0x26:
	case 0x65:
		strcpy(result, "CP866"); /* cp866 */
		break;
	case 0x4D:
	case 0x7A:
		strcpy(result, "GB18030"); /* gb2312 */
		break;
	case 0x4E:
	case 0x79:
		/* Code page 949 (ks_c_5601-1987); ISO-2022-KR is a different, 7-bit stateful encoding. */
		strcpy(result, "CP949");
		break;
	case 0x4F:
	case 0x78:
		strcpy(result, "BIG5"); /* big5 */
		break;
	case 0x50:
	case 0x7C:
		strcpy(result, "CP874"); /* windows-874 */
		break;
	case 0x67:
		strcpy(result, "CP861"); /* ibm861 */
		break;
	case 0x68:
		strcpy(result, "UTF-8"); /* code page 895 not available on the system */
		break;
	case 0x69:
		strcpy(result, "UTF-8"); /* code page 620 not available on the system */
		break;
	case 0x6A:
	case 0x86:
		strcpy(result, "CP737"); /* ibm737 */
		break;
	case 0x6B:
	case 0x88:
		strcpy(result, "CP857"); /* ibm857 */
		break;
	case 0x96:
		strcpy(result, "MacCyrillic"); /* x-mac-cyrillic */
		break;
	case 0x97:
		strcpy(result, "MacCentralEurope"); /* x-mac-ce */
		break;
	case 0x98:
		strcpy(result, "MacGreek"); /* x-mac-greek */
		break;
	case 0xC8:
		strcpy(result, "CP1250"); /* windows-1250 */
		break;
	case 0xC9:
		strcpy(result, "CP1251"); /* windows-1251 */
		break;
	case 0xCA:
		strcpy(result, "CP1254"); /* windows-1254 */
		break;
	case 0xCB:
		strcpy(result, "CP1253"); /* windows-1253 */
		break;
	case 0xCC:
		strcpy(result, "CP1257"); /* windows-1257 */
		break;
	default:
		strcpy(result, "UTF-8");
		break;
	}
}

参考资料:

  • Shape文件的解析
  • shapefile与字符集编码设置

你可能感兴趣的:(GIS)