iconv 文件编码转换

                                                             iconv 文件编码转换

http://www.cnblogs.com/xuxm2007/archive/2010/11/09/1872379.html

http://qq164587043.blog.51cto.com/261469/63349

Linux shell 配置文件中默认的字符集编码为 UTF-8。UTF-8 是 Unicode 的一种编码实现方式,而 GB2312 和 Unicode 都是字符集(字符的编码方式),所以说 GB2312 跟 UTF-8 的概念并不在同一个层次上。在 Linux 上进行编码转换时,可以利用 iconv 命令实现,这是针对文件的,即将指定文件从一种编码转换为另一种编码。

查了下iconv命令用法如下:

iconv [选项...]  [文件...]

有如下选项可用:

输入/输出格式规范:
-f, --from-code=名称 原始文本编码
-t, --to-code=名称 输出编码

信息:
-l, --list 列举所有已知的字符集

输出控制:
-c 从输出中忽略无效的字符
-o, --output=FILE 输出文件
-s, --silent 关闭警告
--verbose 打印进度信息

iconv -f utf-8 -t gb2312  /server_test/reports/software_.txt >  /server_test/reports/software_asserts.txt

iconv函数族的头文件是iconv.h,使用前需包含之。
#include <iconv.h>
iconv函数族有三个函数,原型如下:
(1) iconv_t iconv_open(const char  *tocode, const char  *fromcode);
此函数说明将要进行哪两种编码的转换,tocode是目标编码,fromcode是原编码,该函数返回一个转换句柄,供以下两个函数使用。
(2)  size_t iconv(iconv_t cd,char **inbuf,size_t *inbytesleft,char **outbuf,size_t  *outbytesleft);
此函数从inbuf中读取字符,转换后输出到outbuf中,inbytesleft用以记录还未转换的字节数,outbytesleft用以记录输出缓冲的剩余空间。
(3) int iconv_close(iconv_t cd);
此函数用于关闭转换句柄,释放资源。
例子1: 用C语言实现的转换示例程序

/* f.c : 代码转换示例C程序 */
#include <iconv.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#define OUTLEN 255
/* Prototypes for the converters defined further down in this file. */
int u2g(char *inbuf, int inlen, char *outbuf, int outlen);
int g2u(char *inbuf, size_t inlen, char *outbuf, size_t outlen);

/*
 * Demo driver: converts one sample phrase from UTF-8 to GB2312 and one from
 * GB2312 to UTF-8, printing each result.
 *
 * Returns 0 on success, 1 if either conversion reports failure.
 */
int main(void)
{
    /*
     * The sample phrase is U+6B63 U+5728 U+5B89 U+88C5 ("installing").
     * Both inputs are written as explicit byte escapes so that their
     * encoding does not depend on how this source file itself is saved.
     */
    char in_utf8[] = "\xe6\xad\xa3\xe5\x9c\xa8\xe5\xae\x89\xe8\xa3\x85";
    char in_gb2312[] = "\xd5\xfd\xd4\xda\xb0\xb2\xd7\xb0";
    char out[OUTLEN];
    int rc;

    /* UTF-8 -> GB2312 */
    rc = u2g(in_utf8, (int)strlen(in_utf8), out, OUTLEN);
    if (rc != 0) {
        fprintf(stderr, "unicode-->gb2312 conversion failed\n");
        return 1;
    }
    printf("unicode-->gb2312  out=%s\n", out);

    /* GB2312 -> UTF-8 */
    rc = g2u(in_gb2312, strlen(in_gb2312), out, OUTLEN);
    if (rc != 0) {
        fprintf(stderr, "gb2312-->unicode conversion failed\n");
        return 1;
    }
    printf("gb2312-->unicode  out=%s\n", out);

    return 0;
}
/*
 * Convert a buffer from one character encoding to another using iconv.
 *
 * from_charset  name of the source encoding, as accepted by iconv_open()
 * to_charset    name of the target encoding
 * inbuf         input bytes
 * inlen         number of input bytes
 * outbuf        caller-allocated output buffer; zero-filled before conversion
 * outlen        size of outbuf in bytes
 *
 * Returns 0 on success and -1 on any failure (bad arguments, unknown
 * charset, or a conversion error such as invalid input or an output buffer
 * that is too small). Because outbuf is zero-filled first, the result is
 * NUL-terminated whenever it occupies fewer than outlen bytes.
 */
int code_convert(char  *from_charset,char *to_charset,char *inbuf,int inlen,char *outbuf,int  outlen)
{
    iconv_t cd;
    char *in_ptr = inbuf;
    char *out_ptr = outbuf;
    size_t in_left;
    size_t out_left;
    size_t rc;

    /* Negative lengths would turn into huge size_t values below. */
    if (!from_charset || !to_charset || !outbuf || inlen < 0 || outlen < 0)
        return -1;

    /* iconv_open() signals failure with (iconv_t)-1, not with 0. */
    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
        return -1;

    /*
     * iconv() reads and writes the counters through size_t pointers, so they
     * must be real size_t objects rather than the int parameters.
     */
    in_left = (size_t)inlen;
    out_left = (size_t)outlen;
    memset(outbuf, 0, out_left);

    rc = iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);

    /* Close on every path so the descriptor is not leaked on failure. */
    iconv_close(cd);

    return rc == (size_t)-1 ? -1 : 0;
}
/*
 * Convert inlen bytes at inbuf to GB2312, writing into outbuf (outlen bytes).
 * The source charset handed to code_convert() is "utf-8".
 * Returns whatever code_convert() returns.
 */
int u2g(char  *inbuf,int inlen,char *outbuf,int outlen)
{
    int status = code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);

    return status;
}
/*
 * Convert inlen bytes at inbuf from GB2312 to UTF-8, writing into outbuf
 * (outlen bytes).
 *
 * Returns -1 if either length does not fit in an int; otherwise returns
 * whatever code_convert() returns.
 */
int  g2u(char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
    /*
     * code_convert() takes int lengths; reject sizes that would be mangled
     * by the narrowing conversion instead of passing a wrong length on.
     */
    if (inlen > (size_t)INT_MAX || outlen > (size_t)INT_MAX)
        return -1;

    return code_convert("gb2312", "utf-8", inbuf, (int)inlen, outbuf, (int)outlen);
}

例子2: 用C++语言实现的转换示例程序

/* f.cpp : 代码转换示例C++程序 */
#include <iconv.h>
#include <iostream>

#define OUTLEN  255

using namespace std;

// Owns one iconv conversion descriptor for a fixed (from, to) charset pair.
//
// The descriptor is opened in the constructor and closed in the destructor.
// The class keeps its implicit copy operations, so a copy shares the same
// descriptor and both destructors would close it; avoid copying instances.
class CodeConverter  {
private:
    iconv_t cd;   // (iconv_t)-1 when iconv_open() failed

public:
    // Opens a descriptor converting from from_charset to to_charset.
    // Failure is not reported here; convert() returns -1 in that case.
    CodeConverter(const char  *from_charset,const char *to_charset) {
        cd = iconv_open(to_charset, from_charset);
    }

    // Closes the descriptor, but only if it was opened successfully.
    ~CodeConverter()  {
        if (cd != (iconv_t)-1)
            iconv_close(cd);
    }

    // Converts inlen bytes at inbuf into outbuf (outlen bytes), which is
    // zero-filled first.
    //
    // Returns the value reported by iconv() on success, or -1 when the
    // descriptor is invalid, an argument is unusable, or iconv() fails.
    int convert(char *inbuf,int  inlen,char *outbuf,int outlen) {
        if (cd == (iconv_t)-1 || !outbuf || inlen < 0 || outlen < 0)
            return -1;

        // iconv() updates the counters through size_t pointers, so use real
        // size_t objects instead of casting the addresses of int parameters.
        size_t in_left = static_cast<size_t>(inlen);
        size_t out_left = static_cast<size_t>(outlen);
        char *in_ptr = inbuf;
        char *out_ptr = outbuf;

        memset(outbuf, 0, out_left);

        size_t rc = iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);
        return rc == static_cast<size_t>(-1) ? -1 : static_cast<int>(rc);
    }
};

int main(int argc,  char **argv)
{
char *in_utf8 = "姝e?ㄥ??瑁?";
char *in_gb2312 =  "正在安装";
char out[OUTLEN];

// utf-8-->gb2312
CodeConverter cc =  CodeConverter("utf-8","gb2312");
cc.convert(in_utf8,strlen(in_utf8),out,OUTLEN);
cout << "utf-8-->gb2312 in=" << in_utf8 << ",out=" << out << endl;

// gb2312-->utf-8
CodeConverter cc2 =  CodeConverter("gb2312","utf-8");
cc2.convert(in_gb2312,strlen(in_gb2312),out,OUTLEN);
cout << "gb2312-->utf-8 in=" << in_gb2312 << ",out=" <<  out << endl;
}

iconv的支持的编码有

$ iconv -l
437, 500, 500V1, 850, 851, 852, 855, 856, 857, 860, 861, 862, 863, 864, 865, 866, 866NAV, 869, 874, 904, 1026, 1046, 1047, 8859_1, 8859_2, 8859_3, 8859_4, 8859_5, 8859_6, 8859_7, 8859_8, 8859_9, 10646-1:1993, 10646-1:1993/UCS4/
ANSI_X3.4-1968, ANSI_X3.4-1986, ANSI_X3.4,  ANSI_X3.110-1983, ANSI_X3.110, ARABIC, ARABIC7, ARMSCII-8,  ASCII, ASMO-708, ASMO_449, BALTIC, BIG-5,  BIG-FIVE, BIG5-HKSCS, BIG5, BIG5HKSCS, BIGFIVE, BS_4730, CA, CN-BIG5, CN-GB,  CN, CP-AR,  CP-GR, CP-HU,  CP037, CP038, CP273, CP274, CP275, CP278, CP280, CP281, CP282, CP284, CP285, CP290, CP297, CP367, CP420, CP423, CP424, CP437, CP500, CP737, CP775, CP813, CP819, CP850, CP851, CP852, CP855, CP856, CP857, CP860, CP861, CP862, CP863, CP864, CP865, CP866, CP866NAV, CP868, CP869, CP870, CP871, CP874, CP875, CP880, CP891, CP903, CP904, CP905, CP912, CP915, CP916, CP918, CP920, CP922, CP930, CP932, CP933, CP935, CP936, CP937, CP939, CP949, CP950, CP1004, CP1026, CP1046, CP1047, CP1070, CP1079, CP1081, CP1084, CP1089, CP1124, CP1125, CP1129, CP1132, CP1133, CP1160, CP1161, CP1162, CP1163, CP1164, CP1250, CP1251, CP1252, CP1253, CP1254, CP1255, CP1256, CP1257, CP1258, CP1361, CP10007, CPIBM861, CSA7-1,  CSA7-2, CSASCII, CSA_T500-1983, CSA_T500, CSA_Z243.4-1985-1,  CSA_Z243.4-1985-2,  CSA_Z243.419851, CSA_Z243.419852, CSDECMCS, CSEBCDICATDE, CSEBCDICATDEA, CSEBCDICCAFR, CSEBCDICDKNO, CSEBCDICDKNOA, CSEBCDICES, CSEBCDICESA, CSEBCDICESS, CSEBCDICFISE, CSEBCDICFISEA, CSEBCDICFR, CSEBCDICIT, CSEBCDICPT, CSEBCDICUK, CSEBCDICUS, CSEUCKR, CSEUCPKDFMTJAPANESE, CSGB2312, CSHPROMAN8, CSIBM037, CSIBM038, CSIBM273, CSIBM274, CSIBM275, CSIBM277, CSIBM278, CSIBM280, CSIBM281, CSIBM284, CSIBM285, CSIBM290, CSIBM297, CSIBM420, CSIBM423, CSIBM424, CSIBM500, CSIBM851, CSIBM855, CSIBM856, CSIBM857, CSIBM860, CSIBM863, CSIBM864, CSIBM865, CSIBM866, CSIBM868, CSIBM869, CSIBM870, CSIBM871, CSIBM880, CSIBM891, CSIBM903, CSIBM904, CSIBM905, CSIBM918, CSIBM922, CSIBM930, CSIBM932, CSIBM933, CSIBM935, CSIBM937, CSIBM939, CSIBM943, CSIBM1026, CSIBM1124, CSIBM1129, CSIBM1132, CSIBM1133, CSIBM1160, CSIBM1161, CSIBM1163, CSIBM1164, CSIBM11621162, CSISO4UNITEDKINGDOM, CSISO10SWEDISH, CSISO11SWEDISHFORNAMES, CSISO14JISC6220RO, CSISO15ITALIAN, 
CSISO16PORTUGESE, CSISO17SPANISH, CSISO18GREEK7OLD, CSISO19LATINGREEK, CSISO21GERMAN, CSISO25FRENCH, CSISO27LATINGREEK1, CSISO49INIS, CSISO50INIS8, CSISO51INISCYRILLIC, CSISO58GB1988, CSISO60DANISHNORWEGIAN, CSISO60NORWEGIAN1, CSISO61NORWEGIAN2, CSISO69FRENCH, CSISO84PORTUGUESE2, CSISO85SPANISH2, CSISO86HUNGARIAN, CSISO88GREEK7, CSISO89ASMO449, CSISO90, CSISO92JISC62991984B, CSISO99NAPLPS, CSISO103T618BIT, CSISO111ECMACYRILLIC, CSISO121CANADIAN1, CSISO122CANADIAN2, CSISO139CSN369103, CSISO141JUSIB1002, CSISO143IECP271, CSISO150, CSISO150GREEKCCITT, CSISO151CUBA, CSISO153GOST1976874, CSISO646DANISH, CSISO2022CN, CSISO2022JP, CSISO2022JP2, CSISO2022KR, CSISO2033, CSISO5427CYRILLIC, CSISO5427CYRILLIC1981, CSISO5428GREEK, CSISO10367BOX, CSISOLATIN1, CSISOLATIN2, CSISOLATIN3, CSISOLATIN4, CSISOLATIN5, CSISOLATIN6, CSISOLATINARABIC, CSISOLATINCYRILLIC, CSISOLATINGREEK, CSISOLATINHEBREW, CSKOI8R, CSKSC5636, CSMACINTOSH, CSNATSDANO, CSNATSSEFI, CSN_369103, CSPC8CODEPAGE437, CSPC775BALTIC, CSPC850MULTILINGUAL, CSPC862LATINHEBREW, CSPCP852, CSSHIFTJIS, CSUCS4, CSUNICODE, CSWINDOWS31J, CUBA, CWI-2,  CWI, CYRILLIC, DE, DEC-MCS, DEC,  DECMCS, DIN_66003, DK, DS2089, DS_2089, E13B/
EBCDIC-AT-DE-A,  EBCDIC-AT-DE,  EBCDIC-BE, EBCDIC-BR,  EBCDIC-CA-FR,  EBCDIC-CP-AR1, EBCDIC-CP-AR2, EBCDIC-CP-BE,  EBCDIC-CP-CA,  EBCDIC-CP-CH,  EBCDIC-CP-DK,  EBCDIC-CP-ES,  EBCDIC-CP-FI,  EBCDIC-CP-FR,  EBCDIC-CP-GB,  EBCDIC-CP-GR,  EBCDIC-CP-HE,  EBCDIC-CP-IS,  EBCDIC-CP-IT,  EBCDIC-CP-NL,  EBCDIC-CP-NO,  EBCDIC-CP-ROECE, EBCDIC-CP-SE,  EBCDIC-CP-TR,  EBCDIC-CP-US,  EBCDIC-CP-WT,  EBCDIC-CP-YU,  EBCDIC-CYRILLIC, EBCDIC-DK-NO-A,  EBCDIC-DK-NO,  EBCDIC-ES-A,  EBCDIC-ES-S,  EBCDIC-ES,  EBCDIC-FI-SE-A,  EBCDIC-FI-SE,  EBCDIC-FR, EBCDIC-GREEK, EBCDIC-INT,  EBCDIC-INT1, EBCDIC-IS-FRISS, EBCDIC-IT,  EBCDIC-JP-E,  EBCDIC-JP-KANA, EBCDIC-PT,  EBCDIC-UK, EBCDIC-US,  EBCDICATDE, EBCDICATDEA, EBCDICCAFR, EBCDICDKNO, EBCDICDKNOA, EBCDICES, EBCDICESA, EBCDICESS, EBCDICFISE, EBCDICFISEA, EBCDICFR, EBCDICISFRISS, EBCDICIT, EBCDICPT, EBCDICUK, EBCDICUS, ECMA-114, ECMA-118, ECMA-128, ECMA-CYRILLIC, ECMACYRILLIC, ELOT_928, ES,  ES2, EUC-CN,  EUC-JISX0213, EUC-JP-MS,  EUC-JP,  EUC-KR, EUC-TW,  EUCCN, EUCJP-MS,  EUCJP-OPEN, EUCJP-WIN, EUCJP, EUCKR, EUCTW, FI, FR, GB, GB2312, GB13000, GB18030, GBK, GB_1988-80,  GB_198880, GEORGIAN-ACADEMY, GEORGIAN-PS,  GOST_19768-74, GOST_19768, GOST_1976874, GREEK-CCITT, GREEK, GREEK7-OLD, GREEK7, GREEK7OLD, GREEK8, GREEKCCITT, HEBREW, HP-ROMAN8, HPROMAN8, HU, IBM-856, IBM-922, IBM-930, IBM-932, IBM-933, IBM-935, IBM-937, IBM-939, IBM-943, IBM-1046, IBM-1047, IBM-1124, IBM-1129, IBM-1132, IBM-1133, IBM-1160, IBM-1161, IBM-1162, IBM-1163, IBM-1164, IBM037, IBM038, IBM256, IBM273, IBM274, IBM275, IBM277, IBM278, IBM280, IBM281, IBM284, IBM285, IBM290, IBM297, IBM367, IBM420, IBM423, IBM424, IBM437, IBM500, IBM775, IBM813, IBM819, IBM848, IBM850, IBM851, IBM852, IBM855, IBM856, IBM857, IBM860, IBM861, IBM862, IBM863, IBM864, IBM865, IBM866, IBM866NAV, IBM868, IBM869, IBM870, IBM871, IBM874, IBM875, IBM880, IBM891, IBM903, IBM904, IBM905, IBM912, IBM915, IBM916, IBM918, IBM920, IBM922, IBM930, IBM932, IBM933, IBM935, IBM937, IBM939, IBM943, IBM1004, 
IBM1026, IBM1046, IBM1047, IBM1089, IBM1124, IBM1129, IBM1132, IBM1133, IBM1160, IBM1161, IBM1162, IBM1163, IBM1164, IEC_P27-1,  IEC_P271, INIS-8,  INIS-CYRILLIC, INIS, INIS8, INISCYRILLIC, ISIRI-3342, ISIRI3342, ISO-2022-CN-EXT, ISO-2022-CN,  ISO-2022-JP-2,  ISO-2022-JP-3,  ISO-2022-JP,  ISO-2022-KR,  ISO-8859-1,  ISO-8859-2,  ISO-8859-3,  ISO-8859-4,  ISO-8859-5,  ISO-8859-6,  ISO-8859-7,  ISO-8859-8,  ISO-8859-9,  ISO-8859-10,  ISO-8859-11,  ISO-8859-13,  ISO-8859-14,  ISO-8859-15,  ISO-8859-16,  ISO-10646, ISO-10646/UCS2/
ISO-10646/UCS4/
ISO-10646/UTF-8/
ISO-10646/UTF8/
ISO-CELTIC, ISO-IR-4,  ISO-IR-6,  ISO-IR-8-1,  ISO-IR-9-1,  ISO-IR-10,  ISO-IR-11,  ISO-IR-14,  ISO-IR-15,  ISO-IR-16,  ISO-IR-17,  ISO-IR-18,  ISO-IR-19,  ISO-IR-21,  ISO-IR-25,  ISO-IR-27,  ISO-IR-37,  ISO-IR-49,  ISO-IR-50,  ISO-IR-51,  ISO-IR-54,  ISO-IR-55,  ISO-IR-57,  ISO-IR-60,  ISO-IR-61,  ISO-IR-69,  ISO-IR-84,  ISO-IR-85,  ISO-IR-86,  ISO-IR-88,  ISO-IR-89,  ISO-IR-90,  ISO-IR-92,  ISO-IR-98,  ISO-IR-99,  ISO-IR-100, ISO-IR-101, ISO-IR-103, ISO-IR-109, ISO-IR-110, ISO-IR-111, ISO-IR-121, ISO-IR-122, ISO-IR-126, ISO-IR-127, ISO-IR-138, ISO-IR-139, ISO-IR-141, ISO-IR-143, ISO-IR-144, ISO-IR-148, ISO-IR-150, ISO-IR-151, ISO-IR-153, ISO-IR-155, ISO-IR-156, ISO-IR-157, ISO-IR-166, ISO-IR-179, ISO-IR-193, ISO-IR-197, ISO-IR-199, ISO-IR-203, ISO-IR-209, ISO-IR-226, ISO646-CA,  ISO646-CA2, ISO646-CN,  ISO646-CU, ISO646-DE,  ISO646-DK, ISO646-ES,  ISO646-ES2, ISO646-FI,  ISO646-FR, ISO646-FR1, ISO646-GB,  ISO646-HU, ISO646-IT,  ISO646-JP-OCR-B,  ISO646-JP,  ISO646-KR, ISO646-NO,  ISO646-NO2, ISO646-PT,  ISO646-PT2, ISO646-SE,  ISO646-SE2, ISO646-US,  ISO646-YU, ISO2022CN, ISO2022CNEXT, ISO2022JP, ISO2022JP2, ISO2022KR, ISO6937, ISO8859-1,  ISO8859-2, ISO8859-3,  ISO8859-4, ISO8859-5,  ISO8859-6, ISO8859-7,  ISO8859-8, ISO8859-9,  ISO8859-10, ISO8859-11,  ISO8859-13, ISO8859-14,  ISO8859-15, ISO8859-16,  ISO88591, ISO88592, ISO88593, ISO88594, ISO88595, ISO88596, ISO88597, ISO88598, ISO88599, ISO885910, ISO885911, ISO885913, ISO885914, ISO885915, ISO885916, ISO_646.IRV:1991, ISO_2033-1983, ISO_2033, ISO_5427-EXT, ISO_5427, ISO_5427:1981, ISO_5427EXT, ISO_5428, ISO_5428:1980, ISO_6937-2,  ISO_6937-2:1983, ISO_6937, ISO_6937:1992, ISO_8859-1,  ISO_8859-1:1987, ISO_8859-2,  ISO_8859-2:1987, ISO_8859-3,  ISO_8859-3:1988, ISO_8859-4,  ISO_8859-4:1988, ISO_8859-5,  ISO_8859-5:1988, ISO_8859-6,  ISO_8859-6:1987, ISO_8859-7,  ISO_8859-7:1987, ISO_8859-7:2003, ISO_8859-8,  ISO_8859-8:1988, ISO_8859-9,  ISO_8859-9:1989, ISO_8859-10,  ISO_8859-10:1992, ISO_8859-14,  
ISO_8859-14:1998, ISO_8859-15,  ISO_8859-15:1998, ISO_8859-16,  ISO_8859-16:2001, ISO_9036, ISO_10367-BOX, ISO_10367BOX, ISO_69372, IT, JIS_C6220-1969-RO,  JIS_C6229-1984-B,  JIS_C62201969RO, JIS_C62291984B, JOHAB, JP-OCR-B,JP, JS,  JUS_I.B1.002, KOI-7,  KOI-8, KOI8-R,  KOI8-T, KOI8-U,  KOI8, KOI8R, KOI8U, KSC5636, L1, L2, L3, L4, L5, L6, L7, L8, L10, LATIN-9,  LATIN-GREEK-1,  LATIN-GREEK, LATIN1, LATIN2, LATIN3, LATIN4, LATIN5, LATIN6, LATIN7, LATIN8, LATIN10, LATINGREEK, LATINGREEK1, MAC-CYRILLIC, MAC-IS,  MAC-SAMI, MAC-UK,  MAC, MACCYRILLIC, MACINTOSH, MACIS, MACUK, MACUKRAINIAN, MS-ANSI, MS-ARAB, MS-CYRL, MS-EE,  MS-GREEK, MS-HEBR, MS-MAC-CYRILLIC, MS-TURK, MS932, MS936, MSCP949, MSCP1361, MSMACCYRILLIC, MSZ_7795.3,  MS_KANJI, NAPLPS, NATS-DANO, NATS-SEFI, NATSDANO, NATSSEFI, NC_NC0010, NC_NC00-10,  NC_NC00-10:81,  NF_Z_62-010, NF_Z_62-010_(1973),  NF_Z_62-010_1973, NF_Z_62010, NF_Z_62010_1973, NO, NO2, NS_4551-1,  NS_4551-2, NS_45511, NS_45512, OS2LATIN1, OSF00010001, OSF00010002, OSF00010003, OSF00010004, OSF00010005, OSF00010006, OSF00010007, OSF00010008, OSF00010009, OSF0001000A, OSF00010020, OSF00010100, OSF00010101, OSF00010102, OSF00010104, OSF00010105, OSF00010106, OSF00030010, OSF0004000A, OSF0005000A, OSF05010001, OSF100201A4, OSF100201A8, OSF100201B5, OSF100201F4, OSF100203B5, OSF1002011C, OSF1002011D, OSF1002035D, OSF1002035E, OSF1002035F, OSF1002036B, OSF1002037B, OSF10010001, OSF10020025, OSF10020111, OSF10020115, OSF10020116, OSF10020118, OSF10020122, OSF10020129, OSF10020352, OSF10020354, OSF10020357, OSF10020359, OSF10020360, OSF10020364, OSF10020365, OSF10020366, OSF10020367, OSF10020370, OSF10020387, OSF10020388, OSF10020396, OSF10020402, OSF10020417, PT, PT2, PT154, R8, RK1048, ROMAN8, RUSCII, SE, SE2, SEN_850200_B, SEN_850200_C, SHIFT-JIS, SHIFT_JIS, SHIFT_JISX0213, SJIS-OPEN, SJIS-WIN, SJIS, SS636127, STRK1048-2002, ST_SEV_358-88,  T.61-8BIT, T.61,  T.618BIT, TCVN-5712, TCVN, TCVN5712-1,  TCVN5712-1:1993, TIS-620, TIS620-0,  TIS620.2529-1, 
 TIS620.2533-0,  TIS620, TS-5881, TSCII, UCS-2,  UCS-2BE, UCS-2LE, UCS-4,  UCS-4BE, UCS-4LE, UCS2, UCS4, UHC, UJIS, UK, UNICODE, UNICODEBIG, UNICODELITTLE, US-ASCII, US, UTF-7,  UTF-8, UTF-16,  UTF-16BE, UTF-16LE, UTF-32,  UTF-32BE, UTF-32LE, UTF7, UTF8, UTF16, UTF16BE, UTF16LE, UTF32, UTF32BE, UTF32LE, VISCII, WCHAR_T, WIN-SAMI-2,  WINBALTRIM, WINDOWS-31J, WINDOWS-874, WINDOWS-936, WINDOWS-1250, WINDOWS-1251, WINDOWS-1252, WINDOWS-1253, WINDOWS-1254, WINDOWS-1255, WINDOWS-1256, WINDOWS-1257, WINDOWS-1258, WINSAMI2, WS2, YU,
$

http://worldant.blog.sohu.com/96069463.html

   在LINUX上进行编码转换时,既可以利用iconv函数族编程实现,也可以利用iconv命令来实现,只不过后者是针对文件的,即将指定文件从一种编码转换为另一种编码。

   (1) 利用iconv函数族进行编码转换

  iconv函数族的头文件是iconv.h,使用前需包含之。
       #include <iconv.h>
  iconv函数族有三个函数,原型如下:
  • iconv_t iconv_open(const char *tocode, const char *fromcode);
    此函数说明将要进行哪两种编码的转换,tocode是目标编码,fromcode是原编码,该函数返回一个转换句柄,供以下两个函数使用。
  • size_t iconv(iconv_t cd,char **inbuf,size_t *inbytesleft,char  **outbuf,size_t *outbytesleft);
    此函数从inbuf中读取字符,转换后输出到outbuf中,inbytesleft用以记录还未转换的字节数,outbytesleft用以记录输出缓冲的剩余空间。
  • int iconv_close(iconv_t cd);
   此函数用于关闭转换句柄,释放资源。

(2) 利用iconv命令进行编码转换

  iconv命令用于转换指定文件的编码,默认输出到标准输出设备,亦可指定输出文件。

  用法: iconv [选项...] [文件...]

  有如下选项可用:

输入/输出格式规范:
-f, --from-code=名称  原始文本编码
-t, --to-code=名称 输出编码

信息:
-l, --list 列举所有已知的字符集

输出控制:
-c 从输出中忽略无效的字符
-o, --output=FILE 输出文件
-s, --silent 关闭警告
--verbose 打印进度信息

-?, --help 给出该系统求助列表
--usage 给出简要的用法信息
-V, --version 打印程序版本号

例子:
iconv -f utf-8 -t gb2312  aaa.txt > bbb.txt
这个命令读取aaa.txt文件,从utf-8编码转换为gb2312编码,其输出定向到bbb.txt文件。


2.iconv实现通用语言编码转换(c++)
   可以实现对任意的两个iconv支持的语言编码做互相转换,比如GB2312, GBK, GB18030, UTF-8, UTF-16,  BIG5等.
下面这段程序,非常的稳定,测试了超过10万行的数十种编码的文本的转换都没有出问题。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <iconv.h>

/*
 * Qualifier applied to the input pointer handed to iconv(). It defaults to
 * nothing, matching an iconv() whose second parameter is `char **`. Builds
 * against an iconv() that takes `const char **` can predefine ICONV_CONST
 * as `const`.
 */
#ifndef ICONV_CONST
# define ICONV_CONST
#endif

/*
 * Convert a byte string from one character encoding to another.
 *
 * from     source encoding name, e.g. "GB2312", spelled as iconv expects
 * to       target encoding name
 * save     caller-allocated buffer that receives the converted bytes
 * savelen  size of save in bytes; one byte is reserved for a trailing NUL
 * src      bytes to convert
 * srclen   number of bytes in src
 *
 * Input bytes that iconv rejects as an invalid sequence are skipped one at a
 * time and conversion continues. An incomplete multibyte sequence at the end
 * of the input stops conversion, and what was converted so far is kept.
 *
 * Returns the number of bytes written to save (excluding the trailing NUL),
 * or a negative code:
 *   -1  srclen is 0 (nothing to convert)
 *   -2  iconv_open() rejected the encoding pair
 *   -5  save is too small for the converted text
 *   -6  invalid arguments or any other iconv failure
 * Once the arguments have been accepted, save is always NUL-terminated, even
 * when an error code is returned.
 */
int convert(const char *from, const char *to, char *save, int savelen, char *src, int srclen)
{
    iconv_t cd;
    ICONV_CONST char *inptr = src;
    char *outptr = save;
    size_t insize;
    size_t outsize;
    int status;

    if (from == NULL || to == NULL || save == NULL || src == NULL
        || savelen <= 0 || srclen < 0)
        return -6;
    if (srclen == 0)
        return -1;

    cd = iconv_open(to, from);
    if (cd == (iconv_t)-1)
        return -2;

    insize = (size_t)srclen;
    /* Keep one byte back so the terminator written at `done` always fits. */
    outsize = (size_t)savelen - 1;

    while (insize > 0) {
        size_t res = iconv(cd, &inptr, &insize, &outptr, &outsize);

        if (res != (size_t)-1)
            break;                      /* whole input consumed */

        if (errno == EILSEQ) {
            /* Drop the offending byte and resume after it. */
            inptr++;
            insize--;
        } else if (errno == EINVAL) {
            /* Truncated multibyte sequence at the end of the input. */
            break;
        } else {
            status = (errno == E2BIG) ? -5 : -6;
            goto done;
        }
    }

    /* Emit any pending shift sequence required by stateful encodings. */
    if (iconv(cd, NULL, NULL, &outptr, &outsize) == (size_t)-1) {
        status = (errno == E2BIG) ? -5 : -6;
        goto done;
    }

    status = (int)(outptr - save);

done:
    /* outptr never passes save + savelen - 1, so this write is in bounds. */
    *outptr = '\0';
    iconv_close(cd);
    return status;
}



附:关于iconv的能力

It provides support for the encodings:

European languages
ASCII, ISO-8859-{1,2,3,4,5,7,9,10,13,14,15,16}, KOI8-R, KOI8-U, KOI8-RU,  CP{1250,1251,1252,1253,1254,1257}, CP{850,866},  Mac{Roman,CentralEurope,Iceland,Croatian,Romania},  Mac{Cyrillic,Ukraine,Greek,Turkish}, Macintosh
Semitic languages
ISO-8859-{6,8}, CP{1255,1256}, CP862, Mac{Hebrew,Arabic}
Japanese
EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP, ISO-2022-JP-2, ISO-2022-JP-1
Chinese
EUC-CN, HZ, GBK, CP936, GB18030, EUC-TW, BIG5, CP950, BIG5-HKSCS,  BIG5-HKSCS:2001, BIG5-HKSCS:1999, ISO-2022-CN, ISO-2022-CN-EXT
Korean
EUC-KR, CP949, ISO-2022-KR, JOHAB
Armenian
ARMSCII-8
Georgian
Georgian-Academy, Georgian-PS
Tajik
KOI8-T
Kazakh
PT154, RK1048
Thai
ISO-8859-11, TIS-620, CP874, MacThai
Laotian
MuleLao-1, CP1133
Vietnamese
VISCII, TCVN, CP1258
Platform specifics
HP-ROMAN8, NEXTSTEP
Full Unicode
UTF-8
UCS-2, UCS-2BE, UCS-2LE
UCS-4, UCS-4BE, UCS-4LE
UTF-16,  UTF-16BE, UTF-16LE
UTF-32, UTF-32BE, UTF-32LE
UTF-7
C99, JAVA
Full Unicode, in terms of uint16_t or uint32_t  (with machine dependent endianness and alignment)
UCS-2-INTERNAL, UCS-4-INTERNAL
Locale dependent, in terms of `char' or `wchar_t' (with machine dependent  endianness and alignment, and with OS and locale dependent semantics)
char, wchar_t
The empty encoding name "" is equivalent to "char": it  denotes the locale dependent character encoding.
When configured with  the option --enable-extra-encodings, it also provides support for a  few extra encodings:
European languages
CP{437,737,775,852,853,855,857,858,860,861,863,865,869,1125}
Semitic languages
CP864
Japanese
EUC-JISX0213, Shift_JISX0213, ISO-2022-JP-3
Chinese
BIG5-2003 (experimental)
Turkmen
TDS565
Platform specifics
ATARIST, RISCOS-LATIN1
It can convert from any of these encodings  to any other, through Unicode conversion.

It has also some limited support for transliteration, i.e. when a character  cannot be represented in the target character set, it can be approximated  through one or several similarly looking characters. Transliteration is  activated when "//TRANSLIT" is appended to the target encoding name.




[参考]iconv
http://www.gnu.org/software/libiconv/documentation/libiconv/iconv.1.html
http://www.gnu.org/software/libiconv/

[参考] iconv实现通用语言编码转换
http://www.yuanma.org/data/2008/0503/article_3025.htm

[参考]linux下字符集编码转换轻松实现
http://blog.csdn.net/hnhbdss/archive/2007/11/30/1909456.aspx

你可能感兴趣的:(iconv 文件编码转换)