C#将汉字转换为拼音首字母

关于这个话题, 我以前曾经长期使用过一个简便的算法, 代码如下:
        /// <summary>
        /// Per the surrounding article, maps a single Chinese character to the
        /// lowercase initial of its pinyin by comparing <paramref name="str"/>
        /// against an ascending series of boundary strings with
        /// <c>String.CompareTo</c>.
        /// </summary>
        /// <param name="str">The text to classify; the article states it is meant to be one character.</param>
        /// <returns>
        /// As written here, always <paramref name="str"/> itself: every comparison
        /// uses the same literal, so only the first or the final return is reachable.
        /// </returns>
        // NOTE(review): all CompareTo arguments below are the identical literal " ".
        // Presumably each one was originally a distinct boundary Chinese character
        // that was lost when the article was extracted - TODO restore from the
        // original post before using this method.
        // NOTE(review): the returned letters are padded with spaces (" a "); this
        // looks like the same extraction artifact - confirm against the original.
        // NOTE(review): the article reports different results on .NET 3.5 vs 4.0,
        // which suggests the comparison depends on runtime/culture collation rules.
        private  string ToPinyinSingle( string str)
        {
             // Anything sorting before the first boundary is returned unchanged.
             if (str.CompareTo( " ") <  0)
                 return str;
             // Each following check returns the letter for the first boundary that
             // str sorts below; no branch returns i, u or v.
             if (str.CompareTo( " ") <  0)
                 return  " a ";
             if (str.CompareTo( " ") <  0)
                 return  " b ";
             if (str.CompareTo( " ") <  0)
                 return  " c ";
             if (str.CompareTo( " ") <  0)
                 return  " d ";
             if (str.CompareTo( " ") <  0)
                 return  " e ";
             if (str.CompareTo( " ") <  0)
                 return  " f ";
             if (str.CompareTo( " ") <  0)
                 return  " g ";
             if (str.CompareTo( " ") <  0)
                 return  " h ";
             if (str.CompareTo( " ") <  0)
                 return  " j ";
             if (str.CompareTo( " ") <  0)
                 return  " k ";
             if (str.CompareTo( " ") <  0)
                 return  " l ";
             if (str.CompareTo( " ") <  0)
                 return  " m ";
             if (str.CompareTo( " ") <  0)
                 return  " n ";
             if (str.CompareTo( " ") <  0)
                 return  " o ";
             if (str.CompareTo( " ") <  0)
                 return  " p ";
             if (str.CompareTo( " ") <  0)
                 return  " q ";
             if (str.CompareTo( " ") <  0)
                 return  " r ";
             if (str.CompareTo( " ") <  0)
                 return  " s ";
             if (str.CompareTo( " ") <  0)
                 return  " t ";
             if (str.CompareTo( " ") <  0)
                 return  " w ";
             if (str.CompareTo( " ") <  0)
                 return  " x ";
             if (str.CompareTo( " ") <  0)
                 return  " y ";
             if (str.CompareTo( " ") <  0)
                 return  " z ";
             // Nothing matched: fall back to the input unchanged.
             return str;

        } 

这个函数只处理单个汉字, 简单地加个循环就可以让它处理文字串了.

在.net 3.5下, 它一直工作得很好, 虽然偶尔也有出错的时候, 但是概率极低, 基本上可以忽略不计.

然而后来我把项目升级到.net 4.0以后, 发现出错的几率直线上升, 已经高到了无法容忍的程度(例如, "梅" 会返回"L"). 简单查了一下, 没找到微软关于String.CompareTo函数有什么变化的说明, 束手无策, 于是换用另一个也很简单的算法(http://topic.csdn.net/u/20090219/12/61745e3a-a39e-4f4d-8985-67d124236694.html):

/// <summary>
/// Returns the upper-case pinyin initial of the first character of
/// <paramref name="cn"/> by looking up its GB2312 code in a table of
/// per-letter start codes.
/// </summary>
/// <param name="cn">Text whose first character is classified.</param>
/// <returns>
/// A one-letter string "A".."Z" when the two-byte code falls inside the table
/// (45217 inclusive to 55290 exclusive); "?" for any other multi-byte code,
/// which includes every GB2312 level-2 character; <paramref name="cn"/> itself
/// when it encodes to fewer than two bytes (e.g. ASCII or an empty string).
/// </returns>
/// <remarks>
/// The text is encoded explicitly as GB2312 instead of Encoding.Default, so the
/// result no longer depends on the machine's ANSI code page. On .NET Core/.NET 5+
/// the code-pages encoding provider must be registered before calling this.
/// </remarks>
public static string getSpell(string cn)
{
    byte[] arrCN = System.Text.Encoding.GetEncoding("GB2312").GetBytes(cn);
    if (arrCN.Length <= 1)
    {
        // Single-byte (or empty) input is not a Chinese character.
        return cn;
    }

    // Combine high and low byte into one comparable code, e.g. 0xB0A1 = 45217.
    int code = (arrCN[0] << 8) + arrCN[1];

    // Start code of each letter A..Z. Letters with no characters (I, U, V)
    // repeat the next letter's start so their range is empty.
    int[] areacode =
    {
        45217, 45253, 45761, 46318, 46826, 47010, 47297, 47614, 48119, 48119,
        49062, 49324, 49896, 50371, 50614, 50622, 50906, 51387, 51446, 52218,
        52698, 52698, 52698, 52980, 53689, 54481
    };

    for (int i = 0; i < 26; i++)
    {
        // Upper bound (exclusive) is the next letter's start, or 55290 for Z.
        int max = (i != 25) ? areacode[i + 1] : 55290;
        if (areacode[i] <= code && code < max)
        {
            return ((char)('A' + i)).ToString();
        }
    }

    return "?";
}

但是这个函数出错的概率也很高, 例如"闫""窦""圳" 等都无法识别, 追查了一下原因, 发现原来对GB2312编码来说, 存放规定是这样的:

01-09区为特殊符号。 

16-55区为一级汉字,按拼音排序。 
56-87区为二级汉字,按部首/笔画排序。
每个汉字及符号以两个字节来表示。第一个字节称为“高位字节”,第二个字节称为“低位字节”。
“高位字节”使用了0xA1-0xF7(把01-87区的区号加上0xA0),“低位字节”使用了0xA1-0xFE(把01-94加上0xA0)。
例如“啊”字在大多数程序中,会以0xB0A1储存。(与区位码对比:0xB0=0xA0+16,0xA1=0xA0+1)


上述几个字位置码都大于55290, 显然是二级汉字, 这个算法就处理不了了, 换言之, 这种写法只能用于处理一级汉字. 这当然是不可接受的. 

后来翻查良久, 终于找到一个用C++写的算法, 可以同时处理一级汉字和二级汉字(http://download.csdn.net/detail/ronjay/1955072), 我把它改写成了C#, 代码如下: 

         public  class ChineseToPinYin
        {
             #region " 全局变量 "

             private  static  string[] _regionChar =  new  string[ 32]
            {
                 " CJWGNSPGCGNESYPBTYYZDXYKYGTDJNNJQMBSGZSCYJSYYQPGKBZGYCYWJKGKLJSWKPJQHYTWDDZLSGMRYPYWWCCKZNKYDG ",
                 " TTNGJEYKKZYTCJNMCYLQLYPYQFQRPZSLWBTGKJFYXJWZLTBNCXJJJJZXDTTSQZYCDXXHGCKBPHFFSSWYBGMXLPBYLLLHLX ",
                 " SPZMYJHSOJNGHDZQYKLGJHSGQZHXQGKEZZWYSCSCJXYEYXADZPMDSSMZJZQJYZCDJZWQJBDZBXGZNZCPWHKXHQKMWFBPBY ",
                 " DTJZZKQHYLYGXFPTYJYYZPSZLFCHMQSHGMXXSXJJSDCSBBQBEFSJYHWWGZKPYLQBGLDLCCTNMAYDDKSSNGYCSGXLYZAYBN ",
                 " PTSDKDYLHGYMYLCXPYCJNDQJWXQXFYYFJLEJBZRXCCQWQQSBNKYMGPLBMJRQCFLNYMYQMSQTRBCJTHZTQFRXQHXMJJCJLX ",
                 " QGJMSHZKBSWYEMYLTXFSYDSGLYCJQXSJNQBSCTYHBFTDCYZDJWYGHQFRXWCKQKXEBPTLPXJZSRMEBWHJLBJSLYYSMDXLCL ",
                 " QKXLHXJRZJMFQHXHWYWSBHTRXXGLHQHFNMNYKLDYXZPWLGGTMTCFPAJJZYLJTYANJGBJPLQGDZYQYAXBKYSECJSZNSLYZH ",
                 " ZXLZCGHPXZHZNYTDSBCJKDLZAYFMYDLEBBGQYZKXGLDNDNYSKJSHDLYXBCGHXYPKDJMMZNGMMCLGWZSZXZJFZNMLZZTHCS ",
                 " YDBDLLSCDDNLKJYKJSYCJLKOHQASDKNHCSGANHDAASHTCPLCPQYBSDMPJLPCJOQLCDHJJYSPRCHNWJNLHLYYQYYWZPTCZG ",
                 " WWMZFFJQQQQYXACLBHKDJXDGMMYDJXZLLSYGXGKJRYWZWYCLZMSSJZLDBYDCFCXYHLXCHYZJQSFQAGMNYXPFRKSSBJLYXY ",
                 " SYGLNSCMHCWWMNZJJLXXHCHSYDSTTXRYCYXBYHCSMXJSZNPWGPXXTAYBGAJCXLYSDCCWZOCWKCCSBNHCPDYZNFCYYTYCKX ",
                 " KYBSQKKYTQQXFCWCHCYKELZQBSQYJQCCLMTHSYWHMKTLKJLYCXWHEQQHTQHZPQSQSCFYMMDMGBWHWLGSSLYSDLMLXPTHMJ ",
                 " HWLJZYHZJXHTXJLHXRSWLWZJCBXMHZQXSDZPMGFCSGLSXYMJSHXPJXWMYQKSMYPLRTHBXFTPMHYXLCHLHLZYLXGSSSSTCL ",
                 " SLDCLRPBHZHXYYFHBBGDMYCNQQWLQHJJZYWJZYEJJDHPBLQXTQKWHLCHQXAGTLXLJXMSLXHTZKZJECXJCJNMFBYCSFYWYB ",
                 " JZGNYSDZSQYRSLJPCLPWXSDWEJBJCBCNAYTWGMPAPCLYQPCLZXSBNMSGGFNZJJBZSFZYNDXHPLQKZCZWALSBCCJXJYZGWK ",
                 " YPSGXFZFCDKHJGXDLQFSGDSLQWZKXTMHSBGZMJZRGLYJBPMLMSXLZJQQHZYJCZYDJWBMJKLDDPMJEGXYHYLXHLQYQHKYCW ",
                 " CJMYYXNATJHYCCXZPCQLBZWWYTWBQCMLPMYRJCCCXFPZNZZLJPLXXYZTZLGDLDCKLYRZZGQTGJHHHJLJAXFGFJZSLCFDQZ ",
                 " LCLGJDJCSNCLLJPJQDCCLCJXMYZFTSXGCGSBRZXJQQCTZHGYQTJQQLZXJYLYLBCYAMCSTYLPDJBYREGKLZYZHLYSZQLZNW ",
                 " CZCLLWJQJJJKDGJZOLBBZPPGLGHTGZXYGHZMYCNQSYCYHBHGXKAMTXYXNBSKYZZGJZLQJDFCJXDYGJQJJPMGWGJJJPKQSB ",
                 " GBMMCJSSCLPQPDXCDYYKYFCJDDYYGYWRHJRTGZNYQLDKLJSZZGZQZJGDYKSHPZMTLCPWNJAFYZDJCNMWESCYGLBTZCGMSS ",
                 " LLYXQSXSBSJSBBSGGHFJLWPMZJNLYYWDQSHZXTYYWHMCYHYWDBXBTLMSYYYFSXJCSDXXLHJHFSSXZQHFZMZCZTQCXZXRTT ",
                 " DJHNNYZQQMNQDMMGYYDXMJGDHCDYZBFFALLZTDLTFXMXQZDNGWQDBDCZJDXBZGSQQDDJCMBKZFFXMKDMDSYYSZCMLJDSYN ",
                 " SPRSKMKMPCKLGDBQTFZSWTFGGLYPLLJZHGJJGYPZLTCSMCNBTJBQFKTHBYZGKPBBYMTTSSXTBNPDKLEYCJNYCDYKZDDHQH ",
                 " SDZSCTARLLTKZLGECLLKJLQJAQNBDKKGHPJTZQKSECSHALQFMMGJNLYJBBTMLYZXDCJPLDLPCQDHZYCBZSCZBZMSLJFLKR ",
                 " ZJSNFRGJHXPDHYJYBZGDLQCSEZGXLBLGYXTWMABCHECMWYJYZLLJJYHLGBDJLSLYGKDZPZXJYYZLWCXSZFGWYYDLYHCLJS ",
                 " CMBJHBLYZLYCBLYDPDQYSXQZBYTDKYXJYYCNRJMPDJGKLCLJBCTBJDDBBLBLCZQRPPXJCGLZCSHLTOLJNMDDDLNGKAQHQH ",
                 " JGYKHEZNMSHRPHQQJCHGMFPRXHJGDYCHGHLYRZQLCYQJNZSQTKQJYMSZSWLCFQQQXYFGGYPTQWLMCRNFKKFSYYLQBMQAMM ",
                 " MYXCTPSHCPTXXZZSMPHPSHMCLMLDQFYQXSZYJDJJZZHQPDSZGLSTJBCKBXYQZJSGPSXQZQZRQTBDKYXZKHHGFLBCSMDLDG ",
                 " DZDBLZYYCXNNCSYBZBFGLZZXSWMSCCMQNJQSBDQSJTXXMBLTXZCLZSHZCXRQJGJYLXZFJPHYMZQQYDFQJJLZZNZJCDGZYG ",
                 " CTXMZYSCTLKPHTXHTLBJXJLXSCDQXCBBTJFQZFSLTJBTKQBXXJJLJCHCZDBZJDCZJDCPRNPQCJPFCZLCLZXZDMXMPHJSGZ ",
                 " GSZZQJYLWTJPFSYASMCJBTZKYCWMYTCSJJLJCQLWZMALBXYFBPNLSFHTGJWEJJXXGLLJSTGSHJQLZFKCGNNDSZFDEQFHBS ",
                 " AQTGLLBXMMYGSZLDYDQMJJRGBJTKGDHGKBLQKBDMBYLXWCXYTTYBKMRTJZXQJBHLMHMJJZMQASLDCYXYQDLQCAFYWYXQHZ "
            };
             private  static System.Text.Encoding _encoding = System.Text.Encoding.GetEncoding( " GB2312 ");

             #endregion

             private  static  bool In( int lp,  int hp,  int value)
            {
                 return ((value <= hp) && (value >= lp));
            }
             public  static  char GetFirstChar( string chineseChar)
            {
                 var bytes = _encoding.GetBytes(chineseChar);
                 if (bytes.Length !=  2)
                     return chineseChar[ 0];
                 return GetChar(bytes[ 0], bytes[ 1], chineseChar);
            }
             private  static  char GetChar( byte c1,  byte c2,  string originChar)
            {
                 var Hi = c1 <<  8;
                 var Lo = c2;
                 int n = Hi + Lo;
                 if (n <=  0xD7F9)
                {
                     if (In( 0xB0A10xB0C4, n))  return  ' A ';
                     if (In( 0XB0C50XB2C0, n))  return  ' B ';
                     if (In( 0xB2C10xB4ED, n))  return  ' C ';
                     if (In( 0xB4EE0xB6E9, n))  return  ' D ';
                     if (In( 0xB6EA0xB7A1, n))  return  ' E ';
                     if (In( 0xB7A20xB8C0, n))  return  ' F ';
                     if (In( 0xB8C10xB9FD, n))  return  ' G ';
                     if (In( 0xB9FE0xBBF6, n))  return  ' H ';
                     if (In( 0xBBF70xBFA5, n))  return  ' J ';
                     if (In( 0xBFA60xC0AB, n))  return  ' K ';
                     if (In( 0xC0AC0xC2E7, n))  return  ' L ';
                     if (In( 0xC2E80xC4C2, n))  return  ' M ';
                     if (In( 0xC4C30xC5B5, n))  return  ' N ';
                     if (In( 0xC5B60xC5BD, n))  return  ' O ';
                     if (In( 0xC5BE0xC6D9, n))  return  ' P ';
                     if (In( 0xC6D10xC8BA, n))  return  ' Q ';
                     if (In( 0xC8BB0xC8F5, n))  return  ' R ';
                     if (In( 0xC8F60xCBF9, n))  return  ' S ';
                     if (In( 0xCBFA0xCDD9, n))  return  ' T ';
                     if (In( 0xCDDA0xCEF3, n))  return  ' W ';
                     if (In( 0xCEF40xD1B8, n))  return  ' X ';
                     if (In( 0xD1B90xD4D0, n))  return  ' Y ';
                     if (In( 0xD4D10xD7F9, n))  return  ' Z ';
                     return originChar[ 0];
                }
                 else
                {
                     var b1 = (c1 &  0x7F) -  0x20 -  56;
                     var b2 = (c2 &  0x7F) -  0x20 -  1;
                     if (b1 >=  0 && b1 <=  31 && b2 >=  0 && b2 <=  93)
                    {
                         return _regionChar[b1][b2];
                    }
                     return originChar[ 0];
                }
            }

        } 

 

 这个算法目前还没有发现哪个汉字会出错. 

 

 

 

 

转载于:https://www.cnblogs.com/Moosdau/archive/2011/12/06/2277727.html

你可能感兴趣的:(C#将汉字转换为拼音首字母)