最近打算做一个电话簿WEB项目,需要按字符串的首字符(汉字取其拼音的首字母)进行查询。由于Web页面本身并不具备把汉字编码转换为拼音的能力,因此需要求助于平台开发工具。
Google上搜到一种实现方法,是用C++实现的。以下是源码:
// Collects the pinyin initials of the Chinese characters in strName.
//
// strName is scanned as double-byte (GB2312) text, one byte per element.
// NOTE(review): this presumably requires an MBCS (non-Unicode) build - confirm.
//
// Parameters:
//   strName        - text to scan.
//   strFirstLetter - receives the result; it is emptied first. One upper-case
//                    letter is appended for every byte pair whose
//                    section-position code is known to FirstLetter().
//
// Bytes below 0x80 are skipped, so ASCII text contributes nothing to the
// result. Byte pairs that FirstLetter() does not recognise contribute
// nothing either.
void GetFirstLetter(CString strName, CString& strFirstLetter)
{
    strFirstLetter.Empty();

    const int nLength = strName.GetLength();
    for (int i = 0; i < nLength; i++)
    {
        const TBYTE ucHigh = (TBYTE)strName[i];
        if (ucHigh < 0x80)
            continue;

        // A high byte in the last position has no trail byte; stop instead of
        // indexing one element past the end of the string.
        if (i + 1 >= nLength)
            break;

        const TBYTE ucLow = (TBYTE)strName[i + 1];
        if (ucHigh < 0xa1 || ucLow < 0xa1)
            continue;

        // Treat the code by section-position as an int: section * 100 + position.
        const int nCode = (ucHigh - 0xa0) * 100 + ucLow - 0xa0;

        // Use a fresh string for every character: FirstLetter() leaves its
        // output untouched for codes outside its ranges, and reusing one
        // variable would append the previous character's letter again.
        CString strRet;
        FirstLetter(nCode, strRet);
        strFirstLetter += strRet;

        i++;    // the trail byte has been consumed
    }
}
void FirstLetter(int nCode, CString& strLetter)
{
if(nCode >= 1601 && nCode < 1637) strLetter = _T("A");
if(nCode >= 1637 && nCode < 1833) strLetter = _T("B");
if(nCode >= 1833 && nCode < 2078) strLetter = _T("C");
if(nCode >= 2078 && nCode < 2274) strLetter = _T("D");
if(nCode >= 2274 && nCode < 2302) strLetter = _T("E");
if(nCode >= 2302 && nCode < 2433) strLetter = _T("F");
if(nCode >= 2433 && nCode < 2594) strLetter = _T("G");
if(nCode >= 2594 && nCode < 2787) strLetter = _T("H");
if(nCode >= 2787 && nCode < 3106) strLetter = _T("J");
if(nCode >= 3106 && nCode < 3212) strLetter = _T("K");
if(nCode >= 3212 && nCode < 3472) strLetter = _T("L");
if(nCode >= 3472 && nCode < 3635) strLetter = _T("M");
if(nCode >= 3635 && nCode < 3722) strLetter = _T("N");
if(nCode >= 3722 && nCode < 3730) strLetter = _T("O");
if(nCode >= 3730 && nCode < 3858) strLetter = _T("P");
if(nCode >= 3858 && nCode < 4027) strLetter = _T("Q");
if(nCode >= 4027 && nCode < 4086) strLetter = _T("R");
if(nCode >= 4086 && nCode < 4390) strLetter = _T("S");
if(nCode >= 4390 && nCode < 4558) strLetter = _T("T");
if(nCode >= 4558 && nCode < 4684) strLetter = _T("W");
if(nCode >= 4684 && nCode < 4925) strLetter = _T("X");
if(nCode >= 4925 && nCode < 5249) strLetter = _T("Y");
if(nCode >= 5249 && nCode < 5590) strLetter = _T("Z");
}
How to use:
// Usage example: extract the pinyin initials of a name.
CString strRes;
CString strName(_T("A李小三"));
GetFirstLetter(strName, strRes);
// strRes is now "LXS": the leading ASCII "A" is skipped and the three
// Chinese characters yield L, X and S.
代码很快就应用到了项目中,但随即发现,上述方法竟然不能识别一部分汉字。究其原因,上述代码依据的是“汉字编码按拼音排序”这一规律,而这一规律只对GB2312的一级汉字(区位码1601~5589)成立;GB2312的二级汉字并不按拼音排序,落在FirstLetter的各个区间之外,因此无法识别。对于大字符集的GBK编码,上述方法就更无能为力了。
再次搜寻和尝试了好久,有一种解决办法似乎可行。该方法把所有的汉字列在一个大数组中,然后每次都是通过在数组中循环比较的方法得出所在的拼音字母。其效率实在太低了。
难道就真的没有办法了吗?终于在论坛里找到一种完美的解决方案,是在C++Builder中实现的。其代码很简洁,但实现的原理却让人难以理解。我修改了一下,在VC 6.0中调试通过。
// Returns a copy of strName in which every GB2312 Chinese character is
// replaced by the upper-case first letter of its pinyin.
//
// strName is scanned as double-byte (GB2312/GBK) text, one byte per element.
// NOTE(review): this presumably requires an MBCS (non-Unicode) build - confirm.
//
//  - Characters with section-position codes 1601..5589 are resolved through
//    the code ranges in li_SecPosValue.
//  - Characters in sections 56..87 are resolved through ls_SecondSecTable,
//    which holds one letter per character. A '[' entry in that table is
//    emitted as-is (presumably a placeholder for "no letter known").
//  - Every other byte or byte pair is copied to the result unchanged.
//
// A null strName yields an empty string.
CString CWebEventsApp::GetFirstLetter(LPCTSTR strName)
{
    // Inclusive lower bound of the section-position code range of each letter
    // in lc_FirstLetter.
    static const int li_SecPosValue[] = {
        1601, 1637, 1833, 2078, 2274, 2302, 2433, 2594, 2787, 3106, 3212,
        3472, 3635, 3722, 3730, 3858, 4027, 4086, 4390, 4558, 4684, 4925, 5249
    };
    static const char* const lc_FirstLetter[] = {
        "A", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N", "O",
        "P", "Q", "R", "S", "T", "W", "X", "Y", "Z"
    };
    // One letter per character of sections 56..87, indexed by
    // (section - 56) * 94 + (position - 1).
    static const char* const ls_SecondSecTable =
        "CJWGNSPGCGNE[Y[BTYYZDXYKYGT[JNNJQMBSGZSCYJSYY[PGKBZGY[YWJKGKLJYWKPJQHY[W[DZLSGMRYPYWWCCKZNKYYGTTNJJNYKKZYTCJNMCYLQLYPYQFQRPZSLWBTGKJFYXJWZLTBNCXJJJJTXDTTSQZYCDXXHGCK[PHFFSS[YBGXLPPBYLL[HLXS[ZM[JHSOJNGHDZQYKLGJHSGQZHXQGKEZZWYSCSCJXYEYXADZPMDSSMZJZQJYZC[J[WQJBYZPXGZNZCPWHKXHQKMWFBPBYDTJZZKQHY"
        "LYGXFPTYJYYZPSZLFCHMQSHGMXXSXJ[[DCSBBQBEFSJYHXWGZKPYLQBGLDLCCTNMAYDDKSSNGYCSGXLYZAYBNPTSDKDYLHGYMYLCXPY[JNDQJWXQXFYYFJLEJPZRXCCQWQQSBNKYMGPLBMJRQCFLNYMYQMSQYRBCJTHZTQFRXQHXMJJCJLXQGJMSHZKBSWYEMYLTXFSYDSWLYCJQXSJNQBSCTYHBFTDCYZDJWYGHQFRXWCKQKXEBPTLPXJZSRMEBWHJLBJSLYYSMDXLCLQKXLHXJRZJMFQHXHWY"
        "WSBHTRXXGLHQHFNM[YKLDYXZPYLGG[MTCFPAJJZYLJTYANJGBJPLQGDZYQYAXBKYSECJSZNSLYZHSXLZCGHPXZHZNYTDSBCJKDLZAYFMYDLEBBGQYZKXGLDNDNYSKJSHDLYXBCGHXYPKDJMMZNGMMCLGWZSZXZJFZNMLZZTHCSYDBDLLSCDDNLKJYKJSYCJLKWHQASDKNHCSGANHDAASHTCPLCPQYBSDMPJLPZJOQLCDHJJYSPRCHN[NNLHLYYQYHWZPTCZGWWMZFFJQQQQYXACLBHKDJXDGMMY"
        "DJXZLLSYGXGKJRYWZWYCLZMSSJZLDBYD[FCXYHLXCHYZJQ[[QAGMNYXPFRKSSBJLYXYSYGLNSCMHZWWMNZJJLXXHCHSY[[TTXRYCYXBYHCSMXJSZNPWGPXXTAYBGAJCXLY[DCCWZOCWKCCSBNHCPDYZNFCYYTYCKXKYBSQKKYTQQXFCWCHCYKELZQBSQYJQCCLMTHSYWHMKTLKJLYCXWHEQQHTQH[PQ[QSCFYMNDMGBWHWLGSLLYSDLMLXPTHMJHWLJZYHZJXHTXJLHXRSWLWZJCBXMHZQXSDZP"
        "MGFCSGLSXYMJSHXPJXWMYQKSMYPLRTHBXFTPMHYXLCHLHLZYLXGSSSSTCLSLDCLRPBHZHXYYFHB[GDMYCNQQWLQHJJ[YWJZYEJJDHPBLQXTQKWHLCHQXAGTLXLJXMSL[HTZKZJECXJCJNMFBY[SFYWYBJZGNYSDZSQYRSLJPCLPWXSDWEJBJCBCNAYTWGMPAPCLYQPCLZXSBNMSGGFNZJJBZSFZYNDXHPLQKZCZWALSBCCJX[YZGWKYPSGXFZFCDKHJGXDLQFSGDSLQWZKXTMHSBGZMJZRGLYJB"
        "PMLMSXLZJQQHZYJCZYDJWBMYKLDDPMJEGXYHYLXHLQYQHKYCWCJMYYXNATJHYCCXZPCQLBZWWYTWBQCMLPMYRJCCCXFPZNZZLJPLXXYZTZLGDLDCKLYRZZGQTGJHHGJLJAXFGFJZSLCFDQZLCLGJDJCSNZLLJPJQDCCLCJXMYZFTSXGCGSBRZXJQQCTZHGYQTJQQLZXJYLYLBCYAMCSTYLPDJBYREGKLZYZHLYSZQLZNWCZCLLWJQJJJKDGJZOLBBZPPGLGHTGZXYGHZMYCNQSYCYHBHGXKAMTX"
        "YXNBSKYZZGJZLQJDFCJXDYGJQJJPMGWGJJJPKQSBGBMMCJSSCLPQPDXCDYYKY[CJDDYYGYWRHJRTGZNYQLDKLJSZZGZQZJGDYKSHPZMTLCPWNJAFYZDJCNMWESCYGLBTZCGMSSLLYXQSXSBSJSBBSGGHFJLYPMZJNLYYWDQSHZXTYYWHMZYHYWDBXBTLMSYYYFSXJC[DXXLHJHF[SXZQHFZMZCZTQCXZXRTTDJHNNYZQQMNQDMMG[YDXMJGDHCDYZBFFALLZTDLTFXMXQZDNGWQDBDCZJDXBZGS"
        "QQDDJCMBKZFFXMKDMDSYYSZCMLJDSYNSBRSKMKMPCKLGDBQTFZSWTFGGLYPLLJZHGJ[GYPZLTCSMCNBTJBQFKTHBYZGKPBBYMTDSSXTBNPDKLEYCJNYDDYKZDDHQHSDZSCTARLLTKZLGECLLKJLQJAQNBDKKGHPJTZQKSECSHALQFMMGJNLYJBBTMLYZXDCJPLDLPCQDHZYCBZSCZBZMSLJFLKRZJSNFRGJHXPDHYJYBZGDLQCSEZGXLBLGYXTWMABCHECMWYJYZLLJJYHLG[DJLSLYGKDZPZXJ"
        "YYZLWCXSZFGWYYDLYHCLJSCMBJHBLYZLYCBLYDPDQYSXQZBYTDKYXJY[CNRJMPDJGKLCLJBCTBJDDBBLBLCZQRPPXJCJLZCSHLTOLJNMDDDLNGKAQHQHJGYKHEZNMSHRP[QQJCHGMFPRXHJGDYCHGHLYRZQLCYQJNZSQTKQJYMSZSWLCFQQQXYFGGYPTQWLMCRNFKKFSYYLQBMQAMMMYXCTPSHCPTXXZZSMPHPSHMCLMLDQFYQXSZYYDYJZZHQPDSZGLSTJBCKBXYQZJSGPSXQZQZRQTBDKYXZK"
        "HHGFLBCSMDLDGDZDBLZYYCXNNCSYBZBFGLZZXSWMSCCMQNJQSBDQSJTXXMBLTXZCLZSHZCXRQJGJYLXZFJPHYMZQQYDFQJJLZZNZJCDGZYGCTXMZYSCTLKPHTXHTLBJXJLXSCDQXCBBTJFQZFSLTJBTKQBXXJJLJCHCZDBZJDCZJDCPRNPQCJPFCZLCLZXZDMXMPHJSGZGSZZQLYLWTJPFSYASMCJBTZKYCWMYTCSJJLJCQLWZMALBXYFBPNLSFHTGJWEJJXXGLLJSTGSHJQLZFKCGNNNSZFDEQ"
        "FHBSAQTGYLBXMMYGSZLDYDQMJJRGBJTKGDHGKBLQKBDMBYLXWCXYTTYBKMRTJZXQJBHLMHMJJZMQASLDCYXYQDLQCAFYWYXQHZ";

    CString result;
    if (!strName)
        return result;

    const UINT stringlen = _tcslen(strName);
    for (UINT i = 0; i < stringlen; i++) {
        const int H = (UCHAR) strName[i];
        // i + 1 is at most stringlen, where the terminating NUL is stored,
        // so this read is always inside the buffer.
        const int L = (UCHAR) strName[i + 1];

        if (H < 0xA1 || L < 0xA1) {
            // Not a character in the Chinese-character area: copy it through.
            result += strName[i];
            // When the two bytes form a double-byte character outside that
            // area, copy the trail byte together with its lead byte.
            // Otherwise the trail byte could be paired with the following
            // character's lead byte and be misdecoded.
            if (H >= 0x81 && L >= 0x40) {
                result += strName[i + 1];
                i++;
            }
            continue;
        }

        // Section-position code: section * 100 + position.
        int W = (H - 160) * 100 + L - 160;
        if (W > 1600 && W < 5590) {
            // Find the last range whose lower bound does not exceed W; one
            // always matches because W >= li_SecPosValue[0] here.
            for (int j = 22; j >= 0; j--) {
                if (W >= li_SecPosValue[j]) {
                    result += lc_FirstLetter[j];
                    break;
                }
            }
            i++;    // the trail byte has been consumed
            continue;
        }

        i++;        // the trail byte is consumed on both paths below
        W = (H - 160 - 56) * 94 + L - 161;
        if (W >= 0 && W <= 3007) {
            result += ls_SecondSecTable[W];
        } else {
            // Outside both tables (e.g. the symbol sections): keep the
            // original two bytes.
            result += (char) H;
            result += (char) L;
        }
    }
    return result;
}
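上述代码的特殊之处在于,它使用了一个二级汉字的拼音首字母对照表(ls_SecondSecTable)。该表并不是真正意义上的哈希表,而是一张直接按下标查找的表:下标由 (区码-56)*94+(位码-1) 算出,取值范围为0~3007,每个下标对应一个汉字的拼音首字母。该表可能综合了大字符集中汉字的排布规律,具体原理还待进一步研究。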