GB2312中文转拼音

#-*- coding: utf-8 -*-

from bisect import bisect


# Initial letters that a pinyin syllable in the tables can start with, in
# table order (no "i", "u" or "v").
FIRST_LETTERS = list("abcdefghjklmnopqrstwxyz")
# FIRST_NUMBERS[i] is the lowest character code filed under FIRST_LETTERS[i];
# gb2312_pinyin() bisects this list, so it must stay sorted and parallel to
# FIRST_LETTERS.
FIRST_NUMBERS = [
    1, 37, 233, 478, 674, 702, 833, 994,
    1187, 1506, 1612, 1872, 2035, 2122, 2130, 2258,
    2427, 2486, 2790, 2958, 3084, 3325, 3649,
]
# Pinyin syllables grouped by initial letter: row i holds the syllables that
# start with FIRST_LETTERS[i]. Entry j of a row is selected by bisecting row i
# of PINYIN_NUMBERS, so the two tables must stay parallel (same row count,
# same length per row).
PINYIN_LETTERS = [
    ['a', 'ai', 'an', 'ang', 'ao'],
    ['ba', 'bai', 'ban', 'bang', 'bao', 'bei', 'ben', 'beng', 'bi', 'bian', 'biao', 'bie', 'bin', 'bing', 'bo', 'bu'],
    ['ca', 'cai', 'can', 'cang', 'cao', 'ce', 'ceng', 'cha', 'chai', 'chan', 'chang', 'chao', 'che', 'chen', 'cheng', 'chi', 'chong', 'chou', 'chu', 'chuan', 'chuang', 'chui', 'chun', 'chuo', 'ci', 'cong', 'cou', 'cu', 'cuan', 'cui', 'cun', 'cuo'],
    ['da', 'dai', 'dan', 'dang', 'dao', 'de', 'deng', 'di', 'dian', 'diao', 'die', 'ding', 'diu', 'dong', 'dou', 'du', 'duan', 'dui', 'dun', 'duo'],
    ['e', 'en', 'er'],
    ['fa', 'fan', 'fang', 'fei', 'fen', 'feng', 'fo', 'fou', 'fu'],
    ['ga', 'gai', 'gan', 'gang', 'gao', 'ge', 'gei', 'gen', 'geng', 'gong', 'gou', 'gu', 'gua', 'guai', 'guan', 'guang', 'gui', 'gun', 'guo'],
    ['ha', 'hai', 'han', 'hang', 'hao', 'he', 'hei', 'hen', 'heng', 'hong', 'hou', 'hu', 'hua', 'huai', 'huan', 'huang', 'hui', 'hun', 'huo'],
    ['ji', 'jia', 'jian', 'jiang', 'jiao', 'jie', 'jin', 'jing', 'jiong', 'jiu', 'ju', 'juan', 'jue', 'jun'],
    ['ka', 'kai', 'kan', 'kang', 'kao', 'ke', 'ken', 'keng', 'kong', 'kou', 'ku', 'kua', 'kuai', 'kuan', 'kuang', 'kui', 'kun', 'kuo'],
    # NOTE(review): 'lv', 'lu' and 'lue' each appear twice in this row; the
    # repeated 'lue', 'lv', 'lu' line up with the single codes 1842, 1843 and
    # 1844 in PINYIN_NUMBERS. Presumably per-character overrides -- verify
    # them against a GB2312 code chart.
    ['la', 'lai', 'lan', 'lang', 'lao', 'le', 'lei', 'leng', 'li', 'lia', 'lian', 'liang', 'liao', 'lie', 'lin', 'ling', 'liu', 'long', 'lou', 'lu', 'lv', 'lue', 'lv', 'lu', 'luan', 'lue', 'lun', 'luo'],
    ['ma', 'mai', 'man', 'mang', 'mao', 'me', 'mei', 'men', 'meng', 'mi', 'mian', 'miao', 'mie', 'min', 'ming', 'miu', 'mo', 'mou', 'mu'],
    ['na', 'nai', 'nan', 'nang', 'nao', 'ne', 'nei', 'nen', 'neng', 'ni', 'nian', 'niang', 'niao', 'nie', 'nin', 'ning', 'niu', 'nong', 'nu', 'nv', 'nuan', 'nue', 'nuo'],
    ['o', 'ou'],
    ['pa', 'pai', 'pan', 'pang', 'pao', 'pei', 'pen', 'peng', 'pi', 'pian', 'piao', 'pie', 'pin', 'ping', 'po', 'pou', 'pu'],
    ['qi', 'qia', 'qian', 'qiang', 'qiao', 'qie', 'qin', 'qing', 'qiong', 'qiu', 'qu', 'quan', 'que', 'qun'],
    ['ran', 'rang', 'rao', 're', 'ren', 'reng', 'ri', 'rong', 'rou', 'ru', 'ruan', 'rui', 'run', 'ruo'],
    ['sa', 'sai', 'san', 'sang', 'sao', 'se', 'sen', 'seng', 'sha', 'shai', 'shan', 'shang', 'shao', 'she', 'shen', 'sheng', 'shi', 'shou', 'shu', 'shua', 'shuai', 'shuan', 'shuang', 'shui', 'shun', 'shuo', 'si', 'song', 'sou', 'su', 'suan', 'sui', 'sun', 'suo'],
    ['ta', 'tai', 'tan', 'tang', 'tao', 'te', 'teng', 'ti', 'tian', 'tiao', 'tie', 'ting', 'tong', 'tou', 'tu', 'tuan', 'tui', 'tun', 'tuo'],
    ['wa', 'wai', 'wan', 'wang', 'wei', 'wen', 'weng', 'wo', 'wu'],
    ['xi', 'xia', 'xian', 'xiang', 'xiao', 'xie', 'xin', 'xing', 'xiong', 'xiu', 'xu', 'xuan', 'xue', 'xun'],
    ['ya', 'yan', 'yang', 'yao', 'ye', 'yi', 'yin', 'ying', 'yo', 'yong', 'you', 'yu', 'yuan', 'yue', 'yun'],
    ['za', 'zai', 'zan', 'zang', 'zao', 'ze', 'zeng', 'zha', 'zhai', 'zhan', 'zhang', 'zhao', 'zhe', 'zhen', 'zheng', 'zhi', 'zhong', 'zhou', 'zhu', 'zhua', 'zhuai', 'zhuan', 'zhuang', 'zhui', 'zhun', 'zhuo', 'zi', 'zong', 'zou', 'zu', 'zuan', 'zui', 'zun', 'zuo']
]
# Lower boundaries for each syllable: PINYIN_NUMBERS[i][j] is the lowest
# character code that maps to PINYIN_LETTERS[i][j]. Each row is bisected, so
# it must be sorted, and its first entry must equal FIRST_NUMBERS[i];
# otherwise a code below the first entry bisects to 0 and the lookup wraps
# around to the row's last syllable via index -1.
PINYIN_NUMBERS = [
    # Row 'a' starts at code 1, not 2: with 2, code 1 resolved to 'ao'.
    [1, 3, 16, 25, 28],
    [37, 55, 63, 78, 90, 113, 128, 132, 138, 162, 174, 178, 182, 188, 203, 222],
    [233, 234, 245, 252, 257, 262, 267, 269, 280, 283, 293, 312, 321, 327, 337, 352, 368, 373, 385, 408, 415, 421, 426, 433, 435, 447, 453, 454, 458, 461, 469, 472],
    [478, 484, 502, 517, 522, 534, 537, 544, 563, 579, 588, 601, 610, 611, 621, 629, 643, 649, 653, 662],
    [674, 687, 688],
    [702, 710, 727, 738, 750, 765, 780, 781, 782],
    [833, 835, 841, 852, 861, 871, 888, 889, 891, 904, 919, 928, 946, 952, 955, 966, 969, 985, 988],
    [994, 1001, 1008, 1027, 1030, 1039, 1057, 1059, 1063, 1068, 1077, 1084, 1108, 1117, 1122, 1136, 1150, 1171, 1177],
    [1187, 1246, 1263, 1309, 1322, 1350, 1377, 1403, 1428, 1430, 1447, 1472, 1479, 1489],
    [1506, 1510, 1515, 1521, 1528, 1532, 1547, 1551, 1553, 1557, 1561, 1568, 1573, 1577, 1579, 1587, 1604, 1608],
    [1612, 1619, 1622, 1637, 1644, 1653, 1655, 1666, 1669, 1709, 1710, 1724, 1735, 1748, 1753, 1765, 1779, 1790, 1805, 1811, 1831, 1842, 1843, 1844, 1845, 1851, 1853, 1860],
    [1872, 1881, 1887, 1902, 1908, 1920, 1921, 1937, 1940, 1948, 1962, 1971, 1979, 1981, 1987, 1993, 1994, 2017, 2020],
    [2035, 2042, 2047, 2050, 2051, 2056, 2057, 2059, 2060, 2061, 2072, 2080, 2081, 2083, 2090, 2091, 2103, 2107, 2111, 2114, 2115, 2116, 2119],
    [2122, 2123],
    [2130, 2136, 2142, 2150, 2155, 2162, 2171, 2173, 2187, 2210, 2214, 2218, 2220, 2225, 2234, 2243, 2244],
    [2258, 2294, 2303, 2325, 2333, 2348, 2353, 2364, 2377, 2379, 2387, 2407, 2417, 2425],
    [2427, 2431, 2436, 2439, 2441, 2451, 2453, 2454, 2464, 2467, 2477, 2479, 2482, 2484],
    [2486, 2489, 2493, 2503, 2506, 2510, 2513, 2514, 2515, 2524, 2526, 2542, 2550, 2561, 2573, 2589, 2606, 2653, 2663, 2702, 2704, 2708, 2710, 2713, 2717, 2721, 2725, 2741, 2749, 2753, 2765, 2768, 2779, 2782],
    [2790, 2805, 2814, 2832, 2845, 2856, 2857, 2861, 2876, 2884, 2889, 2892, 2908, 2921, 2925, 2936, 2938, 2944, 2947],
    [2958, 2965, 2967, 2984, 2994, 3033, 3043, 3046, 3055],
    [3084, 3125, 3138, 3164, 3184, 3208, 3229, 3239, 3254, 3261, 3270, 3289, 3305, 3311],
    [3325, 3341, 3374, 3391, 3412, 3427, 3480, 3502, 3520, 3521, 3536, 3556, 3607, 3627, 3637],
    [3649, 3652, 3659, 3663, 3666, 3680, 3686, 3690, 3710, 3716, 3733, 3748, 3758, 3768, 3784, 3805, 3848, 3859, 3873, 3905, 3907, 3908, 3914, 3921, 3927, 3929, 3940, 3955, 3962, 3966, 3974, 3976, 3980, 3982]
]


def gb2312_pinyin(unichar, first_letter=False):
    """Return the pinyin of one GB2312 level-1 Chinese character.

    The character is encoded as GBK, its two bytes are converted to a GB2312
    row/cell pair, and the resulting code is looked up in the module-level
    boundary tables with ``bisect``.

    Args:
        unichar: A unicode string, expected to hold a single character. Only
            its first two GBK bytes are examined.
        first_letter: When true, return only the initial letter of the
            syllable instead of the full pinyin.

    Returns:
        The pinyin syllable (or its initial letter); the half-width ASCII
        character when the input is a full-width character from GB2312
        row 3; or "" when the input is not a level-1 GB2312 hanzi. That
        includes characters GBK cannot encode, and input shorter than two
        GBK bytes.

    Raises:
        AssertionError: If ``unichar`` is not a unicode string.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    assert isinstance(unichar, type(u""))
    try:
        # bytearray yields integers on both Python 2 and Python 3.
        gbk_bytes = bytearray(unichar.encode("GBK"))
    except UnicodeEncodeError:
        return ""  # not representable in GBK, so certainly not in GB2312
    if len(gbk_bytes) < 2:
        return ""  # empty input or a single-byte (ASCII) character

    high_code = gbk_bytes[0] - 160  # GB2312 row number
    low_code = gbk_bytes[1] - 160   # GB2312 cell number
    if not 1 <= low_code <= 94:
        # A trail byte outside 0xA1-0xFE marks a GBK-only extension
        # character. Without this guard its non-positive cell number would
        # alias into the code range of an unrelated hanzi.
        return ""

    char_code = (high_code - 16) * 100 + low_code
    if -1299 <= char_code <= -1206:
        return chr(char_code + 1332)  # full-width to half-width (row 3)
    if char_code < 1 or char_code > 3989:
        return ""  # not a hanzi, or a rare one outside GB2312 level 1

    idx = bisect(FIRST_NUMBERS, char_code) - 1
    if first_letter:  # initial letter only
        return FIRST_LETTERS[idx]
    # Clamp at 0 so a code below the row's first boundary resolves to the
    # row's first syllable rather than wrapping to its last one (index -1).
    inidx = max(bisect(PINYIN_NUMBERS[idx], char_code) - 1, 0)
    return PINYIN_LETTERS[idx][inidx]  # full syllable


def to_unicode(word):
    """Return ``word`` as a unicode string, decoding byte strings if needed.

    Unicode input is returned unchanged. Byte strings are decoded as UTF-8
    first and, if that fails, as GBK.

    Args:
        word: A unicode string or an encoded byte string.

    Returns:
        The decoded text, or an empty unicode string when the bytes are valid
        in neither UTF-8 nor GBK (best effort: undecodable input is dropped
        rather than raised).
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so this
    # does not depend on the Python-2-only ``unicode`` builtin.
    text_type = type(u"")
    if isinstance(word, text_type):
        return word
    for encoding in ("UTF-8", "GBK"):
        try:
            return text_type(word, encoding)
        except UnicodeDecodeError:
            continue
    return text_type()


def split_sentence(sentence):
    """Split mixed text into single wide characters and narrow-character runs.

    Yields ``(is_ascii, text)`` pairs. Characters with a code point above 255
    are yielded one at a time with ``is_ascii`` False. The run of characters
    with code point <= 255 collected before each of them, and after the last
    one, is yielded with ``is_ascii`` True, even when that run is empty.
    """
    pending = []
    for ch in sentence:
        if ord(ch) > 255:
            # Flush the narrow run gathered so far, then emit the wide char.
            yield True, "".join(pending)
            pending = []
            yield False, ch
        else:
            pending.append(ch)
    yield True, "".join(pending)


def words_to_pinyin(words, first_letter=False, seperator=""):
    """Convert mixed text to pinyin, leaving narrow-character runs untouched.

    Args:
        words: Text to convert; passed through ``to_unicode`` first.
        first_letter: When true, each wide character contributes only the
            initial letter of its pinyin.
        seperator: String placed between consecutive output pieces.

    Returns:
        The pieces joined with ``seperator``. Each wide character is replaced
        by whatever ``gb2312_pinyin`` returns for it, which may be "".
    """
    pieces = []
    for is_ascii, chunk in split_sentence(to_unicode(words)):
        if not chunk:
            continue  # skip the empty runs between adjacent wide characters
        if is_ascii:
            pieces.append(chunk)
        else:
            pieces.append(gb2312_pinyin(chunk, first_letter))
    return seperator.join(pieces)


def words_pinyin_for_sort(words, first_letter=False):
    """Build a "~"-separated pinyin string for use as a sort key.

    The pieces produced by ``words_to_pinyin`` are joined with "~". When the
    text starts with a character whose code point is above 255, the result is
    additionally prefixed with "~".
    """
    text = to_unicode(words)
    key = words_to_pinyin(text, first_letter, seperator="~")
    if text and ord(text[0]) > 255:
        return "~" + key
    return key


if __name__ == "__main__":
    # Small demo of the three public helpers. Each print is written as a
    # call with one parenthesised argument so it runs on Python 2 and 3.
    print(words_to_pinyin("好V5的中文", first_letter=True))
    print(words_to_pinyin("好V5的中文!", seperator=" "))
    print(words_pinyin_for_sort("好V5的中文!"))
 

你可能感兴趣的:(中文)