pinyin4j获取多音字首字母同时保留非中文字符

pinyin4j获取多音字首字母同时保留非中文字符

  • 前情:获取中文的首字母,要求正确识别多音字(例:重庆,重启,重量,成长等),同时需要保留非中文字符
    • 要求项目中导入com.belerweb.pinyin4j.2.5.1包,然后将下面的类放入项目中即可使用
      • ==以下内容暂时还未经过大量数据测试,后续若发现问题会及时修改==

前情:获取中文的首字母,要求正确识别多音字(例:重庆,重启,重量,成长等),同时需要保留非中文字符

当前 pinyin4j 的最新版 2.5.1 无法正确获取多音字的首字母(网上找到的解决方案大多也只是在遇到多音字时取第一个拼音),于是扩展了它的部分源码,使其能够正确获取多音字的首字母。

要求项目中导入com.belerweb.pinyin4j.2.5.1包,然后将下面的类放入项目中即可使用

以下内容暂时还未经过大量数据测试,后续若发现问题会及时修改

以下表格为修改记录

| 修改时间 | 修改内容 |
| --- | --- |
| 2019-05-28 | 发布 |
| 2020-04-23 | 修复部分情况下获取首字母时出现异常的问题,增加了 py.length() > 0 判断 |

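如下是重新定义的 **PinyinHelper.toHanYuPinyinString()** 方法,命名和使用方式与 pinyin4j 源码一致。由于类名与 pinyin4j 自带的 net.sourceforge.pinyin4j.PinyinHelper 相同,使用时需注意导入的是下面这个自定义的 PinyinHelper,而不是 pinyin4j 自带的同名类。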

multi_pinyin.txt 是多音字词库(pinyin4j 源码包里自带),下面的代码从类路径 /pinyindb/multi_pinyin.txt 加载它。可以把它改名或换个存储路径(并相应修改代码中的资源路径)来扩展其中的多音字词条。自带的词库并不完整,比如“重启”需要添加一行“重启 (chong2,qi3)”才能正确识别。

import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import net.sourceforge.pinyin4j.multipinyin.Trie;

import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Locale;

public class PinyinHelper {

    public static String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat, String separate, boolean retain) throws BadHanyuPinyinOutputFormatCombination {
        ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();
        StringBuilder resultPinyinStrBuf = new StringBuilder();
        char[] chars = str.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            // 匹配到的最长的结果
            String result = null;
            char ch = chars[i];
            Trie currentTrie = resource.getUnicodeToHanyuPinyinTable();
            int success = i;
            int current = i;
            do {
                String hexStr = Integer.toHexString((int) ch).toUpperCase();
                currentTrie = currentTrie.get(hexStr);
                if (currentTrie != null) {
                    if (currentTrie.getPinyin() != null) {
                        result = currentTrie.getPinyin();
                        success = current;
                    }
                    currentTrie = currentTrie.getNextTire();
                } else {

                }
                current++;
                if (current < chars.length) {
                    ch = chars[current];
                } else {
                    break;
                }
            } while (currentTrie != null);

            // 如果在前缀树中没有匹配到,那么它就不能转换为拼音,直接输出或者去掉
            if (result == null) {
                if (retain) {
                    if (i != 0 && current != success && resultPinyinStrBuf.lastIndexOf(separate) != resultPinyinStrBuf.length() - 1) {
                        resultPinyinStrBuf.append(separate);
                    }
                    resultPinyinStrBuf.append(chars[i]);
                }
            } else {
                String[] pinyinStrArray = resource.parsePinyinString(result);
                if (pinyinStrArray != null) {
                    for (int j = 0; j < pinyinStrArray.length; j++) {
                        if (i != 0 && current != success && resultPinyinStrBuf.lastIndexOf(separate) != resultPinyinStrBuf.length() - 1) {
                            resultPinyinStrBuf.append(separate);
                        }
                        resultPinyinStrBuf.append(PinyinFormatter.formatHanyuPinyin(pinyinStrArray[j], outputFormat));
                        // 不是最后一个,(也不是拼音的最后一个,并且不是最后匹配成功的)
                        if (current < chars.length || (j < pinyinStrArray.length - 1 && i != success)) {
                            resultPinyinStrBuf.append(separate);
                        }
                        if (i == success) {
                            break;
                        }
                    }
                }
            }
            i = success;
        }
        return resultPinyinStrBuf.toString();
    }

    static class PinyinFormatter {

        static String formatHanyuPinyin(String pinyinStr, HanyuPinyinOutputFormat outputFormat)
                throws BadHanyuPinyinOutputFormatCombination {
            if ((HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType())
                    && ((HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) || (HanyuPinyinVCharType.WITH_U_AND_COLON == outputFormat
                    .getVCharType()))) {
                throw new BadHanyuPinyinOutputFormatCombination("tone marks cannot be added to v or u:");
            }

            if (HanyuPinyinToneType.WITHOUT_TONE == outputFormat.getToneType()) {
                pinyinStr = pinyinStr.replaceAll("[1-5]", "");
            } else if (HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType()) {
                pinyinStr = pinyinStr.replaceAll("u:", "v");
                pinyinStr = convertToneNumber2ToneMark(pinyinStr);
            }

            if (HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) {
                pinyinStr = pinyinStr.replaceAll("u:", "v");
            } else if (HanyuPinyinVCharType.WITH_U_UNICODE == outputFormat.getVCharType()) {
                pinyinStr = pinyinStr.replaceAll("u:", "ü");
            }

            if (HanyuPinyinCaseType.UPPERCASE == outputFormat.getCaseType()) {
                pinyinStr = pinyinStr.toUpperCase();
            }
            return pinyinStr;
        }

        /**
         * Convert tone numbers to tone marks using Unicode 

* * Algorithm for determining location of tone mark
*

* A simple algorithm for determining the vowel on which the tone mark * appears is as follows:
* *

    *
  1. First, look for an "a" or an "e". If either vowel appears, it takes * the tone mark. There are no possible pinyin syllables that contain both * an "a" and an "e". * *
  2. If there is no "a" or "e", look for an "ou". If "ou" appears, then * the "o" takes the tone mark. * *
  3. If none of the above cases hold, then the last vowel in the syllable * takes the tone mark. * *
* * @param pinyinStr the ascii represention with tone numbers * @return the unicode represention with tone marks */
private static String convertToneNumber2ToneMark(final String pinyinStr) { String lowerCasePinyinStr = pinyinStr.toLowerCase(); if (lowerCasePinyinStr.matches("[a-z]*[1-5]?")) { final char defautlCharValue = '$'; final int defautlIndexValue = -1; char unmarkedVowel = defautlCharValue; int indexOfUnmarkedVowel = defautlIndexValue; final char charA = 'a'; final char charE = 'e'; final String ouStr = "ou"; final String allUnmarkedVowelStr = "aeiouv"; final String allMarkedVowelStr = "āáăàaēéĕèeīíĭìiōóŏòoūúŭùuǖǘǚǜü"; if (lowerCasePinyinStr.matches("[a-z]*[1-5]")) { int tuneNumber = Character.getNumericValue(lowerCasePinyinStr.charAt(lowerCasePinyinStr.length() - 1)); int indexOfA = lowerCasePinyinStr.indexOf(charA); int indexOfE = lowerCasePinyinStr.indexOf(charE); int ouIndex = lowerCasePinyinStr.indexOf(ouStr); if (-1 != indexOfA) { indexOfUnmarkedVowel = indexOfA; unmarkedVowel = charA; } else if (-1 != indexOfE) { indexOfUnmarkedVowel = indexOfE; unmarkedVowel = charE; } else if (-1 != ouIndex) { indexOfUnmarkedVowel = ouIndex; unmarkedVowel = ouStr.charAt(0); } else { for (int i = lowerCasePinyinStr.length() - 1; i >= 0; i--) { if (String.valueOf(lowerCasePinyinStr.charAt(i)).matches( "[" + allUnmarkedVowelStr + "]")) { indexOfUnmarkedVowel = i; unmarkedVowel = lowerCasePinyinStr.charAt(i); break; } } } if ((defautlCharValue != unmarkedVowel) && (defautlIndexValue != indexOfUnmarkedVowel)) { int rowIndex = allUnmarkedVowelStr.indexOf(unmarkedVowel); int columnIndex = tuneNumber - 1; int vowelLocation = rowIndex * 5 + columnIndex; char markedVowel = allMarkedVowelStr.charAt(vowelLocation); return lowerCasePinyinStr.substring(0, indexOfUnmarkedVowel).replaceAll("v", "ü") + markedVowel + lowerCasePinyinStr.substring(indexOfUnmarkedVowel + 1, lowerCasePinyinStr.length() - 1).replaceAll("v", "ü"); } else // error happens in the procedure of locating vowel { return lowerCasePinyinStr; } } else // input string has no any tune number { // only replace v with ü (umlat) 
character return lowerCasePinyinStr.replaceAll("v", "ü"); } } else // bad format { return lowerCasePinyinStr; } } } static class ChineseToPinyinResource { /** * A hash table contains pairs */ private Trie unicodeToHanyuPinyinTable = null; /** * @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set. */ private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) { this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable; } /** * @return Returns the unicodeToHanyuPinyinTable. */ Trie getUnicodeToHanyuPinyinTable() { return unicodeToHanyuPinyinTable; } /** * Private constructor as part of the singleton pattern. */ private ChineseToPinyinResource() { initializeResource(); } /** * Initialize a hash-table contains pairs */ private void initializeResource() { try { final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt"; final String resourceMultiName = "/pinyindb/multi_pinyin.txt"; setUnicodeToHanyuPinyinTable(new Trie()); getUnicodeToHanyuPinyinTable().load(ResourceHelper.getResourceInputStream(resourceName)); getUnicodeToHanyuPinyinTable().loadMultiPinyin(ResourceHelper.getResourceInputStream(resourceMultiName)); getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend(); } catch (FileNotFoundException ex) { ex.printStackTrace(); } catch (IOException ex) { ex.printStackTrace(); } } Trie getHanyuPinyinTrie(char ch) { String codepointHexStr = Integer.toHexString((int) ch).toUpperCase(); // fetch from hashtable return getUnicodeToHanyuPinyinTable().get(codepointHexStr); } /** * Get the unformatted Hanyu Pinyin representations of the given Chinese * character in array format. * * @param ch given Chinese character in Unicode * @return The Hanyu Pinyin strings of the given Chinese character in array * format; return null if there is no corresponding Pinyin string. 
*/ String[] getHanyuPinyinStringArray(char ch) { String pinyinRecord = getHanyuPinyinRecordFromChar(ch); return parsePinyinString(pinyinRecord); } String[] parsePinyinString(String pinyinRecord) { if (null != pinyinRecord) { int indexOfLeftBracket = pinyinRecord.indexOf(Field.LEFT_BRACKET); int indexOfRightBracket = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET); String stripedString = pinyinRecord.substring(indexOfLeftBracket + Field.LEFT_BRACKET.length(), indexOfRightBracket); return stripedString.split(Field.COMMA); } else { // no record found or mal-formatted record return null; } } /** * @param record given record string of Hanyu Pinyin * @return return true if record is not null and record is not "none0" and * record is not mal-formatted, else return false */ private boolean isValidRecord(String record) { final String noneStr = "(none0)"; return (null != record) && !record.equals(noneStr) && record.startsWith(Field.LEFT_BRACKET) && record.endsWith(Field.RIGHT_BRACKET); } /** * @param ch given Chinese character in Unicode * @return corresponding Hanyu Pinyin Record in Properties file; null if no * record found */ private String getHanyuPinyinRecordFromChar(char ch) { // convert Chinese character to code point (integer) // please refer to http://www.unicode.org/glossary/#code_point // Another reference: http://en.wikipedia.org/wiki/Unicode int codePointOfChar = ch; String codepointHexStr = Integer.toHexString(codePointOfChar).toUpperCase(); // fetch from hashtable Trie trie = getUnicodeToHanyuPinyinTable().get(codepointHexStr); String foundRecord = null; if (trie != null) { foundRecord = trie.getPinyin(); } return isValidRecord(foundRecord) ? foundRecord : null; } /** * Singleton factory method. * * @return the one and only MySingleton. */ static ChineseToPinyinResource getInstance() { return ChineseToPinyinResourceHolder.THE_INSTANCE; } /** * Singleton implementation helper. 
*/ private static class ChineseToPinyinResourceHolder { static final ChineseToPinyinResource THE_INSTANCE = new ChineseToPinyinResource(); } /** * A class encloses common string constants used in Properties files * * @author Li Min ([email protected]) */ class Field { static final String LEFT_BRACKET = "("; static final String RIGHT_BRACKET = ")"; static final String COMMA = ","; } } static class ResourceHelper { /** * @param resourceName * @return resource (mainly file in file system or file in compressed * package) as BufferedInputStream */ static BufferedInputStream getResourceInputStream(String resourceName) { return new BufferedInputStream(ResourceHelper.class.getResourceAsStream(resourceName)); } } }

下面是使用方式: 里面用到了google的guava包的部分内容

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import java.util.List;

/**
 * 拼音工具类
 */
public class PinyinUtil {

    /** Separator handed to PinyinHelper and split on afterwards; never part of the result. */
    private static final String SEPARATE = "#";

    /** Shared output format: no tone information, "v" for the u: sound. */
    private static final HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();

    static {
        outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
        outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    }

    /**
     * Converts text to pinyin, or to the first letter of each pinyin syllable.
     *
     * <p>The text is converted by {@code PinyinHelper.toHanYuPinyinString} with {@code "#"} as
     * separator and the result is split on that separator. With {@code initial} set, only the
     * first character of every piece is kept; retained non-Chinese characters arrive as
     * one-character pieces and therefore pass through unchanged. Because the separator is removed
     * by the split, a literal {@code '#'} in the input never appears in the result.
     *
     * @param str     text to convert; null yields an empty string
     * @param retain  true: keep characters that cannot be converted to pinyin
     * @param initial true: return only the first letter of each syllable
     * @return the pinyin text, or an empty string if the output format is rejected
     */
    public static String toPinYinString(String str, boolean retain, boolean initial) {
        StringBuilder sb = new StringBuilder();
        if (str == null || str.isEmpty()) {
            return sb.toString();
        }
        try {
            String pinyin = PinyinHelper.toHanYuPinyinString(str, outputFormat, SEPARATE, retain);
            for (String py : Splitter.on(SEPARATE).split(pinyin)) {
                if (py.isEmpty()) {
                    // Adjacent or trailing separators produce empty pieces.
                    continue;
                }
                if (initial) {
                    sb.append(py.charAt(0));
                } else {
                    sb.append(py);
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

}

下面是临时测试结果:

		// Sample input mixing polyphonic words, punctuation and digits.
		String str = "成长,重启,重量,长大了,角色,角落,呼啦啦,1我2,3爱4,5你6";
        // retain = true, initial = true: initials only, non-Chinese characters kept.
        System.out.println(PinyinUtil.toPinYinString(str, true, true));
        // cz,cq,zl,zdl,js,jl,hll,1w2,3a4,5n6
        // retain = false, initial = true: initials only, non-Chinese characters dropped.
        System.out.println(PinyinUtil.toPinYinString(str, false, true));
        // czcqzlzdljsjlhllwan
        // retain = true, initial = false: full pinyin, non-Chinese characters kept.
        System.out.println(PinyinUtil.toPinYinString(str, true, false));
        // chengzhang,chongqi,zhongliang,zhangdale,juese,jiaoluo,hulala,1wo2,3ai4,5ni6
        // retain = false, initial = false: full pinyin, non-Chinese characters dropped.
        System.out.println(PinyinUtil.toPinYinString(str, false, false));
        // chengzhangchongqizhongliangzhangdalejuesejiaoluohulalawoaini

你可能感兴趣的:(Tools,代码优化)