当前 pinyin4j 的最新版 2.5.1 在获取拼音首字母时无法正确处理多音字(网上找到的解决方案大多也是遇到多音字时只取第一个读音),于是扩展了它的部分源码,使其支持多音字的首字母获取。
以下表格为修改记录
修改时间 | 修改内容 |
---|---|
2019-05-28 | 发布 |
2020-04-23 | 修复部分情况下获取首字母时抛出异常的问题,增加了 py.length() > 0 判断 |
如下是重新定义的 **PinyinHelper.toHanYuPinyinString()** 方法,命名、使用方式与 pinyin4j 源码一致,使用时需注意导入的是本文定义的类,而不是 pinyin4j 自带的同名类
multi_pinyin.txt 是多音字库(pinyin4j 源码包里自带),可以自行修改文件名和存储路径,并扩展其中的多音字词条;该字库并不完整,比如“重启”需要添加“重启 (chong2,qi3)”才能正确识别
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import net.sourceforge.pinyin4j.multipinyin.Trie;
import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
public class PinyinHelper {
/**
 * Converts a string to Hanyu Pinyin, preferring the longest phrase found in
 * the pinyin trie so that a polyphonic character inside a known phrase gets
 * the reading recorded for that phrase.
 *
 * <p>Each UTF-16 code unit is looked up by its upper-case hexadecimal value.
 * When only a single character matched, the first reading of its record is
 * used; when a multi-character phrase matched, every reading of the phrase
 * record is emitted in order.
 *
 * @param str          text to convert; {@code null} or empty yields an empty string
 * @param outputFormat formatting options applied to every pinyin syllable
 * @param separate     text placed between converted units; {@code null} is
 *                     treated as an empty separator
 * @param retain       {@code true} to copy characters that have no pinyin into
 *                     the result, {@code false} to drop them
 * @return the converted text
 * @throws BadHanyuPinyinOutputFormatCombination if {@code outputFormat} combines
 *                     tone marks with the "v" or "u:" representation
 */
public static String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat, String separate, boolean retain) throws BadHanyuPinyinOutputFormatCombination {
    if (str == null || str.isEmpty()) {
        return "";
    }
    // StringBuilder.append(null) would write the literal text "null".
    final String separator = (separate == null) ? "" : separate;
    final ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();
    final Trie rootTrie = resource.getUnicodeToHanyuPinyinTable();
    final StringBuilder resultPinyinStrBuf = new StringBuilder();
    final char[] chars = str.toCharArray();

    for (int i = 0; i < chars.length; i++) {
        // Record of the longest phrase starting at i that has a pinyin entry.
        String result = null;
        char ch = chars[i];
        Trie currentTrie = rootTrie;
        // Index of the last character covered by the longest match.
        int success = i;
        // Index of the character currently being looked up.
        int current = i;
        do {
            String hexStr = Integer.toHexString(ch).toUpperCase();
            currentTrie = currentTrie.get(hexStr);
            if (currentTrie != null) {
                if (currentTrie.getPinyin() != null) {
                    result = currentTrie.getPinyin();
                    success = current;
                }
                // Descend so the next character can extend the phrase.
                currentTrie = currentTrie.getNextTire();
            }
            current++;
            if (current >= chars.length) {
                break;
            }
            ch = chars[current];
        } while (currentTrie != null);

        // A separator may precede this unit unless it is the very first character.
        final boolean separatorAllowedBefore = i != 0 && current != success;

        if (result == null) {
            // No pinyin for this character: copy it through or drop it.
            if (retain) {
                if (separatorAllowedBefore && needsSeparator(resultPinyinStrBuf, separator)) {
                    resultPinyinStrBuf.append(separator);
                }
                resultPinyinStrBuf.append(chars[i]);
            }
        } else {
            String[] pinyinStrArray = resource.parsePinyinString(result);
            if (pinyinStrArray != null) {
                for (int j = 0; j < pinyinStrArray.length; j++) {
                    if (separatorAllowedBefore && needsSeparator(resultPinyinStrBuf, separator)) {
                        resultPinyinStrBuf.append(separator);
                    }
                    resultPinyinStrBuf.append(PinyinFormatter.formatHanyuPinyin(pinyinStrArray[j], outputFormat));
                    // Trailing separator when more input follows, or when more
                    // syllables of a multi-character phrase follow.
                    if (current < chars.length || (j < pinyinStrArray.length - 1 && i != success)) {
                        resultPinyinStrBuf.append(separator);
                    }
                    // Single-character match: only the first reading is used.
                    if (i == success) {
                        break;
                    }
                }
            }
        }
        // Skip the characters consumed by a phrase match.
        i = success;
    }
    return resultPinyinStrBuf.toString();
}

/**
 * Tells whether a separator must be appended before the next unit, i.e. the
 * buffer is non-empty and does not already end with the separator. Works for
 * separators of any length.
 */
private static boolean needsSeparator(StringBuilder buffer, String separator) {
    final int bufferLength = buffer.length();
    final int separatorLength = separator.length();
    if (bufferLength == 0 || separatorLength == 0) {
        return false;
    }
    if (bufferLength < separatorLength) {
        return true;
    }
    // Searching from the last possible start can only match a trailing separator.
    return buffer.indexOf(separator, bufferLength - separatorLength) < 0;
}
/**
 * Applies a {@code HanyuPinyinOutputFormat} (tone style, ü style, letter case)
 * to a raw pinyin syllable such as {@code "lu:3"}.
 */
static class PinyinFormatter {
    /** Plain vowels that can carry a tone mark; {@code v} stands for ü. */
    private static final String UNMARKED_VOWELS = "aeiouv";
    /**
     * Marked forms, five per vowel in the order of {@link #UNMARKED_VOWELS}:
     * tones 1-4 followed by the unmarked neutral tone. Tone 3 uses the caron
     * (ǎ ě ǐ ǒ ǔ ǚ), not the similar-looking breve.
     */
    private static final String MARKED_VOWELS = "āáǎàaēéěèeīíǐìiōóǒòoūúǔùuǖǘǚǜü";

    /**
     * Formats one pinyin syllable.
     *
     * @param pinyinStr    raw syllable with a trailing tone number and "u:" for ü
     * @param outputFormat requested tone, ü and case representation
     * @return the formatted syllable
     * @throws BadHanyuPinyinOutputFormatCombination if tone marks are requested
     *         together with the "v" or "u:" representation of ü
     */
    static String formatHanyuPinyin(String pinyinStr, HanyuPinyinOutputFormat outputFormat)
            throws BadHanyuPinyinOutputFormatCombination {
        final HanyuPinyinToneType toneType = outputFormat.getToneType();
        final HanyuPinyinVCharType vCharType = outputFormat.getVCharType();

        if (HanyuPinyinToneType.WITH_TONE_MARK == toneType
                && (HanyuPinyinVCharType.WITH_V == vCharType
                        || HanyuPinyinVCharType.WITH_U_AND_COLON == vCharType)) {
            throw new BadHanyuPinyinOutputFormatCombination("tone marks cannot be added to v or u:");
        }

        if (HanyuPinyinToneType.WITHOUT_TONE == toneType) {
            pinyinStr = pinyinStr.replaceAll("[1-5]", "");
        } else if (HanyuPinyinToneType.WITH_TONE_MARK == toneType) {
            // The tone-mark converter expects ü written as a single letter "v".
            pinyinStr = pinyinStr.replace("u:", "v");
            pinyinStr = convertToneNumber2ToneMark(pinyinStr);
        }

        if (HanyuPinyinVCharType.WITH_V == vCharType) {
            pinyinStr = pinyinStr.replace("u:", "v");
        } else if (HanyuPinyinVCharType.WITH_U_UNICODE == vCharType) {
            pinyinStr = pinyinStr.replace("u:", "ü");
        }

        if (HanyuPinyinCaseType.UPPERCASE == outputFormat.getCaseType()) {
            // Locale.ROOT keeps the result independent of the default locale
            // (a Turkish locale would turn "i" into "İ").
            pinyinStr = pinyinStr.toUpperCase(java.util.Locale.ROOT);
        }
        return pinyinStr;
    }

    /**
     * Converts a trailing tone number into a tone mark on the proper vowel.
     *
     * <p>The vowel that takes the mark is chosen as follows:
     * <ol>
     *   <li>an "a" or an "e" if present (no syllable contains both);</li>
     *   <li>otherwise the "o" of "ou" if present;</li>
     *   <li>otherwise the last vowel of the syllable.</li>
     * </ol>
     *
     * @param pinyinStr the ASCII representation with a tone number, ü written as "v"
     * @return the lower-cased representation with a tone mark; the lower-cased
     *         input unchanged if it is not letters followed by an optional
     *         tone digit, or if it has a tone digit but no vowel
     */
    private static String convertToneNumber2ToneMark(final String pinyinStr) {
        final String lowerCasePinyinStr = pinyinStr.toLowerCase(java.util.Locale.ROOT);

        if (!lowerCasePinyinStr.matches("[a-z]*[1-5]?")) {
            // Bad format: leave as is.
            return lowerCasePinyinStr;
        }
        if (!lowerCasePinyinStr.matches("[a-z]*[1-5]")) {
            // No tone number: only replace v with ü.
            return lowerCasePinyinStr.replace("v", "ü");
        }

        final int toneDigitIndex = lowerCasePinyinStr.length() - 1;
        final int toneNumber = Character.getNumericValue(lowerCasePinyinStr.charAt(toneDigitIndex));

        int vowelIndex = lowerCasePinyinStr.indexOf('a');
        if (vowelIndex < 0) {
            vowelIndex = lowerCasePinyinStr.indexOf('e');
        }
        if (vowelIndex < 0) {
            vowelIndex = lowerCasePinyinStr.indexOf("ou");
        }
        if (vowelIndex < 0) {
            for (int i = toneDigitIndex; i >= 0; i--) {
                if (UNMARKED_VOWELS.indexOf(lowerCasePinyinStr.charAt(i)) >= 0) {
                    vowelIndex = i;
                    break;
                }
            }
        }
        if (vowelIndex < 0) {
            // No vowel found to carry the mark.
            return lowerCasePinyinStr;
        }

        final char unmarkedVowel = lowerCasePinyinStr.charAt(vowelIndex);
        final int markedIndex = UNMARKED_VOWELS.indexOf(unmarkedVowel) * 5 + (toneNumber - 1);
        // Rebuild the syllable without the tone digit, with any other v shown as ü.
        return lowerCasePinyinStr.substring(0, vowelIndex).replace("v", "ü")
                + MARKED_VOWELS.charAt(markedIndex)
                + lowerCasePinyinStr.substring(vowelIndex + 1, toneDigitIndex).replace("v", "ü");
    }
}
/**
 * Singleton that owns the trie mapping Unicode code units (as upper-case hex
 * strings) to raw pinyin records such as {@code "(zhong4,chong2)"}.
 */
static class ChineseToPinyinResource {
    /** Trie keyed by upper-case hex code units; filled by {@link #initializeResource()}. */
    private Trie unicodeToHanyuPinyinTable = null;

    /**
     * @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set.
     */
    private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) {
        this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable;
    }

    /**
     * @return Returns the unicodeToHanyuPinyinTable.
     */
    Trie getUnicodeToHanyuPinyinTable() {
        return unicodeToHanyuPinyinTable;
    }

    /**
     * Private constructor as part of the singleton pattern.
     */
    private ChineseToPinyinResource() {
        initializeResource();
    }

    /**
     * Creates the trie and loads the single-character table, the bundled
     * multi-pinyin phrase table and the optional extension table into it.
     * An I/O failure is reported on standard error and leaves the trie with
     * whatever was loaded before the failure.
     */
    private void initializeResource() {
        final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
        final String resourceMultiName = "/pinyindb/multi_pinyin.txt";
        try {
            setUnicodeToHanyuPinyinTable(new Trie());
            getUnicodeToHanyuPinyinTable().load(ResourceHelper.getResourceInputStream(resourceName));
            getUnicodeToHanyuPinyinTable().loadMultiPinyin(ResourceHelper.getResourceInputStream(resourceMultiName));
            getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();
        } catch (IOException ex) {
            // Also covers FileNotFoundException, which is an IOException.
            ex.printStackTrace();
        }
    }

    /** Returns the trie key of a character: its code unit as upper-case hex. */
    private static String toHexKey(char ch) {
        return Integer.toHexString(ch).toUpperCase();
    }

    /**
     * @param ch character to look up
     * @return the trie node stored for {@code ch}, as returned by the table's {@code get}
     */
    Trie getHanyuPinyinTrie(char ch) {
        return getUnicodeToHanyuPinyinTable().get(toHexKey(ch));
    }

    /**
     * Get the unformatted Hanyu Pinyin representations of the given Chinese
     * character in array format.
     *
     * @param ch given Chinese character in Unicode
     * @return The Hanyu Pinyin strings of the given Chinese character in array
     *         format; return null if there is no corresponding Pinyin string.
     */
    String[] getHanyuPinyinStringArray(char ch) {
        String pinyinRecord = getHanyuPinyinRecordFromChar(ch);
        return parsePinyinString(pinyinRecord);
    }

    /**
     * Splits a raw record into its readings: takes the text between the first
     * "(" and the last ")" and splits it on commas.
     *
     * @param pinyinRecord raw record, may be {@code null}
     * @return the readings; {@code null} if the record is {@code null} or has
     *         no closing bracket after the opening position
     */
    String[] parsePinyinString(String pinyinRecord) {
        if (null == pinyinRecord) {
            return null;
        }
        int contentStart = pinyinRecord.indexOf(Field.LEFT_BRACKET) + Field.LEFT_BRACKET.length();
        int contentEnd = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET);
        if (contentEnd < contentStart) {
            // Malformed record: substring would throw StringIndexOutOfBoundsException.
            return null;
        }
        return pinyinRecord.substring(contentStart, contentEnd).split(Field.COMMA);
    }

    /**
     * @param record given record string of Hanyu Pinyin
     * @return return true if record is not null and record is not "(none0)" and
     *         record is enclosed in brackets, else return false
     */
    private boolean isValidRecord(String record) {
        final String noneStr = "(none0)";
        return (null != record) && !record.equals(noneStr) && record.startsWith(Field.LEFT_BRACKET)
                && record.endsWith(Field.RIGHT_BRACKET);
    }

    /**
     * @param ch given Chinese character in Unicode
     * @return corresponding Hanyu Pinyin record stored in the trie; null if no
     *         valid record found
     */
    private String getHanyuPinyinRecordFromChar(char ch) {
        Trie trie = getUnicodeToHanyuPinyinTable().get(toHexKey(ch));
        String foundRecord = null;
        if (trie != null) {
            foundRecord = trie.getPinyin();
        }
        return isValidRecord(foundRecord) ? foundRecord : null;
    }

    /**
     * Singleton factory method.
     *
     * @return the one and only instance
     */
    static ChineseToPinyinResource getInstance() {
        return ChineseToPinyinResourceHolder.THE_INSTANCE;
    }

    /**
     * Singleton implementation helper.
     */
    private static class ChineseToPinyinResourceHolder {
        static final ChineseToPinyinResource THE_INSTANCE = new ChineseToPinyinResource();
    }

    /**
     * String constants used when parsing pinyin records.
     *
     * @author Li Min ([email protected])
     */
    static final class Field {
        static final String LEFT_BRACKET = "(";
        static final String RIGHT_BRACKET = ")";
        static final String COMMA = ",";

        private Field() {
        }
    }
}
/**
 * Opens classpath resources as buffered streams.
 */
static class ResourceHelper {
    /**
     * Opens a resource relative to this class's class loader.
     *
     * @param resourceName resource path passed to {@code Class.getResourceAsStream}
     * @return resource (mainly file in file system or file in compressed
     *         package) as BufferedInputStream
     * @throws FileNotFoundException if no resource with that name exists;
     *         without this check the missing resource would be wrapped silently
     *         and only fail later, on first read, with an unrelated message
     */
    static BufferedInputStream getResourceInputStream(String resourceName) throws FileNotFoundException {
        java.io.InputStream rawStream = ResourceHelper.class.getResourceAsStream(resourceName);
        if (rawStream == null) {
            throw new FileNotFoundException("Classpath resource not found: " + resourceName);
        }
        return new BufferedInputStream(rawStream);
    }
}
}
下面是使用方式(其中用到了 Google Guava 库的部分类):
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import java.util.List;
/**
 * Pinyin utility built on {@code PinyinHelper.toHanYuPinyinString}.
 */
public class PinyinUtil {
    /**
     * Separator requested from the converter and used to split its result
     * back into units (one pinyin syllable or one retained character each).
     */
    private static final String SEPARATE = "#";
    /** Output format: no tone information, ü written as "v". */
    private static final HanyuPinyinOutputFormat OUTPUT_FORMAT = new HanyuPinyinOutputFormat();

    static {
        OUTPUT_FORMAT.setVCharType(HanyuPinyinVCharType.WITH_V);
        OUTPUT_FORMAT.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    }

    /**
     * Converts text to pinyin, or to the initial letters of its pinyin.
     *
     * <p>The converter emits every retained non-Chinese character as its own
     * unit, so taking the first character of a unit leaves such characters
     * unchanged while reducing a pinyin syllable to its initial. Characters
     * equal to the internal separator "#" are lost when the result is split.
     *
     * @param str     text to convert; {@code null} or empty yields an empty string
     * @param retain  true: keep characters other than Chinese ones
     * @param initial true: output only the first letter of each syllable
     * @return the pinyin, or whatever was produced before a format error
     */
    public static String toPinYinString(String str, boolean retain, boolean initial) {
        StringBuilder sb = new StringBuilder();
        if (str == null || str.isEmpty()) {
            return sb.toString();
        }
        try {
            String pinyin = PinyinHelper.toHanYuPinyinString(str, OUTPUT_FORMAT, SEPARATE, retain);
            for (String py : Splitter.on(SEPARATE).split(pinyin)) {
                if (py.isEmpty()) {
                    // Adjacent or trailing separators produce empty units.
                    continue;
                }
                if (initial) {
                    sb.append(py.charAt(0));
                } else {
                    sb.append(py);
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Not expected with OUTPUT_FORMAT (no tone marks); reported as before.
            e.printStackTrace();
        }
        return sb.toString();
    }
}
下面是临时测试结果:
// Sample input: polyphonic words, punctuation, and digits mixed with Chinese characters.
String str = "成长,重启,重量,长大了,角色,角落,呼啦啦,1我2,3爱4,5你6";
// retain = true, initial = true: keep non-Chinese characters, output initials only.
System.out.println(PinyinUtil.toPinYinString(str, true, true));
// cz,cq,zl,zdl,js,jl,hll,1w2,3a4,5n6
// retain = false, initial = true: drop non-Chinese characters, output initials only.
System.out.println(PinyinUtil.toPinYinString(str, false, true));
// czcqzlzdljsjlhllwan
// retain = true, initial = false: keep non-Chinese characters, output full pinyin.
System.out.println(PinyinUtil.toPinYinString(str, true, false));
// chengzhang,chongqi,zhongliang,zhangdale,juese,jiaoluo,hulala,1wo2,3ai4,5ni6
// retain = false, initial = false: drop non-Chinese characters, output full pinyin.
System.out.println(PinyinUtil.toPinYinString(str, false, false));
// chengzhangchongqizhongliangzhangdalejuesejiaoluohulalawoaini