目标:
pinyin4j中收录了很多的词,但是也有一些词语是未被收录的,目前想做到的效果是
能将未收录的词语收录进去,而且还不需要重新启动服务
===================================================
源码观看
// Entry point used when testing: look up the pinyin readings of a single character.
String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);

/*
 * The fragments below are excerpts from several pinyin4j classes, shown in call order
 * (lookup entry point -> resource singleton -> Trie dictionary).
 * NOTE(review): the enclosing class headers are not part of this excerpt.
 */

/**
 * Public lookup entry point; delegates directly to
 * getFormattedHanyuPinyinStringArray with the same arguments.
 */
static public String[] toHanyuPinyinStringArray(char ch, HanyuPinyinOutputFormat outputFormat) throws BadHanyuPinyinOutputFormatCombination {
    return getFormattedHanyuPinyinStringArray(ch, outputFormat);
}

/**
 * Fetches the raw readings for ch and formats each one in place with
 * PinyinFormatter.formatHanyuPinyin. Returns ARR_EMPTY when the raw lookup yields null.
 */
static private String[] getFormattedHanyuPinyinStringArray(char ch, HanyuPinyinOutputFormat outputFormat) throws BadHanyuPinyinOutputFormatCombination {
    String[] pinyinStrArray = getUnformattedHanyuPinyinStringArray(ch);
    if (null != pinyinStrArray) {
        for (int i = 0; i < pinyinStrArray.length; i++) {
            pinyinStrArray[i] = PinyinFormatter.formatHanyuPinyin(pinyinStrArray[i], outputFormat);
        }
        return pinyinStrArray;
    } else
        return ARR_EMPTY;
}

// The initialization triggered here is an important step: the first call to
// getInstance() is what constructs the resource and loads the dictionaries.
private static String[] getUnformattedHanyuPinyinStringArray(char ch) {
    return ChineseToPinyinResource.getInstance().getHanyuPinyinStringArray(ch);
}

// Singleton accessor: returns the instance held by ChineseToPinyinResourceHolder.
static ChineseToPinyinResource getInstance() {
    return ChineseToPinyinResourceHolder.theInstance;
}

/**
 * Singleton implementation helper.
 */
private static class ChineseToPinyinResourceHolder {
    static final ChineseToPinyinResource theInstance = new ChineseToPinyinResource();
}

// Initialization: the private constructor loads every dictionary.
private ChineseToPinyinResource() {
    initializeResource();
}

/**
 * Creates a fresh Trie and fills it from the bundled dictionaries, in order:
 * the single-character table, the multi-character table, then the user extension file.
 * FileNotFoundException and IOException are printed and otherwise ignored.
 */
private void initializeResource() {
    try {
        final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
        final String resourceMultiName = "/pinyindb/multi_pinyin.txt";
        setUnicodeToHanyuPinyinTable(new Trie());
        // Load the single-character entries. The customization described in this article
        // simply calls load(...) once more with the custom file.
        getUnicodeToHanyuPinyinTable().load(ResourceHelper.getResourceInputStream(resourceName));
        // Load the multi-character (phrase) entries.
        getUnicodeToHanyuPinyinTable().loadMultiPinyin( ResourceHelper.getResourceInputStream(resourceMultiName));
        // Load the user-supplied multi-pinyin extension entries.
        getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();
    } catch (FileNotFoundException ex) {
        ex.printStackTrace();
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}

/**
 * The dictionary itself: most of the interesting logic lives in this class.
 * Each node maps string keys to child nodes, may carry a pinyin value, and may
 * link to a follow-up node used to match the next character of a phrase.
 */
public class Trie {

    // Entries held by this node, keyed by string.
    // NOTE(review): keys are presumably upper-case hex code points - confirm against the data files.
    private Hashtable<String, Trie> values = new Hashtable<String, Trie>();

    // Pinyin stored at this node.
    private String pinyin;

    // Next node, i.e. the table used to match the following character.
    private Trie nextTire;

    public String getPinyin() {
        return pinyin;
    }

    public void setPinyin(String pinyin) {
        this.pinyin = pinyin;
    }

    public Trie getNextTire() {
        return nextTire;
    }

    public void setNextTire(Trie nextTire) {
        this.nextTire = nextTire;
    }

    /**
     * Loads single-key pinyin entries from a stream.
     * Each line is split on a single space; lines that do not produce exactly two
     * fields are skipped. Field 0 becomes the key and field 1 the pinyin of a new node,
     * which replaces any node already stored under that key.
     *
     * @param inStream input stream of the pinyin file; its readers are closed before returning
     * @throws IOException if reading or closing fails
     */
    public synchronized void load(InputStream inStream) throws IOException {
        BufferedReader bufferedReader = null;
        InputStreamReader inputStreamReader = null;
        try {
            // No charset is given, so the platform default is used.
            inputStreamReader = new InputStreamReader(inStream);
            bufferedReader = new BufferedReader(inputStreamReader);
            String s;
            while ((s = bufferedReader.readLine()) != null) {
                String[] keyAndValue = s.split(" ");
                if (keyAndValue.length != 2)
                    continue;
                Trie trie = new Trie();
                trie.pinyin = keyAndValue[1];
                put(keyAndValue[0], trie);
            }
        } finally {
            if (inputStreamReader != null)
                inputStreamReader.close();
            if (bufferedReader != null)
                bufferedReader.close();
        }
    }

    /**
     * Loads the multi-character (polyphone phrase) dictionary.
     * Each line is split on a single space into a phrase and its pinyin; lines that do
     * not produce exactly two fields are skipped. The phrase is walked character by
     * character, creating nodes as needed, and the pinyin is stored on the node of the
     * last character.
     *
     * @param inStream input stream of the pinyin file; its readers are closed before returning
     * @throws IOException if reading or closing fails
     */
    public synchronized void loadMultiPinyin(InputStream inStream) throws IOException {
        BufferedReader bufferedReader = null;
        InputStreamReader inputStreamReader = null;
        try {
            inputStreamReader = new InputStreamReader(inStream);
            bufferedReader = new BufferedReader(inputStreamReader);
            String s;
            while ((s = bufferedReader.readLine()) != null) {
                String[] keyAndValue = s.split(" ");
                if (keyAndValue.length != 2)
                    continue;
                String key = keyAndValue[0]; // the phrase (a string of more than one character)
                String value = keyAndValue[1]; // the pinyin of the phrase
                char[] keys = key.toCharArray();
                Trie currentTrie = this;
                for (int i = 0; i < keys.length; i++) {
                    String hexString = Integer.toHexString(keys[i]).toUpperCase();
                    Trie trieParent = currentTrie.get(hexString);
                    if (trieParent == null) { // no entry for this character yet: insert an empty node
                        currentTrie.put(hexString, new Trie());
                        trieParent = currentTrie.get(hexString);
                    }
                    Trie trie = trieParent.getNextTire(); // table for the character after this one
                    if (keys.length - 1 == i) { // last character: store the phrase pinyin here
                        trieParent.pinyin = value;
                        break; // original author's note: this break is effectively redundant
                    }
                    if (trie == null) {
                        // Always true at this point, because the last-character case broke out above.
                        if (keys.length - 1 != i) {
                            // Not the last character: create this character's next-table,
                            // pre-register the following character in it, and descend.
                            Trie subTrie = new Trie();
                            trieParent.setNextTire(subTrie);
                            subTrie.put(Integer.toHexString(keys[i + 1]).toUpperCase(), new Trie());
                            currentTrie = subTrie;
                        }
                    } else {
                        currentTrie = trie;
                    }
                }
            }
        } finally {
            if (inputStreamReader != null)
                inputStreamReader.close();
            if (bufferedReader != null)
                bufferedReader.close();
        }
    }

    /**
     * Loads the user-defined extension dictionary.
     * Does nothing when MultiPinyinConfig.multiPinyinPath is null or the file does not
     * exist; otherwise the file is passed to loadMultiPinyin.
     */
    public void loadMultiPinyinExtend() throws IOException {
        String path = MultiPinyinConfig.multiPinyinPath;
        if (path != null) {
            File userMultiPinyinFile = new File(path);
            if (userMultiPinyinFile.exists()) {
                loadMultiPinyin(new FileInputStream(userMultiPinyinFile));
            }
        }
    }

    /** Returns the child node stored under hexString, or null when there is none. */
    public Trie get(String hexString) {
        return values.get(hexString);
    }

    /** Stores trie under key s, replacing any previous node for that key. */
    public void put(String s, Trie trie) {
        values.put(s, trie);
    }
}
准备工作--测试文件的读写方式--加载的
package com.git.pinyin;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;

/**
 * Scratch class for trying out how the custom dictionary file is read and written.
 * The file holds one entry per line in the form {@code HEX (pinyin1,pinyin2)}, where
 * HEX is the upper-case hexadecimal UTF-16 value of the character.
 *
 * @author songqinghu
 */
public class ReadAndWriteTest {

    /** Classpath location of the custom dictionary file. */
    private static final String EXT_DICT_RESOURCE = "/pinyindb/gome_hanyu_pinyin_ext.txt";

    public static void main(String[] args) throws IOException {
        // Sample usage, kept for reference:
        //   readText("/pinyindb/gome_hanyu_pinyin_ext.txt");
        //   Map<String, String> map = new HashMap<String, String>();
        //   map.put("我", "wo");
        //   writeText(map);
        unicodeTohanzi("3007");
        // Author's note: 龦 (9FA6, "cháng") is not in the dictionary and is used for testing later.
    }

    /**
     * Prints the character whose code point is given in hexadecimal.
     *
     * @param unicode hexadecimal code point, e.g. {@code "3007"}
     * @throws NumberFormatException if {@code unicode} is not valid hexadecimal
     * @throws IllegalArgumentException if the value is not a valid Unicode code point
     */
    private static void unicodeTohanzi(String unicode){
        int code = Integer.parseInt(unicode, 16);
        // Character.toChars also handles code points above U+FFFF, which a (char) cast would truncate.
        System.out.println(new String(Character.toChars(code)));
    }

    /**
     * Appends entries to the custom dictionary file.
     * Only the first character of each key is used. It is written as upper-case hex so
     * that it matches the upper-case keys the dictionary is looked up with. Entries
     * whose key is null or empty are skipped.
     *
     * @param content map from a Chinese character to its comma-separated pinyin list
     * @throws FileNotFoundException if the dictionary resource is not on the classpath
     * @throws IOException if writing fails
     */
    public static void writeText(Map<String,String> content) throws IOException{
        URL resource = ReadAndWriteTest.class.getResource(EXT_DICT_RESOURCE);
        if (resource == null) {
            throw new FileNotFoundException("Resource not found on classpath: " + EXT_DICT_RESOURCE);
        }
        String path = resource.getPath();
        System.out.println(path);

        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path, true)));
        try {
            for (Entry<String, String> entry : content.entrySet()) {
                String key = entry.getKey();
                if (key == null || key.length() == 0) {
                    continue; // nothing to encode; writing it would produce an unreadable line
                }
                String unicode = Integer.toHexString(key.charAt(0)).toUpperCase();
                writer.write(unicode + " (" + entry.getValue() + ")");
                writer.newLine();
            }
            writer.flush();
        } finally {
            // Close even when a write fails so the file handle is not leaked.
            writer.close();
        }
        System.out.println("====");
    }

    /**
     * Reads the dictionary resource and prints every well-formed entry.
     * Lines that do not consist of exactly two space-separated fields, or whose first
     * field is not a valid hexadecimal code point, are skipped.
     *
     * @param path classpath location, e.g. {@code "/pinyindb/gome_hanyu_pinyin_ext.txt"}
     */
    private static synchronized void readText(String path){
        if (StringUtils.isBlank(path)) {
            return;
        }
        InputStream in = ReadAndWriteTest.class.getResourceAsStream(path);
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; avoid the NPE in the reader.
            System.err.println("Resource not found on classpath: " + path);
            return;
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] values = line.split(" ");
                if (values.length != 2) {
                    continue;
                }
                String unicode = values[0];
                int code;
                try {
                    code = Integer.parseInt(unicode, 16);
                } catch (NumberFormatException e) {
                    continue; // malformed key: skip the line instead of aborting the whole read
                }
                if (!Character.isValidCodePoint(code)) {
                    continue;
                }
                String pinyin = values[1];
                System.out.println("编码后的字符: " + unicode + " 对应的拼音:"+ pinyin);
                System.out.println(new String(Character.toChars(code)));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
从git上下载后,改写的源码地址
链接:http://pan.baidu.com/s/1i5zvYfz 密码:zodu
package com.gome.mx.plus.pinyin.ext;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Utility for converting Chinese text to its full pinyin spelling(s).
 *
 * @author songqinghu
 */
public class PYReadUtils {

    /**
     * Returns every full-pinyin spelling of {@code words}.
     * Each character above code 128 is looked up with lower-case, tone-less output.
     * A character with several readings multiplies the number of results, so the
     * result is the de-duplicated cross product of the per-character readings, in
     * dictionary order. Characters at or below code 128, and characters with no
     * reading, are ignored.
     *
     * <p>The result grows multiplicatively with the number of polyphonic characters,
     * so this is intended for short inputs.
     *
     * @param words text to convert; may be null
     * @return all distinct concatenated spellings, or {@code null} when no character
     *         of {@code words} has a reading (including null or empty input)
     * @throws BadHanyuPinyinOutputFormatCombination propagated from pinyin4j
     */
    public static String[] getFullPY(String words) throws BadHanyuPinyinOutputFormatCombination{
        if (words == null || words.length() == 0) {
            return null;
        }

        HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
        defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        // One entry per character that has at least one reading, in text order.
        List<Set<String>> readingsPerChar = new ArrayList<Set<String>>();
        for (char c : words.toCharArray()) {
            if (c <= 128) {
                continue; // not a candidate for lookup: filtered out
            }
            Set<String> readings = distinctReadings(PinyinHelper.toHanyuPinyinStringArray(c, defaultFormat));
            if (!readings.isEmpty()) {
                readingsPerChar.add(readings);
            }
        }
        return combine(readingsPerChar);
    }

    /** Collects the non-empty readings of one character, dropping duplicates but keeping order. */
    private static Set<String> distinctReadings(String[] results){
        Set<String> readings = new LinkedHashSet<String>();
        if (results == null) {
            return readings; // lookup found nothing
        }
        for (String reading : results) {
            if (reading != null && reading.length() > 0) {
                readings.add(reading);
            }
        }
        return readings;
    }

    /**
     * Builds the cross product of the per-character readings by concatenation.
     * Returns null when there is nothing to combine.
     */
    private static String[] combine(List<Set<String>> readingsPerChar){
        if (readingsPerChar.isEmpty()) {
            return null;
        }
        Set<String> spellings = new LinkedHashSet<String>();
        spellings.add(""); // neutral prefix so the first character needs no special case
        for (Set<String> readings : readingsPerChar) {
            Set<String> extended = new LinkedHashSet<String>();
            for (String prefix : spellings) {
                for (String reading : readings) {
                    extended.add(prefix + reading);
                }
            }
            spellings = extended;
        }
        return spellings.toArray(new String[spellings.size()]);
    }
}
package com.gome.mx.plus.pinyin.ext;

import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;

import net.sourceforge.pinyin4j.ChineseToPinyinResource;
import net.sourceforge.pinyin4j.multipinyin.Trie;

/**
 * Writes character-to-pinyin entries into a custom dictionary file and reloads that
 * file into the running pinyin4j dictionary, so no restart is needed.
 * The file location is configurable, writes can append or overwrite, and readings
 * already known for a character can be merged into the new entry.
 *
 * @author songqinghu
 */
public class PYWriterUtils {

    /** Classpath location of the custom dictionary file. */
    private static String path = "/pinyindb/gome_hanyu_pinyin_ext.txt";

    /**
     * Sets the classpath location of the custom dictionary file.
     *
     * @param path classpath location, e.g. {@code "/pinyindb/gome_hanyu_pinyin_ext.txt"}
     */
    public static void setPath(String path){
        PYWriterUtils.path = path;
    }

    /**
     * Writes one entry with the default options: append to the file and merge with
     * the readings already known for the character.
     *
     * @param word   the Chinese character (only the first character is used)
     * @param pinyin the pinyin without tone digit
     * @param voice  the tone digit appended to {@code pinyin}; may be null
     * @return true if an entry was written
     * @throws Exception if the file cannot be located or written
     */
    public static boolean dufaultWriter(String word,String pinyin,Integer voice) throws Exception{
        return writerControler(word, pinyin, voice, true, true);
    }

    /**
     * Writes one entry of the form {@code HEX (pinyin...)} for the first character of
     * {@code word}. All input is validated before the file is opened, so an invalid
     * call never truncates the file.
     *
     * @param word       the Chinese character (only the first character is used)
     * @param pinyin     the pinyin without tone digit
     * @param voice      the tone digit appended to {@code pinyin}; when null nothing is appended
     * @param additional true to append to the file, false to overwrite it
     * @param merge      true to prepend the readings already loaded for the character
     * @return true if an entry was written; false if {@code word} or {@code pinyin} is
     *         null or empty, or the first character is not above code 128
     * @throws Exception if the file cannot be located or written
     */
    public static boolean writerControler(String word,String pinyin,Integer voice, boolean additional ,boolean merge) throws Exception{
        if (word == null || word.length() == 0 || pinyin == null || pinyin.length() == 0) {
            return false;
        }
        char c = word.charAt(0);
        if (c <= 128) {
            return false; // same filter as before: only characters above 128 are accepted
        }

        // Append the tone digit; a null tone must not end up as the text "null".
        String readings = (voice == null) ? pinyin : pinyin + voice;
        String unicode = Integer.toHexString(c).toUpperCase();
        if (merge) {
            // A character that is not in the dictionary yet has nothing to merge.
            String existing = loadedReadings(unicode);
            if (existing.length() > 0) {
                readings = existing + Field.COMMA + readings;
            }
        }
        String line = unicode + Field.SPACE + addSymbol(readings);

        URL resource = PYWriterUtils.class.getResource(path);
        if (resource == null) {
            throw new FileNotFoundException("Resource not found on classpath: " + path);
        }
        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(resource.getPath(), additional)));
        try {
            writer.write(line);
            writer.newLine();
            writer.flush();
        } finally {
            // Close even when the write fails so the file handle is not leaked.
            writer.close();
        }
        System.out.println(line);
        return true;
    }

    /**
     * Reloads the custom dictionary file into the live dictionary. Call this after
     * the file has been updated.
     *
     * @return true once the file has been loaded
     * @throws FileNotFoundException if the dictionary resource is not on the classpath
     * @throws IOException if reading fails
     */
    public static boolean reloadText() throws IOException{
        InputStream is = PYWriterUtils.class.getResourceAsStream(path);
        if (is == null) {
            throw new FileNotFoundException("Resource not found on classpath: " + path);
        }
        ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().load(is);
        return true;
    }

    /**
     * Returns the readings currently loaded for a key, without the surrounding
     * brackets, or an empty string when the key is unknown.
     */
    private static String loadedReadings(String unicode){
        Trie table = ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable();
        Trie node = (table == null) ? null : table.get(unicode);
        String stored = (node == null) ? null : node.getPinyin();
        if (stored == null) {
            return "";
        }
        stored = stored.trim();
        boolean bracketed = stored.length() >= 2
                && stored.startsWith(Field.LEFT_BRACKET)
                && stored.endsWith(Field.RIGHT_BRACKET);
        return bracketed ? stored.substring(1, stored.length() - 1) : stored;
    }

    /** Wraps a comma-separated reading list in the brackets the file format uses. */
    private static String addSymbol(String pinyin){
        return Field.LEFT_BRACKET+pinyin+Field.RIGHT_BRACKET;
    }

    /** Separators of the dictionary file format. */
    static class Field {
        static final String LEFT_BRACKET = "(";
        static final String RIGHT_BRACKET = ")";
        static final String COMMA = ",";
        static final String SPACE = " ";
    }
}
package com.gome.mx.plus.pinyin.ext; public enum Voice { One(1),Two(2),Three(3),Four(4); private final Integer value; Voice(Integer value){ this.value = value; } public Integer getValue(){ return value; } }
说明:默认自定义的文件地址为:path = "/pinyindb/gome_hanyu_pinyin_ext.txt" 即和pinyin4j的字典在相同的目录下
package com.gome.mx.plus.pinyin.ext; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; public class MyTest { public static void main(String[] args) throws BadHanyuPinyinOutputFormatCombination { // String[] fullPY = PYReadUtils.getFullPY("龦"); // for (String string : fullPY) { // System.out.println(string); // } try { //写入全新的字符到文件中 PYWriterUtils.writerControler("骉", "test", Voice.Two.getValue(),true, true); String[] fullPY = PYReadUtils.getFullPY("骉"); if(fullPY == null){ System.out.println("没有查到"); }else{ System.out.println("查到"); for (String string : fullPY) { System.out.println(string); } } PYWriterUtils.reloadText(); String[] full = PYReadUtils.getFullPY("骉"); if(full == null){ System.out.println("没有查到"); }else{ System.out.println("查到"); for (String string : full) { System.out.println(string); } } } catch (Exception e) { e.printStackTrace(); } } }