solr入门之pinyin4j的源码改写初尝试

目标:

pinyin4j中收录了很多的词,但是也有一些词语是未被收录的,目前想做到的效果是

能将未收录的词语收录进去,而且不需要重新启动服务

===================================================

源码观看

// Entry point called from the test code; the walkthrough below follows this call.
String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);

  /**
   * Looks up the Hanyu Pinyin strings of a single character.
   *
   * @param ch the character to convert
   * @param outputFormat the format applied to every returned pinyin string
   * @return the result of {@code getFormattedHanyuPinyinStringArray} for the same arguments
   * @throws BadHanyuPinyinOutputFormatCombination propagated from the formatting step
   */
  static public String[] toHanyuPinyinStringArray(char ch, HanyuPinyinOutputFormat outputFormat)
      throws BadHanyuPinyinOutputFormatCombination {
    final String[] formatted = getFormattedHanyuPinyinStringArray(ch, outputFormat);
    return formatted;
  }


  /**
   * Fetches the unformatted pinyin strings of {@code ch} and formats each entry in place.
   *
   * @param ch the character to convert
   * @param outputFormat the format handed to {@code PinyinFormatter.formatHanyuPinyin}
   * @return the same array with every element formatted, or {@code ARR_EMPTY} when the
   *         lookup returned {@code null}
   * @throws BadHanyuPinyinOutputFormatCombination propagated from the formatter
   */
  static private String[] getFormattedHanyuPinyinStringArray(char ch,
      HanyuPinyinOutputFormat outputFormat) throws BadHanyuPinyinOutputFormatCombination {
    final String[] readings = getUnformattedHanyuPinyinStringArray(ch);
    if (readings == null) {
      return ARR_EMPTY;
    }

    int index = 0;
    while (index < readings.length) {
      readings[index] = PinyinFormatter.formatHanyuPinyin(readings[index], outputFormat);
      index++;
    }
    return readings;
  }


//这里初始化是很重要的一步
  private static String[] getUnformattedHanyuPinyinStringArray(char ch) {
    return ChineseToPinyinResource.getInstance().getHanyuPinyinStringArray(ch);
  }


// Singleton accessor
  /**
   * Returns the shared instance stored in {@code ChineseToPinyinResourceHolder.theInstance}.
   */
  static ChineseToPinyinResource getInstance() {
    return ChineseToPinyinResourceHolder.theInstance;
  }

  /**
   * Singleton implementation helper. Holds the one {@code ChineseToPinyinResource} in a
   * static final field, so the instance is constructed when this nested class is initialized.
   */
  private static class ChineseToPinyinResourceHolder {
    static final ChineseToPinyinResource theInstance = new ChineseToPinyinResource();
  }

// Initialization
  /**
   * Private constructor; all setup is delegated to {@code initializeResource()}.
   */
  private ChineseToPinyinResource() {
    initializeResource();
  }

  /**
   * Builds the table of {@code <Unicode, HanyuPinyin>} pairs: installs a fresh {@code Trie}
   * and fills it from the single-character dictionary, the multi-pinyin dictionary and the
   * user extension dictionary, in that order. I/O failures are printed, not rethrown.
   */
  private void initializeResource() {
    final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
    final String resourceMultiName = "/pinyindb/multi_pinyin.txt";

    try {
      setUnicodeToHanyuPinyinTable(new Trie());

      // Single-character dictionary. (Author's note: the customization is simply to load
      // one more, user-defined file in the same way.)
      getUnicodeToHanyuPinyinTable().load(
          ResourceHelper.getResourceInputStream(resourceName));

      // Multi-character phrase dictionary.
      getUnicodeToHanyuPinyinTable().loadMultiPinyin(
          ResourceHelper.getResourceInputStream(resourceMultiName));

      // User-supplied multi-pinyin extension dictionary.
      getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();

    } catch (IOException ex) {
      // FileNotFoundException is an IOException, so this single handler covers both cases.
      ex.printStackTrace();
    }
  }



/**
 * Trie node used for pinyin lookup. Most of the dictionary-loading logic lives in this class.
 *
 * <p>Children are stored in {@link #values}, keyed by the upper-case hexadecimal code of a
 * character. A node may carry the pinyin of the character (or phrase) that ends at it, and a
 * {@code nextTire} node whose children match the following character of a phrase.
 */
public class Trie {

  /**
   * Charset used to decode dictionary files. It is fixed explicitly so that loading does not
   * depend on the platform default charset (the phrase dictionary contains Chinese text).
   */
  private static final String DICTIONARY_CHARSET = "UTF-8";

  private Hashtable<String, Trie> values = new Hashtable<String, Trie>(); // children of this node

  private String pinyin; // pinyin stored at this node

  private Trie nextTire; // next level, i.e. the node that matches the following character

  public String getPinyin() {
    return pinyin;
  }

  public void setPinyin(String pinyin) {
    this.pinyin = pinyin;
  }

  public Trie getNextTire() {
    return nextTire;
  }

  public void setNextTire(Trie nextTire) {
    this.nextTire = nextTire;
  }

  /**
   * Loads a single-character dictionary. Every line must consist of exactly two
   * space-separated fields, {@code <key> <pinyin>}; other lines are skipped.
   *
   * <p>When a key is already present (for example when a file is loaded again at runtime),
   * the new node takes over the phrase subtree of the node it replaces, so reloading does
   * not discard phrases that were registered through {@link #loadMultiPinyin(InputStream)}.
   *
   * @param inStream dictionary input stream; it is closed before this method returns
   * @throws IOException if reading or closing the stream fails
   */
  public synchronized void load(InputStream inStream) throws IOException {
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new InputStreamReader(inStream, DICTIONARY_CHARSET));
      String line;
      while ((line = reader.readLine()) != null) {
        String[] keyAndValue = line.split(" ");
        if (keyAndValue.length != 2) continue;

        Trie node = new Trie();
        node.pinyin = keyAndValue[1];

        Trie previous = get(keyAndValue[0]);
        if (previous != null) {
          // Keep whatever hangs below the old node; only the pinyin is being replaced.
          node.nextTire = previous.nextTire;
          node.values = previous.values;
        }
        put(keyAndValue[0], node);
      }
    } finally {
      close(reader, inStream);
    }
  }

  /**
   * Loads a multi-character (polyphone phrase) dictionary. Every line must consist of exactly
   * two space-separated fields, {@code <phrase> <pinyin>}; other lines are skipped. The phrase
   * is stored character by character, and the pinyin is written on the node of its last
   * character.
   *
   * @param inStream dictionary input stream; it is closed before this method returns
   * @throws IOException if reading or closing the stream fails
   */
  public synchronized void loadMultiPinyin(InputStream inStream) throws IOException {
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new InputStreamReader(inStream, DICTIONARY_CHARSET));
      String line;
      while ((line = reader.readLine()) != null) {
        String[] keyAndValue = line.split(" ");
        if (keyAndValue.length != 2) continue;

        char[] keys = keyAndValue[0].toCharArray(); // the phrase, more than one character
        String value = keyAndValue[1];              // pinyin of the whole phrase

        Trie currentTrie = this;
        for (int i = 0; i < keys.length; i++) {
          String hexString = Integer.toHexString(keys[i]).toUpperCase();

          Trie node = currentTrie.get(hexString);
          if (node == null) {
            // Unknown character at this level: register an empty node for it.
            node = new Trie();
            currentTrie.put(hexString, node);
          }

          if (i == keys.length - 1) {
            // Last character of the phrase: this node carries the phrase pinyin.
            node.pinyin = value;
            break;
          }

          // Not the last character: descend, creating the next level on demand. The node
          // for the following character is created by the next iteration.
          Trie next = node.getNextTire();
          if (next == null) {
            next = new Trie();
            node.setNextTire(next);
          }
          currentTrie = next;
        }
      }
    } finally {
      close(reader, inStream);
    }
  }

  /**
   * Loads the user-defined extension dictionary from {@code MultiPinyinConfig.multiPinyinPath}.
   * Nothing happens when the path is {@code null} or the file does not exist.
   *
   * @throws IOException if the file cannot be read
   */
  public void loadMultiPinyinExtend() throws IOException {
    String path = MultiPinyinConfig.multiPinyinPath;
    if (path != null) {
      File userMultiPinyinFile = new File(path);
      if (userMultiPinyinFile.exists()) {
        // loadMultiPinyin closes the stream it is given.
        loadMultiPinyin(new FileInputStream(userMultiPinyinFile));
      }
    }
  }

  public Trie get(String hexString) {
    return values.get(hexString);
  }

  public void put(String s, Trie trie) {
    values.put(s, trie);
  }

  /**
   * Closes the reader, which also closes the stream it wraps; falls back to closing the raw
   * stream when the reader was never created.
   */
  private static void close(BufferedReader reader, InputStream inStream) throws IOException {
    if (reader != null) {
      reader.close();
    } else if (inStream != null) {
      inStream.close();
    }
  }
}


准备工作--测试自定义词典文件的读写与加载方式

package com.git.pinyin;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;

/**
 * Scratch class for experimenting with the custom dictionary file: reading its entries and
 * appending new ones.
 *
 * <p>Author's note: a "soft reload" is possible by re-reading the configuration file
 * periodically.
 *
 * @author songqinghu
 */
public class ReadAndWriteTest {

    /** Classpath location of the custom dictionary file. */
    private static final String EXT_DICT_RESOURCE = "/pinyindb/gome_hanyu_pinyin_ext.txt";

    /** Charset used for both reading and writing the dictionary file. */
    private static final String DICT_CHARSET = "UTF-8";

    private final static Map<String,String> dict = new HashMap<String,String>();

    /**
     * Prints the character for code point U+3007.
     *
     * <p>Other experiments that can be run from here are {@code readText(EXT_DICT_RESOURCE)}
     * and {@code writeText(map)} with entries such as {@code "我" -> "wo"}.
     * Author's note: 龦 (9FA6, cháng) is not in the dictionary and is meant to be used for
     * testing later.
     */
    public static void main(String[] args) throws IOException {
        unicodeTohanzi("3007");
    }

    /**
     * Prints the character whose code point is given in hexadecimal.
     *
     * @param unicode hexadecimal code point, e.g. {@code "3007"}
     */
    private static void unicodeTohanzi(String unicode){
        int code = Integer.parseInt(unicode, 16);
        // Character.toChars also handles code points above U+FFFF, which a plain (char)
        // cast would silently truncate.
        System.out.println(new String(Character.toChars(code)));
    }

    /**
     * Appends entries to the custom dictionary file, one line per entry in the form
     * {@code <HEX> (<pinyin list>)}, where {@code <HEX>} is the upper-case hexadecimal code
     * of the first character of the key (the form the dictionary is keyed by) and the value
     * is a comma-separated pinyin list such as {@code xxx,xxx,xxx}.
     *
     * <p>Entries whose key is {@code null} or empty are skipped.
     *
     * @param content map from Chinese character to its pinyin list
     * @throws IOException if the dictionary resource cannot be found or written
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static void writeText(Map<String,String> content) throws IOException{
        java.net.URL resource = ReadAndWriteTest.class.getResource(EXT_DICT_RESOURCE);
        if (resource == null) {
            throw new FileNotFoundException("Resource not found on classpath: " + EXT_DICT_RESOURCE);
        }
        String path = resource.getPath();
        System.out.println(path);

        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path, true), DICT_CHARSET));
        try {
            for (Entry<String, String> entry : content.entrySet()) {
                String key = entry.getKey();
                if (key == null || key.length() == 0) {
                    continue;
                }
                // Only the first character of the key is used.
                String unicode = Integer.toHexString(key.charAt(0)).toUpperCase();
                writer.write(unicode + " (" + entry.getValue() + ")");
                writer.newLine();
            }
            writer.flush();
        } finally {
            // Close even when a write fails, so the file handle is not leaked.
            writer.close();
        }
        System.out.println("====");
    }

    /**
     * Reads the dictionary resource line by line and prints every well-formed entry.
     * A well-formed line has exactly two space-separated fields: a hexadecimal code and a
     * pinyin string. Malformed lines are skipped. I/O errors are printed, not rethrown.
     *
     * @param path classpath location of the file, e.g. {@code "/pinyindb/gome_hanyu_pinyin_ext.txt"}
     * @createTime 2016-04-06
     * @author songqinghu
     */
    private static synchronized void readText(String path){
        if(!StringUtils.isNotBlank(path)){
            return;
        }
        InputStream stream = ReadAndWriteTest.class.getResourceAsStream(path);
        if(stream == null){
            // getResourceAsStream returns null for a missing resource.
            System.err.println("Resource not found on classpath: " + path);
            return;
        }

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(stream, DICT_CHARSET));
            String line;
            while((line = reader.readLine())!=null){
                String[] values = line.split(" ");
                if(values.length != 2){
                    continue;
                }
                String unicode = values[0];
                int code;
                try {
                    code = Integer.parseInt(unicode, 16);
                } catch (NumberFormatException e) {
                    // Not a hexadecimal key: skip this line instead of aborting the read.
                    continue;
                }
                char ch = (char) code;
                String pinyin  = values[1];
                System.out.println("编码后的字符: " + unicode + "  对应的拼音:"+ pinyin);
                System.out.println(ch);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if(reader != null){
                    reader.close();
                }else{
                    stream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}


3.简单的向源码中添加类

从git上下载后,改写的源码地址

链接:http://pan.baidu.com/s/1i5zvYfz 密码:zodu


package com.gome.mx.plus.pinyin.ext;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Utility for converting Chinese text into its full pinyin spelling(s).
 *
 * @author songqinghu
 */
public class PYReadUtils {

    /**
     * Returns every full-pinyin spelling of {@code words}. Each character above code 128 is
     * looked up through {@code PinyinHelper}; because a character may have several readings,
     * the result contains one string per combination of readings, without duplicates. All
     * other characters, and characters for which the lookup yields nothing, are ignored.
     *
     * @param words the text to convert; may be {@code null}
     * @return the distinct spellings in no particular order, or {@code null} when
     *         {@code words} is {@code null} or no character could be converted
     * @throws BadHanyuPinyinOutputFormatCombination propagated from {@code PinyinHelper}
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static String[] getFullPY(String words) throws BadHanyuPinyinOutputFormatCombination{
        if(words == null){
            return null;
        }

        HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
        defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        // One entry per converted character, in text order; each map is used as a set of the
        // distinct readings of that character.
        ArrayList<Map<String, Integer>> perCharacter = new ArrayList<Map<String,Integer>>();

        for (char c : words.toCharArray()) {
            if(c <= 128){
                continue; // not a Chinese character: filtered out
            }
            String[] results = PinyinHelper.toHanyuPinyinStringArray(c, defaultFormat);
            if(results == null || results.length == 0){
                continue; // no reading known for this character
            }
            Map<String, Integer> readings = new HashMap<String,Integer>();
            for (String reading : results) {
                readings.put(reading, 1);
            }
            perCharacter.add(readings);
        }

        return toStringArr(combine(perCharacter));
    }

    /**
     * Concatenates the readings of consecutive characters in every possible combination.
     *
     * @return the distinct combined spellings as map keys, or {@code null} for an empty list
     */
    private static Map<String, Integer> combine(ArrayList<Map<String, Integer>> perCharacter){
        Map<String, Integer> spellings = null;

        for (Map<String, Integer> readings : perCharacter) {
            Map<String, Integer> extended = new HashMap<String,Integer>();
            if(spellings == null){
                // First character: its readings are the initial spellings.
                extended.putAll(readings);
            }else{
                for (String prefix : spellings.keySet()) {
                    for (String reading : readings.keySet()) {
                        extended.put(prefix + reading, 1);
                    }
                }
            }
            spellings = extended;
        }
        return spellings;
    }

    /**
     * Copies the keys of {@code map} into an array.
     *
     * @return the keys, or {@code null} when the map is {@code null} or empty
     */
    private static String[] toStringArr(Map<String,Integer> map){
        if(map == null || map.isEmpty()){
            return null;
        }
        Set<String> keys = map.keySet();
        return keys.toArray(new String[keys.size()]);
    }

}


package com.gome.mx.plus.pinyin.ext;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import net.sourceforge.pinyin4j.ChineseToPinyinResource;
import net.sourceforge.pinyin4j.multipinyin.Trie;
/**
 * Writes Chinese characters and their pinyin into a custom dictionary file and reloads that
 * file into the running pinyin table, so that no service restart is needed.
 *
 * <p>The file location can be changed with {@link #setPath(String)}. A write can either
 * append to the file or overwrite it, and can optionally merge the readings that are already
 * known for the character.
 *
 * @author songqinghu
 */
public class PYWriterUtils {

    private static String path  = "/pinyindb/gome_hanyu_pinyin_ext.txt";

    /**
     * Sets the classpath location of the custom dictionary file.
     *
     * @param path classpath resource path of the dictionary file
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static void setPath(String path){
        PYWriterUtils.path = path;
    }

    /**
     * Default write mode: appends to the file and merges the readings already known for the
     * character.
     *
     * @param word   the Chinese character
     * @param pinyin the pinyin
     * @param voice  the tone number
     * @return {@code true} if a line was written
     * @throws Exception see {@link #writerControler(String, String, Integer, boolean, boolean)}
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static boolean dufaultWriter(String word,String pinyin,Integer voice) throws Exception{
        return writerControler(word, pinyin, voice, true, true);
    }

    /**
     * Writes one dictionary line, {@code <HEX> (<pinyin list>)}, for the first character of
     * {@code word}. Only a single character is handled per call (author's note: a batch
     * variant is still to be added).
     *
     * <p>Nothing is written, and the file is not opened at all, when {@code word} is
     * {@code null} or empty, {@code pinyin} is {@code null}, or the first character has a
     * code of 128 or below.
     *
     * @param word       the Chinese character; only its first character is used
     * @param pinyin     the pinyin
     * @param voice      the tone number appended to {@code pinyin}; {@code null} appends nothing
     * @param additional {@code true} to append to the file, {@code false} to overwrite it
     * @param merge      {@code true} to put the readings already in the pinyin table in front
     *                   of the new one; a character that is not in the table yet is written
     *                   with the new reading only
     * @return {@code true} if a line was written, {@code false} otherwise
     * @throws Exception an {@link IOException} if the dictionary resource cannot be found or
     *                   written
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static boolean writerControler(String word,String pinyin,Integer voice,
            boolean additional ,boolean merge) throws Exception{
        if(word == null || word.length() == 0 || pinyin == null){
            return false;
        }
        char c = word.charAt(0);
        if(c <= 128){ // not a Chinese character
            return false;
        }

        // Append the tone number.
        String readings = (voice == null) ? pinyin : pinyin + voice;

        // Dictionary key: upper-case hexadecimal code of the character.
        String unicode = Integer.toHexString(c).toUpperCase();
        if(merge){
            String known = knownReadings(unicode);
            if(known.length() > 0){
                readings = known + Field.COMMA + readings;
            }
        }
        String line = unicode + Field.SPACE + addSymbol(readings);

        // Open the file only once there is something to write: in overwrite mode, merely
        // opening it would already truncate it.
        java.net.URL resource = PYWriterUtils.class.getResource(path);
        if(resource == null){
            throw new IOException("Dictionary resource not found on classpath: " + path);
        }
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(resource.getPath(), additional), "UTF-8"));
        try {
            writer.write(line);
            writer.newLine();
            writer.flush();
        } finally {
            // Close even when the write fails, so the file handle is not leaked.
            writer.close();
        }
        System.out.println(line);

        return true;
    }

    /**
     * Reloads the custom dictionary file into the pinyin table; call this after the file has
     * been updated.
     *
     * @return {@code true} when the file has been loaded
     * @throws IOException if the dictionary resource cannot be found or read
     * @createTime 2016-04-06
     * @author songqinghu
     */
    public static boolean reloadText() throws IOException{
        InputStream is = PYWriterUtils.class.getResourceAsStream(path);
        if(is == null){
            // getResourceAsStream returns null for a missing resource.
            throw new IOException("Dictionary resource not found on classpath: " + path);
        }
        ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().load(is);
        return true;
    }

    /**
     * Returns the readings currently stored in the pinyin table for {@code unicode}, without
     * the surrounding brackets, or an empty string when the character is not in the table.
     */
    private static String knownReadings(String unicode){
        Trie table = ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable();
        Trie node = table.get(unicode);
        if(node == null || node.getPinyin() == null){
            return "";
        }
        String recorded = node.getPinyin().trim();
        boolean bracketed = recorded.length() >= 2
                && recorded.startsWith(Field.LEFT_BRACKET)
                && recorded.endsWith(Field.RIGHT_BRACKET);
        return bracketed ? recorded.substring(1, recorded.length() - 1) : recorded;
    }

    /**
     * Wraps a pinyin list in the brackets used by the dictionary format.
     */
    private static String addSymbol(String pinyin){
        return Field.LEFT_BRACKET+pinyin+Field.RIGHT_BRACKET;
    }

    /** Separators of the dictionary line format. */
    static class Field {
        static final String LEFT_BRACKET = "(";

        static final String RIGHT_BRACKET = ")";

        static final String COMMA = ",";

        static final String SPACE = " ";
    }
}


package com.gome.mx.plus.pinyin.ext;

/**
 * Tone numbers 1 to 4, each constant carrying its number as an {@code Integer}.
 */
public enum Voice {

    One(Integer.valueOf(1)),
    Two(Integer.valueOf(2)),
    Three(Integer.valueOf(3)),
    Four(Integer.valueOf(4));

    /** The tone number of this constant. */
    private final Integer number;

    Voice(Integer number){
        this.number = number;
    }

    /**
     * @return the tone number of this constant
     */
    public Integer getValue(){
        return this.number;
    }
}

测试类--第一次写入后,如果不重新加载进 map 中,则读取不到;重新加载后就能读取到了

说明:默认自定义的文件地址为:path  = "/pinyindb/gome_hanyu_pinyin_ext.txt"  即和pinyin4j的字典在相同的目录下

package com.gome.mx.plus.pinyin.ext;

import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

public class MyTest {

    /**
     * Round trip for one character: write it to the custom dictionary, look it up, reload
     * the dictionary, and look it up again. Any exception is printed, not rethrown.
     * (A plain lookup such as {@code PYReadUtils.getFullPY("龦")} can be tried the same way.)
     */
    public static void main(String[] args) throws BadHanyuPinyinOutputFormatCombination {
        try {
            // Write a brand-new character to the file.
            PYWriterUtils.writerControler("骉", "test", Voice.Two.getValue(),true, true);

            // Lookup before the file is reloaded.
            report(PYReadUtils.getFullPY("骉"));

            PYWriterUtils.reloadText();

            // Lookup after the reload.
            report(PYReadUtils.getFullPY("骉"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints whether a lookup found anything and, if so, every spelling on its own line.
     */
    private static void report(String[] spellings) {
        if (spellings != null) {
            System.out.println("查到");
            for (int i = 0; i < spellings.length; i++) {
                System.out.println(spellings[i]);
            }
            return;
        }
        System.out.println("没有查到");
    }

}







你可能感兴趣的:(pinyin4j,Solr,源码改写,自定义添加汉字)