solr入门之pinyin4j源码改写动态添加扩展词及整合进war项目中

1.初始化时加载用户定义的字典
package net.sourceforge.pinyin4j;

import net.sourceforge.pinyin4j.multipinyin.Trie;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import com.gome.mx.plus.pinyin.ext.PYWriterUtils;

/**
 * Manages all external resources required by the PinyinHelper class: the
 * pinyin tables bundled on the classpath plus an optional user-defined
 * dictionary whose location is supplied by {@code PYWriterUtils.getPath()}.
 *
 * @author Li Min
 */
public class ChineseToPinyinResource {
    /**
     * Lookup table holding &lt;Unicode, HanyuPinyin&gt; pairs, keyed by the
     * upper-case hexadecimal code of the character.
     */
    private Trie unicodeToHanyuPinyinTable = null;

    /**
     * @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set.
     */
    private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) {
        this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable;
    }

    /**
     * Made public so that the user-dictionary writer can read existing
     * entries and reload the user dictionary into this table.
     *
     * @return Returns the unicodeToHanyuPinyinTable.
     */
    public Trie getUnicodeToHanyuPinyinTable() {
        return unicodeToHanyuPinyinTable;
    }

    /**
     * Private constructor as part of the singleton pattern.
     */
    private ChineseToPinyinResource() {
        initializeResource();
    }

    /**
     * Builds the lookup table: loads the bundled single-character table, the
     * bundled multi-pinyin table and its extension, and finally the user
     * dictionary when a path has been configured. I/O failures are reported
     * on stderr and leave the table with whatever was loaded up to that point.
     */
    private void initializeResource() {
        try {
            final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
            final String resourceMultiName = "/pinyindb/multi_pinyin.txt";
            final String userResourceName = PYWriterUtils.getPath();

            setUnicodeToHanyuPinyinTable(new Trie());
            getUnicodeToHanyuPinyinTable().load(ResourceHelper.getResourceInputStream(resourceName));

            getUnicodeToHanyuPinyinTable().loadMultiPinyin(ResourceHelper.getResourceInputStream(resourceMultiName));

            getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();

            // Load the user-defined dictionary last, after the bundled tables.
            if (userResourceName != null) {
                FileInputStream is = new FileInputStream(new File(userResourceName));
                try {
                    getUnicodeToHanyuPinyinTable().load(is);
                } finally {
                    // Always release the file handle; closing an already
                    // closed FileInputStream is a no-op, so this is safe
                    // regardless of what load() does with the stream.
                    is.close();
                }
            }

        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Converts a character to the key format used by the lookup table.
     *
     * @param ch the character to convert
     * @return the upper-case hexadecimal representation of {@code ch}
     */
    private static String toCodepointHex(char ch) {
        return Integer.toHexString((int) ch).toUpperCase();
    }

    /**
     * @param ch given Chinese character in Unicode
     * @return the table node stored under the character's hexadecimal code,
     * exactly as returned by the table lookup
     */
    Trie getHanyuPinyinTrie(char ch) {
        return getUnicodeToHanyuPinyinTable().get(toCodepointHex(ch));
    }

    /**
     * Get the unformatted Hanyu Pinyin representations of the given Chinese
     * character in array format.
     *
     * @param ch given Chinese character in Unicode
     * @return The Hanyu Pinyin strings of the given Chinese character in array
     * format; return null if there is no corresponding Pinyin string.
     */
    String[] getHanyuPinyinStringArray(char ch) {
        String pinyinRecord = getHanyuPinyinRecordFromChar(ch);
        return parsePinyinString(pinyinRecord);
    }

    /**
     * Splits a record of the form {@code (pinyin1,pinyin2,...)} into its
     * individual pinyin strings.
     *
     * @param pinyinRecord the raw record, may be null
     * @return the comma-separated items between the first left bracket and
     * the last right bracket; null if the record is null or has no right
     * bracket after the opening position (mal-formatted record)
     */
    String[] parsePinyinString(String pinyinRecord) {
        if (null == pinyinRecord) {
            return null; // no record found
        }

        int contentStart = pinyinRecord.indexOf(Field.LEFT_BRACKET) + Field.LEFT_BRACKET.length();
        int contentEnd = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET);

        // Without this guard substring() would throw on a record that lacks a
        // closing bracket; such a record is mal-formatted and yields null.
        if (contentEnd < contentStart) {
            return null;
        }

        return pinyinRecord.substring(contentStart, contentEnd).split(Field.COMMA);
    }

    /**
     * @param record given record string of Hanyu Pinyin
     * @return return true if record is not null and record is not "none0" and
     * record is not mal-formatted, else return false
     */
    private boolean isValidRecord(String record) {
        final String noneStr = "(none0)";

        return (null != record) && !record.equals(noneStr) && record.startsWith(Field.LEFT_BRACKET)
                && record.endsWith(Field.RIGHT_BRACKET);
    }

    /**
     * @param ch given Chinese character in Unicode
     * @return corresponding Hanyu Pinyin record from the lookup table; null
     * if no record is found or the record is not valid
     */
    private String getHanyuPinyinRecordFromChar(char ch) {
        Trie trie = getHanyuPinyinTrie(ch);
        String foundRecord = null;
        if (trie != null) {
            foundRecord = trie.getPinyin();
        }

        return isValidRecord(foundRecord) ? foundRecord : null;
    }

    /**
     * Singleton factory method.
     *
     * @return the one and only ChineseToPinyinResource.
     */
    public static ChineseToPinyinResource getInstance() {

        return ChineseToPinyinResourceHolder.theInstance;
    }


    /**
     * Singleton implementation helper: the instance is created when this
     * holder class is initialized, i.e. on the first getInstance() call.
     */
    private static class ChineseToPinyinResourceHolder {
        static final ChineseToPinyinResource theInstance = new ChineseToPinyinResource();
    }

    /**
     * A class encloses common string constants used in the record format.
     *
     * @author Li Min
     */
    class Field {
        static final String LEFT_BRACKET = "(";

        static final String RIGHT_BRACKET = ")";

        static final String COMMA = ",";
    }
}



批量写入功能添加

package com.gome.mx.plus.pinyin.ext;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import javax.xml.crypto.dsig.spec.ExcC14NParameterSpec;

import net.sourceforge.pinyin4j.ChineseToPinyinResource;
import net.sourceforge.pinyin4j.ResourceHelper;
import net.sourceforge.pinyin4j.multipinyin.MultiPinyinConfig;
import net.sourceforge.pinyin4j.multipinyin.Trie;
/**
 * Writes Chinese-character to pinyin mappings into a user dictionary file and
 * reloads that file into the shared pinyin table, so changes take effect
 * without restarting the service.
 * <ul>
 *   <li>The file location is configurable (once) through {@link #setPath(String)}.</li>
 *   <li>Callers choose whether to append to the file or overwrite it.</li>
 *   <li>Callers choose whether readings already present in the table are
 *       merged into the written line.</li>
 * </ul>
 *
 * @author songqinghu
 */
public class PYWriterUtils {

    /**
     * Reading stored in the table for characters that have no pinyin; the
     * resource class treats the record "(none0)" as "no record", so it must
     * not be merged into a real reading list.
     */
    private static final String NO_PINYIN = "none0";

    /** Absolute file-system path of the user dictionary; null until configured. */
    private static String path;

    /** True while the path may still be set; cleared by the first setPath call. */
    private static boolean flag = true;

    /**
     * Configures the location of the user dictionary. Only the first call has
     * any effect; later calls are ignored.
     *
     * @param path absolute path of the user dictionary file
     */
    public static void setPath(String path) {
        if (flag) {
            PYWriterUtils.path = path;
            flag = false; // the path can be set only once
        }
    }

    /**
     * @return the configured user dictionary path, or null if none was set
     */
    public static String getPath() {
        return PYWriterUtils.path;
    }

    /**
     * Writes one mapping using the defaults: append to the file and merge
     * with the readings already known for the character.
     *
     * @param word   the Chinese character (only the first char is used)
     * @param pinyin the pinyin without tone
     * @param voice  the tone number
     * @return true if the dictionary file exists and was processed
     * @throws Exception if no dictionary path is configured or I/O fails
     */
    public static boolean dufaultWriter(String word, String pinyin, Integer voice) throws Exception {
        return writerControler(word, pinyin, voice, true, true);
    }

    /**
     * Writes the mapping for a single character and then reloads the user
     * dictionary into the shared table.
     *
     * @param word       the Chinese character (only the first char is used)
     * @param pinyin     the pinyin without tone
     * @param voice      the tone number, appended to {@code pinyin}
     * @param additional true to append to the file, false to overwrite it
     * @param merge      true to merge readings already present in the table
     * @return true if the dictionary file exists and was processed; false if
     *         the configured file does not exist
     * @throws Exception if no dictionary path is configured, or if writing or
     *                   reloading fails
     */
    public static boolean writerControler(String word, String pinyin, Integer voice,
            boolean additional, boolean merge) throws Exception {

        String path = PYWriterUtils.path;
        if (path == null) {
            throw new Exception("找不到用户扩展字典");
        }
        File userMultiPinyinFile = new File(path);
        if (!userMultiPinyinFile.exists()) {
            return false;
        }

        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(userMultiPinyinFile, additional)));
        try {
            // Same line-building logic as the batch path; returns null when
            // the word is empty or does not start with a non-ASCII character.
            String line = midWriter(word, pinyin + voice, merge);
            if (line != null) {
                writer.write(line);
                writer.newLine();
            }
            writer.flush();
        } finally {
            // Release the file handle even when building/writing the line fails.
            writer.close();
        }
        // Writing finished: refresh the in-memory table.
        reloadText();
        return true;
    }

    /**
     * Writes mappings for many characters in one pass and then reloads the
     * user dictionary. Because the readings of one character are keyed by
     * pinyin, two readings that differ only in tone cannot be submitted in
     * the same call; submit them in two calls instead.
     *
     * @param contents   character to (pinyin without tone to tone number);
     *                   entries with a null character, or without any usable
     *                   reading, are skipped
     * @param additional true to append to the file, false to overwrite it
     * @param merge      true to merge readings already present in the table
     * @return true if the file exists and all entries were written; false if
     *         no path is configured, the file is missing, or an error occurred
     */
    public static boolean writerBatch(Map<String, Map<String, Integer>> contents, boolean additional, boolean merge) {
        BufferedWriter writer = null;
        try {
            if (path == null) {
                throw new Exception("请配置用户词典绝对路径");
            }
            File userMultiPinyinFile = new File(path);
            if (!userMultiPinyinFile.exists()) {
                return false;
            }
            writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(userMultiPinyinFile, additional)));

            for (Entry<String, Map<String, Integer>> entry : contents.entrySet()) {
                String word = entry.getKey();
                Map<String, Integer> readings = entry.getValue();
                if (word == null || readings == null) {
                    continue;
                }
                // Join "pinyin + tone" items with commas.
                StringBuilder joined = new StringBuilder();
                for (Entry<String, Integer> reading : readings.entrySet()) {
                    if (reading.getKey() == null || reading.getValue() == null) {
                        continue; // incomplete reading, nothing sensible to write
                    }
                    if (joined.length() > 0) {
                        joined.append(Field.COMMA);
                    }
                    joined.append(reading.getKey().trim()).append(reading.getValue());
                }
                if (joined.length() == 0) {
                    continue; // no readings supplied for this character
                }
                String line = midWriter(word.trim(), joined.toString(), merge);
                if (line != null) {
                    writer.write(line);
                    writer.newLine();
                }
            }
            writer.flush();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            // Reload in its own try so a failed close() cannot skip it.
            try {
                PYWriterUtils.reloadText();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return false;
    }

    /**
     * Builds the dictionary line for a single character.
     *
     * @param word   text whose first char is the character to map
     * @param pinyin comma-separated readings (tone included) to write
     * @param merge  true to prepend the readings already present in the table
     * @return the line to write, e.g. {@code E4A3 (ang3,yi1,wang3)}; null if
     *         {@code word} is null/empty or its first char is not above 128
     */
    private static String midWriter(String word, String pinyin, boolean merge) {
        if (word == null || word.length() == 0) {
            return null;
        }
        char c = word.charAt(0);
        if (c <= 128) { // not treated as a Chinese character
            return null;
        }
        String unicode = Integer.toHexString(c).toUpperCase(); // table key: upper-case hex
        if (merge) {
            Trie table = ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable();
            Trie node = table.get(unicode);
            if (node != null && node.getPinyin() != null) {
                pinyin = mergePinyin(node.getPinyin(), pinyin);
            }
            // No existing record: keep the pinyin unchanged.
        }
        return unicode + Field.SPACE + addSymbol(pinyin);
    }

    /**
     * Merges new readings into an existing record without duplicates,
     * keeping the existing readings first and preserving input order.
     *
     * @param existingRecord record from the table, normally {@code (a1,b2)}
     * @param pinyin         comma-separated readings to add
     * @return the merged comma-separated readings, without brackets
     */
    private static String mergePinyin(String existingRecord, String pinyin) {
        String existing = existingRecord.trim();
        // Strip the surrounding brackets only when they are really there.
        if (existing.length() >= 2 && existing.startsWith(Field.LEFT_BRACKET)
                && existing.endsWith(Field.RIGHT_BRACKET)) {
            existing = existing.substring(1, existing.length() - 1);
        }
        if (NO_PINYIN.equals(existing)) {
            existing = ""; // placeholder record: nothing to merge
        }

        Set<String> seen = new HashSet<String>();
        StringBuilder merged = new StringBuilder();
        appendUnique(merged, seen, existing.split(Field.COMMA));
        appendUnique(merged, seen, pinyin.trim().split(Field.COMMA));
        return merged.toString();
    }

    /**
     * Appends every non-empty, not yet seen token to {@code out}, separated
     * by commas.
     */
    private static void appendUnique(StringBuilder out, Set<String> seen, String[] tokens) {
        for (String token : tokens) {
            String reading = token.trim();
            if (reading.length() == 0 || !seen.add(reading)) {
                continue;
            }
            if (out.length() > 0) {
                out.append(Field.COMMA);
            }
            out.append(reading);
        }
    }

    /**
     * Batch write using the defaults: append to the file and merge with the
     * readings already known for each character.
     *
     * @param contents character to (pinyin without tone to tone number)
     * @return see {@link #writerBatch(Map, boolean, boolean)}
     */
    public static boolean defaultWriterBatch(Map<String, Map<String, Integer>> contents) {

        return writerBatch(contents, true, true);
    }

    /**
     * Reloads the user dictionary file into the shared pinyin table; call
     * this after the file has been changed.
     *
     * @return true if the file was loaded; false if no path is configured
     * @throws IOException if the file cannot be opened or read
     */
    public static boolean reloadText() throws IOException {
        if (path == null) {
            return false;
        }
        FileInputStream is = new FileInputStream(new File(path));
        try {
            ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().load(is);
        } finally {
            // Closing an already closed FileInputStream is a no-op, so this is
            // safe whether or not load() closes the stream itself.
            is.close();
        }
        return true;
    }

    /**
     * Wraps the readings in the brackets required by the dictionary format.
     */
    private static String addSymbol(String pinyin) {
        return Field.LEFT_BRACKET + pinyin + Field.RIGHT_BRACKET;
    }

    /** String constants of the dictionary line format. */
    class Field {
        static final String LEFT_BRACKET = "(";

        static final String RIGHT_BRACKET = ")";

        static final String COMMA = ",";

        static final String SPACE = " ";
    }
}


将jar和原有suggest工程进行整合

出现的问题:打包在 jar 中的自定义词典文件只能读取,无法写入。
===> 解决思路:将用户自定义词典放在运行的 war 工程中。

这样需要在项目启动时手动指定一次词典文件的位置;至此基本功能已经可以整合进项目中使用了。

package cn.com.mx.gome.suggest.controller;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;

import com.gome.mx.plus.pinyin.ext.PYReadUtils;
import com.gome.mx.plus.pinyin.ext.PYWriterUtils;

import cn.com.mx.gome.search.core.common.ResultData;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * HTTP endpoints for reading and extending the pinyin4j user-defined
 * dictionary.
 *
 * @author songqinghu
 */
@Controller
@RequestMapping("/suggest/pinyin")
public class PinYinController {

    private static final Logger logger = LoggerFactory.getLogger(PinYinController.class);

    /**
     * Returns the pinyin currently known for the given text. The original
     * note says a POST request is needed; the mapping itself does not
     * restrict the HTTP method.
     *
     * @param word the text to look up
     * @return success with the list of pinyin strings, or a failed result
     *         when the word is blank, nothing is found, or the lookup fails
     */
    @RequestMapping("/getpy")
    @ResponseBody
    public ResultData<List<String>> getPinYin(String word) {
        ResultData<List<String>> result = new ResultData<List<String>>();
        try {
            if (hasText(word)) {
                String[] fullPY = PYReadUtils.getFullPY(word);
                if (fullPY != null && fullPY.length > 0) {
                    List<String> list = new ArrayList<String>(fullPY.length);
                    for (String py : fullPY) {
                        list.add(py);
                    }
                    result.setData(list);
                    result.setSuccess(true);
                    return result;
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            logger.error("Failed to look up pinyin for word: " + word, e);
        }
        result.setSuccess(false);
        return result;
    }

    /**
     * Adds the mapping of one Chinese character to the user dictionary.
     *
     * @param word   the Chinese character
     * @param pinyin the pinyin without tone
     * @param voice  the tone number; must be present and greater than 0
     * @return success with the writer's return value as data, or a failed
     *         result when a parameter is missing/invalid or writing fails
     */
    @RequestMapping("/addpy")
    @ResponseBody
    public ResultData<Boolean> addPinYin(String word, String pinyin, Integer voice) {
        ResultData<Boolean> result = new ResultData<Boolean>();
        if (isValidMapping(word, pinyin, voice)) {
            try {
                boolean flag = PYWriterUtils.dufaultWriter(word, pinyin, voice);
                result.setData(flag);
                result.setSuccess(true);
                return result;
            } catch (Exception e) {
                logger.error("Failed to add pinyin mapping for word: " + word, e);
            }
        }
        result.setSuccess(false);
        return result;
    }

    /**
     * Test endpoint for the batch writer: the request parameters are only
     * validated; the data written is a fixed sample for the character "〇".
     *
     * @param word   any non-blank value
     * @param pinyin any non-blank value
     * @param voice  must be present and greater than 0
     * @return success with the batch writer's return value as data, or a
     *         failed result when a parameter is missing/invalid
     */
    @RequestMapping("/test")
    @ResponseBody
    public ResultData<Boolean> addtest(String word, String pinyin, Integer voice) {
        Map<String, Map<String, Integer>> contents = new HashMap<String, Map<String, Integer>>();

        HashMap<String, Integer> content = new HashMap<String, Integer>();

        content.put("test", 1);
        content.put("tttt", 2);
        content.put("ling", 1);
        contents.put("〇", content);

        ResultData<Boolean> result = new ResultData<Boolean>();
        if (isValidMapping(word, pinyin, voice)) {
            try {
                boolean flag = PYWriterUtils.defaultWriterBatch(contents);
                result.setData(flag);
                result.setSuccess(true);
                return result;
            } catch (Exception e) {
                logger.error("Failed to batch-add sample pinyin mappings", e);
            }
        }
        result.setSuccess(false);
        return result;
    }

    /**
     * @return true if {@code value} is non-null and not blank after trimming
     */
    private static boolean hasText(String value) {
        return value != null && value.trim().length() > 0;
    }

    /**
     * Validates the request parameters of a mapping. The explicit null check
     * on {@code voice} matters: a missing request parameter arrives as null
     * and comparing it with {@code > 0} directly would throw a
     * NullPointerException while unboxing.
     */
    private static boolean isValidMapping(String word, String pinyin, Integer voice) {
        return hasText(word) && hasText(pinyin) && voice != null && voice > 0;
    }

}



war工程使用  SSM架构 项目启动时加载词库所在位置工具类

package cn.com.mx.gome.suggest.component;
/**
 * 项目启动时加载指定的pinyin4j用户扩展字典
 * @author songqinghu
 *
 */

import javax.annotation.PostConstruct;

import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import com.gome.mx.plus.pinyin.ext.PYWriterUtils;

import cn.com.mx.gome.suggest.controller.PinYinController;

/**
 * Startup component that tells pinyin4j where the user-defined dictionary
 * lives. The location is read from the {@code PINYIN_FILE_PATH} property and
 * resolved as a classpath resource relative to this class.
 */
@Component
public class PinYinDataSourceFile {

    @Value("${PINYIN_FILE_PATH}")
    private String path;

    /**
     * Runs once after this bean has been constructed and its properties
     * injected: resolves the dictionary resource to a file-system path and
     * hands it to {@link PYWriterUtils#setPath(String)}.
     *
     * @throws IllegalStateException if the configured resource cannot be
     *         found on the classpath
     */
    @PostConstruct
    private void setFilePath() {
        java.net.URL resource = PinYinDataSourceFile.class.getResource(path);
        if (resource == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "pinyin4j user dictionary not found on classpath: " + path);
        }

        String pathFile;
        try {
            // URL.getPath() keeps percent-escapes (e.g. %20 for a space), which
            // is not a valid file-system path; going through the URI decodes them.
            pathFile = new java.io.File(resource.toURI()).getPath();
        } catch (java.net.URISyntaxException e) {
            pathFile = resource.getPath(); // fall back to the raw URL path
        } catch (IllegalArgumentException e) {
            // Not a plain file: URL (e.g. resource inside an archive).
            pathFile = resource.getPath();
        }
        PYWriterUtils.setPath(pathFile);
    }


}

最后附上 改写后的pinyin4j源码
链接:http://pan.baidu.com/s/1skUD8dv 密码:fhy4


你可能感兴趣的:(pinyin4j,Solr,源码改写,动态添加拼音,加载外部资源文件)