基于词典的逆向最大匹配中文分词算法，逆向分词比正向分词效果好

基于词典的逆向最大匹配中文分词算法，能实现中英文、数字混合分词，比如能分出这样的词：bb霜、3室、乐phone、touch4、mp3、T恤。实际分词效果比正向最大匹配分词更好。

查看第 2 版：逆向最大匹配分词程序，能实现中英文数字混合分词（第二版）

/**
 * Dictionary-based reverse maximum matching (RMM) word segmenter.
 *
 * <p>Each sentence is scanned from its last character towards its first, growing the
 * current word to the left for as long as the grown text is still found in the
 * dictionary. The dictionary is loaded once from the classpath resource
 * {@code dictionary.txt} and holds two kinds of entries: complete words, and proper
 * suffixes of complete words. The suffix entries let the scanner keep extending
 * leftwards through a partial match and back off again if no complete word results.
 */
public class RMM {
    private static final Log log = LogFactory.getLog(RMM.class);

    /** Dictionary value for an entry that is a complete dictionary word. */
    private static final int FULL_WORD = 1;

    /** Dictionary value for an entry that is only a suffix of some longer dictionary word. */
    private static final int SUFFIX_ONLY = 2;

    private static HashMap<String, Integer> dictionary = null;

    /** Dictionary lines longer than this many characters are skipped when loading. */
    private static final int WORD_MAX_LENGTH = 9;

    static {
        loadDictionary();
    }

    /**
     * Splits sentences into word tokens using reverse maximum matching.
     *
     * <p>The input list is not modified. Token offsets are computed from
     * {@code Sentence.getStartOffset()} plus the position of the word inside the
     * sentence text.
     *
     * @param list sentences to segment, in text order
     * @return the tokens of all sentences, in text order
     * @throws IOException never thrown by this implementation; kept in the signature
     *         for compatibility with existing callers
     */
    public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        // Visit sentences last-to-first by index instead of reversing the caller's list
        // in place; the collected tokens are flipped back into text order at the end.
        for (int s = list.size() - 1; s >= 0; s--) {
            Sentence sen = list.get(s);
            char[] text = sen.getText();
            StringBuilder word = new StringBuilder();
            int offset = sen.getStartOffset() + text.length;
            int bufferIndex = text.length - 1;
            boolean emit = false;
            while (bufferIndex > -1) {
                offset--;
                char c = text[bufferIndex--];
                if (word.length() == 0) {
                    word.append(c);
                } else {
                    Integer kind = dictionary.get(c + word.toString());
                    // Extend to the left when the longer text is a complete word, or when
                    // it is a known suffix and there are still characters left that could
                    // complete it.
                    if (kind != null && (kind.intValue() == FULL_WORD || bufferIndex > -1)) {
                        word.insert(0, c);
                    } else {
                        // Put the current character back; it starts the next word.
                        bufferIndex++;
                        offset++;
                        // The collected text may be only a suffix of a longer word that
                        // never completed: give back leading characters until what remains
                        // is a complete word or a single character.
                        while (word.length() > 1 && isSuffixOnly(word.toString())) {
                            word.deleteCharAt(0);
                            bufferIndex++;
                            offset++;
                        }
                        emit = true;
                    }
                }
                if (emit || bufferIndex == -1) {
                    // At this point offset is the absolute position of the word's first character.
                    tokenlist.add(new Token(word.toString(), offset, offset + word.length(), "word"));
                    word.setLength(0);
                    emit = false;
                }
            }
        }
        Collections.reverse(tokenlist);
        return tokenlist;
    }

    /** Returns true when {@code text} is in the dictionary only as a suffix of a longer word. */
    private static boolean isSuffixOnly(String text) {
        Integer kind = dictionary.get(text);
        return kind != null && kind.intValue() == SUFFIX_ONLY;
    }

    /**
     * Loads the dictionary from the classpath resource {@code dictionary.txt} (UTF-8,
     * one word per line) unless it has already been loaded.
     *
     * <p>Each line is lower-cased. Lines containing {@code #} or longer than
     * {@link #WORD_MAX_LENGTH} characters are skipped. Every accepted word is stored as a
     * complete word, and each of its proper suffixes of length two or more is stored as a
     * suffix-only entry unless that text is already present.
     *
     * <p>Failures are logged rather than thrown; the dictionary is then left empty or
     * partially filled.
     */
    public static void loadDictionary() {
        if (dictionary != null) {
            return;
        }
        dictionary = new HashMap<String, Integer>();
        InputStream is = null;
        BufferedReader br = null;
        try {
            // Read through the class loader's stream so the dictionary is also found when
            // it is packaged inside a jar, where the resource URL cannot be turned into a File.
            is = RMM.class.getClassLoader().getResourceAsStream("dictionary.txt");
            if (is == null) {
                log.error("dictionary.txt was not found on the classpath; the dictionary is empty");
                return;
            }
            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String word;
            while ((word = br.readLine()) != null) {
                // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
                word = word.toLowerCase(java.util.Locale.ROOT);
                if (word.indexOf("#") == -1 && word.length() <= WORD_MAX_LENGTH) {
                    dictionary.put(word, Integer.valueOf(FULL_WORD));
                    for (int i = 1; i < word.length() - 1; i++) {
                        String suffix = word.substring(i);
                        // Never downgrade an entry that is already known (possibly as a complete word).
                        if (!dictionary.containsKey(suffix)) {
                            dictionary.put(suffix, Integer.valueOf(SUFFIX_ONLY));
                        }
                    }
                }
            }
        } catch (IOException e) {
            log.error("Failed to load dictionary.txt; the dictionary may be incomplete", e);
        } finally {
            try {
                if (br != null) {
                    // Closing the reader also closes the wrapped stream.
                    br.close();
                } else if (is != null) {
                    is.close();
                }
            } catch (IOException e) {
                log.warn("Failed to close dictionary.txt", e);
            }
        }
    }

    /**
     * Segments all text available from {@code reader} and returns the words.
     *
     * <p>The text is split into sentences with {@code Util.getSentence}, segmented with
     * {@link #getToken}, and post-processed with {@code Util.getNewToken}.
     *
     * @param reader source of the text to segment
     * @return the segmented words in text order; empty if reading fails
     */
    public static String[] segWords(Reader reader) {
        ArrayList<String> list = new ArrayList<String>();
        try {
            ArrayList<Token> tlist = Util.getNewToken(getToken(Util.getSentence(reader)));
            for (Token t : tlist) {
                list.add(t.getWord());
            }
        } catch (IOException e) {
            log.error("Failed to read the text to segment", e);
        }
        return list.toArray(new String[0]);
    }

    /** Small demo: segments a sample sentence and prints one word per line. */
    public static void main(String[] args) {
        String[] cc = RMM.segWords(new StringReader("急、急、急、花里林居,二房二厅,业主诚心,出租".toLowerCase()));
        for (String c : cc) {
            System.out.println(c);
        }
    }
}

/**
 * Helpers used by the segmenter: splitting raw text into sentences, merging adjacent
 * single-character letter/digit tokens, and full-width to half-width conversion.
 */
public class Util
{
  /**
   * Splits the text from {@code reader} into sentences made only of lower-case letters,
   * decimal digits and "other letter" characters (the category CJK ideographs fall in).
   *
   * <p>Any other character ends the current sentence and is dropped. Full-width digits
   * and lower-case letters are converted to their ASCII forms via {@link #toAscii(int)}.
   * A sentence is recorded at every separator and at end of input, so consecutive
   * separators produce empty sentences. Each sentence carries the offset (counted in
   * chars read) of its first character.
   *
   * @param reader source of the text
   * @return the sentences in text order
   * @throws IOException if reading from {@code reader} fails
   */
  public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
  {
    ArrayList<Sentence> list = new ArrayList<Sentence>();
    StringBuilder cb = new StringBuilder();
    int d = reader.read();
    int offset = 0;
    while (d > -1)
    {
      boolean separator = !isSentenceChar(d);
      if (!separator)
      {
        cb.append((char) toAscii(d));
      }
      d = reader.read();
      if (d == -1 || separator)
      {
        // offset is the index of the character just processed. If that character was a
        // separator it already equals the sentence end; only when input ended on a
        // sentence character must it be advanced past that character. Advancing
        // unconditionally at end of input shifted the last sentence's offset by one
        // whenever the text ended with a separator.
        if (!separator)
        {
          offset++;
        }
        char[] ioBuffer = new char[cb.length()];
        cb.getChars(0, cb.length(), ioBuffer, 0);
        list.add(new Sentence(ioBuffer, offset - cb.length()));
        cb.setLength(0);
      }
      offset++;
    }
    return list;
  }

  /** Returns true for the character categories that may appear inside a sentence. */
  private static boolean isSentenceChar(int ch)
  {
    int type = Character.getType(ch);
    return type == Character.LOWERCASE_LETTER
        || type == Character.DECIMAL_DIGIT_NUMBER
        || type == Character.OTHER_LETTER;
  }

  /**
   * Merges runs of adjacent single-character tokens whose character is not an
   * "other letter" (so, given the output of {@code getSentence}, letters and digits)
   * into one token per run.
   *
   * <p>Two tokens are adjacent when the end of the first equals the start of the second.
   * The first token of a run is updated in place with the merged word and end offset.
   * All other tokens are passed through unchanged, in order.
   *
   * @param list tokens in text order
   * @return a new list with the merged tokens
   * @throws IOException never thrown by this implementation; kept in the signature for
   *         compatibility with existing callers
   */
  public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
  {
    ArrayList<Token> tokenlist = new ArrayList<Token>();
    Token pending = null; // run of single letters/digits currently being merged
    for (Token t : list)
    {
      String text = t.getWord();
      boolean mergeable = text.length() == 1
          && Character.getType(text.charAt(0)) != Character.OTHER_LETTER;
      if (!mergeable)
      {
        // A regular word ends any run in progress.
        if (pending != null)
        {
          tokenlist.add(pending);
          pending = null;
        }
        tokenlist.add(t);
      }
      else if (pending == null)
      {
        pending = t;
      }
      else if (pending.getEnd() == t.getStart())
      {
        pending.setEnd(t.getEnd());
        pending.setWord(pending.getWord() + t.getWord());
      }
      else
      {
        // Not contiguous with the current run: flush it and start a new one.
        tokenlist.add(pending);
        pending = t;
      }
    }
    if (pending != null)
    {
      tokenlist.add(pending);
    }
    return tokenlist;
  }

  /**
   * Converts a full-width digit or Latin letter to its half-width (ASCII) form.
   *
   * @param codePoint the character to convert
   * @return the ASCII equivalent for full-width 0-9, A-Z and a-z; otherwise
   *         {@code codePoint} unchanged
   */
  public static int toAscii(int codePoint)
  {
    if ((codePoint >= 65296 && codePoint <= 65305)    // full-width 0-9
        || (codePoint >= 65313 && codePoint <= 65338) // full-width A-Z
        || (codePoint >= 65345 && codePoint <= 65370) // full-width a-z
        )
    {
      // The full-width forms sit exactly 65248 (0xFEE0) above their ASCII counterparts.
      codePoint -= 65248;
    }
    return codePoint;
  }
}

基于词典的逆向最大匹配中文分词算法，逆向分词比正向分词效果好

你可能感兴趣的:(基于词典的逆向最大匹配中文分词算法，逆向分词比正向分词效果好)