// 第一次写中文分词程序

public   class  MM2 
{
    
private   static   final  Log log  =  LogFactory.getLog(MM2. class );
    
    
private   static  HashMap < String, Integer >  dictionary  =   null
    
private   static   final   int  WORD_MAX_LENGTH  =   9 ;
    
private  Reader reader;
    
    
static
    
{
        loadDictionary();
    }

    
    
public  MM2(Reader reader) 
    

        
this .reader  =  reader; 
    }
 
    
    
// 切分出由中文、字母、数字组成的句子
     public  ArrayList < Sentence >  getSentence()  throws  IOException
    
{   
        ArrayList
< Sentence >  list = new  ArrayList < Sentence > ();
        StringBuffer cb
= new  StringBuffer();
        
int  d = reader.read();
        
int  offset = 0 ;
        
boolean  b = false ;
        
while (d >- 1 )
        
{
            
int  type = Character.getType(d);
            
if (type == 2   ||  type == 9   ||  type == 5 )
            
{
                d
= toAscii(d);
                cb.append((
char )d);
            }

            
else
            
{
                b
= true ;
            }

            d
= reader.read();
            
if (d ==- 1   ||  b)
            
{
                
if (d ==- 1 ) offset ++ ;
                b
= false ;
                
char [] ioBuffer  =   new   char [cb.length()];
                cb.getChars(
0 , cb.length(), ioBuffer,  0 );
                Sentence sen
= new  Sentence(ioBuffer,offset - cb.length());
                list.add(sen);
                cb.setLength(
0 );
            }

            offset
++ ;
        }

        
return  list;
    }

    
    
// 将句子切分出词
     public  ArrayList < Token >  getToken(ArrayList < Sentence >  list)  throws  IOException
    
{
        ArrayList
< Token >  tokenlist = new  ArrayList < Token > ();
        
for (Sentence sen:list)
        
{
            StringBuffer word 
=   new  StringBuffer();
            
int  offset = sen.getStartOffset();
            
int  bufferIndex  =   0 ;
            
char  c;
            
boolean  b = false ;
            
while (bufferIndex < sen.getText().length)
            
{
                offset
++ ;
                c
= sen.getText()[bufferIndex ++ ];
                
if (word.length() == 0 )
                    word.append(c);
                
else
                
{
                    String temp 
=  (word.toString()  +  c).intern();
                    
if (dictionary.containsKey(temp)  &&  dictionary.get(temp) == 1 )
                        word.append(c);
                    
else   if (dictionary.containsKey(temp)  &&  bufferIndex < sen.getText().length)
                        word.append(c);
                    
else
                    
{
                        bufferIndex
-- ;
                        offset
-- ;
                        
while (word.length() > 1   &&  dictionary.get(word.toString()) != null   &&  dictionary.get(word.toString()) == 2 )
                        
{
                            word.deleteCharAt(word.length()
- 1 );
                            bufferIndex
-- ;
                            offset
-- ;
                        }

                        b
= true ;
                    }

                }

                
if (b  ||  bufferIndex == sen.getText().length)
                
{
                    Token token 
=   new  Token(word.toString(),offset - word.length(),offset, " word " );
                    word.setLength(
0 );
                    tokenlist.add(token);
                    b
= false ;
                }

            }

        }

        
return  tokenlist;
    }

    
    
// 将相连的单个英文或数字组合成词
     public  ArrayList < Token >  getNewToken(ArrayList < Token >  list)  throws  IOException
    
{
        ArrayList
< Token >  tokenlist = new  ArrayList < Token > ();
        Token word
= null ;
        
for ( int  i = 0 ;i < list.size();i ++ )
        
{
            Token t
= list.get(i);
            
if (t.getWord().length() == 1   &&  Character.getType(( int )t.getWord().charAt( 0 )) != 5 )
            
{
                
if (word == null )
                    word
= t;
                
else   if (word.getEnd() == t.getStart())
                
{
                    word.setEnd(t.getEnd());
                    word.setWord(word.getWord()
+ t.getWord());
                }

                
else
                
{
                    tokenlist.add(word);
                    word
= t;
                }

            }

            
else   if (word != null )
            
{
                tokenlist.add(word);
                word
= null ;
                tokenlist.add(t);
            }

            
else
                tokenlist.add(t);
        }

        
if (word != null )
            tokenlist.add(word);
        
return  tokenlist;
    }

    
    
// 双角转单角
     public   static   int  toAscii( int  codePoint) 
    
{
        
if ((codePoint >= 65296   &&  codePoint <= 65305 )     // 0-9
                 ||  (codePoint >= 65313   &&  codePoint <= 65338 )     // A-Z
                 ||  (codePoint >= 65345   &&  codePoint <= 65370 )     // a-z
                )
        
{    
            codePoint 
-=   65248 ;
        }

        
return  codePoint;
    }

    
    
// 加载词典
     public   static   void  loadDictionary() 
    
{  
        
if  (dictionary  ==   null
        
{    
            dictionary 
=   new  HashMap < String, Integer > ();    
            InputStream is 
=   null ;    
            BufferedReader br 
=   null ;            
            
try
            
{
                is 
=   new  FileInputStream( new  File(MM2. class .getClassLoader().getResource( " dictionary.txt " ).toURI()));
                br 
=   new  BufferedReader( new  InputStreamReader(is,  " UTF-8 " ));
                String word 
=   null ;
                
while  ((word  =  br.readLine())  !=   null
                
{
                    word
= word.toLowerCase();
                    
if  ((word.indexOf( " # " ==   - 1 &&  (word.length()  <=  WORD_MAX_LENGTH))
                    
{
                        dictionary.put(word.intern(), 
1 );    
                        
int  i  =  word.length() - 1
                        
while (i  >=   2 )
                        
{
                            String temp 
=  word.substring( 0 , i).intern(); 
                            
if  ( ! dictionary.containsKey(temp))
                                dictionary.put(temp,
2 ); 
                            i
-- ;
                        }

                    }

                }

            }

            
catch  (Exception e) 
            
{      
                log.info(e);
            }

            
finally
            
{
                
try  
                
{      
                    
if (br != null )
                        br.close();   
                    
if (is != null )
                        is.close();  
                }

                
catch  (IOException e)
                
{     
                    log.info(e);
                }
            
            }
 
        }
 
    }

    
    
public   static  String[] segWords(Reader input)
    
{
        ArrayList
< String >  list = new  ArrayList < String > ();
        
try
        
{
            MM2 f
= new  MM2(input);
            ArrayList
< Token >  tlist =  f.getNewToken(f.getToken(f.getSentence()));
            
for (Token t:tlist)
            
{
                list.add(t.getWord());
            }

        }

        
catch (IOException e)
        
{
            log.info(e);
        }

        
return  (String[])list.toArray( new  String[ 0 ]);
    }

    
    
public   static   void  main(String[] args) 
    
{
        String[] cc
= MM2.segWords( new  StringReader( " ibm商务机t60p " .toLowerCase()));
        
for (String c:cc)
        
{
            System.out.println(c);
        }

    }

}

// 你可能感兴趣的:(中文分词)