/**
 * Dictionary-driven word segmenter using forward maximum matching.
 *
 * <p>Processing is a three-stage pipeline, chained together by
 * {@link #segWords(Reader)}:
 * <ol>
 *   <li>{@link #getSentence()} splits the input into runs of lowercase letters,
 *       decimal digits and "other letters" (e.g. CJK ideographs);</li>
 *   <li>{@link #getToken(ArrayList)} cuts each run into the longest words found
 *       in the dictionary;</li>
 *   <li>{@link #getNewToken(ArrayList)} glues adjacent single-character
 *       letter/digit tokens back together.</li>
 * </ol>
 *
 * <p>The dictionary is loaded once, from the classpath, when the class is
 * initialised. Uppercase letters are NOT accepted as sentence characters, and
 * dictionary entries are lower-cased on load, so callers are expected to
 * lower-case their input first (as {@link #main(String[])} does).
 */
public class MM2
{
    private static final Log log = LogFactory.getLog(MM2.class);

    /** Classpath name of the dictionary file (one entry per line, UTF-8). */
    private static final String DICTIONARY_RESOURCE = "dictionary.txt";

    /** Character encoding of the dictionary file. */
    private static final String DICTIONARY_CHARSET = "UTF-8";

    /** Dictionary lines containing this marker are ignored. */
    private static final String COMMENT_MARKER = "#";

    /**
     * Type string passed to every {@code Token} created by {@link #getToken(ArrayList)}.
     * NOTE(review): the surrounding spaces are kept exactly as in the original
     * source. They look like a formatting artifact, but downstream code may
     * compare against this value - confirm before trimming.
     */
    private static final String TOKEN_TYPE = " word ";

    /** Dictionary value: the key is a complete word. */
    private static final int FULL_WORD = 1;

    /** Dictionary value: the key is only a proper prefix of some longer word. */
    private static final int PREFIX_ONLY = 2;

    /** Maps each word / word prefix to {@link #FULL_WORD} or {@link #PREFIX_ONLY}. */
    private static HashMap<String, Integer> dictionary = null;

    /** Dictionary entries longer than this many characters are skipped on load. */
    private static final int WORD_MAX_LENGTH = 9;

    /** Source of the text to segment; consumed by {@link #getSentence()}. */
    private final Reader reader;

    static
    {
        loadDictionary();
    }

    /**
     * Creates a segmenter over the given input.
     *
     * @param reader source of the text to segment; it is read but never closed
     *               by this class
     */
    public MM2(Reader reader)
    {
        this.reader = reader;
    }

    /**
     * Reads the whole input and splits it into sentences: maximal runs of
     * lowercase letters, decimal digits and "other letters". Every other
     * character acts as a separator and is dropped. Full-width digits and
     * letters are converted to ASCII via {@link #toAscii(int)} before being
     * stored.
     *
     * <p>Empty runs (consecutive separators, or a separator at the very start or
     * end of the input) produce no sentence.
     *
     * @return the sentences in input order, each carrying the offset (counted in
     *         characters read) of its first character
     * @throws IOException if reading from the underlying reader fails
     */
    public ArrayList<Sentence> getSentence() throws IOException
    {
        ArrayList<Sentence> sentences = new ArrayList<Sentence>();
        StringBuilder run = new StringBuilder();
        int position = 0; // index of the character currently held in ch
        int ch = reader.read();
        while (ch > -1)
        {
            if (isSentenceChar(ch))
            {
                run.append((char) toAscii(ch));
            }
            else
            {
                // A separator at index `position` ends the run that precedes it.
                flushSentence(sentences, run, position);
            }
            position++;
            ch = reader.read();
        }
        // At end of input `position` is the total length, i.e. the exclusive end
        // of any run still pending.
        flushSentence(sentences, run, position);
        return sentences;
    }

    /** Tells whether a character belongs inside a sentence (lowercase letter, digit or other letter). */
    private static boolean isSentenceChar(int ch)
    {
        int type = Character.getType(ch);
        return type == Character.LOWERCASE_LETTER
            || type == Character.DECIMAL_DIGIT_NUMBER
            || type == Character.OTHER_LETTER;
    }

    /** Turns a non-empty pending run ending (exclusively) at {@code endOffset} into a Sentence and clears the run. */
    private static void flushSentence(ArrayList<Sentence> out, StringBuilder run, int endOffset)
    {
        int length = run.length();
        if (length == 0)
        {
            return;
        }
        char[] text = new char[length];
        run.getChars(0, length, text, 0);
        out.add(new Sentence(text, endOffset - length));
        run.setLength(0);
    }

    /**
     * Cuts every sentence into words using forward maximum matching against the
     * dictionary: characters are appended to the current word for as long as the
     * result is a dictionary word, or is a prefix of one and more input remains.
     * When the match can no longer be extended, the word is shortened until it
     * is a complete dictionary word (or a single character) and emitted.
     *
     * @param list sentences as produced by {@link #getSentence()}
     * @return the tokens of all sentences, in order; each token's start/end
     *         offsets are derived from the sentence's start offset
     * @throws IOException never thrown by this implementation; declared for
     *                     interface compatibility
     */
    public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    {
        ArrayList<Token> tokens = new ArrayList<Token>();
        for (Sentence sen : list)
        {
            char[] text = sen.getText();
            StringBuilder word = new StringBuilder();
            int offset = sen.getStartOffset(); // input offset just past the last consumed char
            int index = 0;                     // next char of `text` to consume
            while (index < text.length)
            {
                char c = text[index++];
                offset++;
                boolean wordComplete = false;
                if (word.length() == 0)
                {
                    // A word always starts with whatever character comes next.
                    word.append(c);
                }
                else
                {
                    Integer kind = dictionary.get(word.toString() + c);
                    boolean moreInput = index < text.length;
                    if (kind != null && (kind.intValue() == FULL_WORD || moreInput))
                    {
                        // Still a word, or a prefix that later input may complete.
                        word.append(c);
                    }
                    else
                    {
                        // c cannot extend the word: push it back ...
                        index--;
                        offset--;
                        // ... and give back trailing characters until what is
                        // left is a real word (or a single character).
                        while (word.length() > 1 && isPrefixOnly(word.toString()))
                        {
                            word.deleteCharAt(word.length() - 1);
                            index--;
                            offset--;
                        }
                        wordComplete = true;
                    }
                }
                if (wordComplete || index == text.length)
                {
                    tokens.add(new Token(word.toString(), offset - word.length(), offset, TOKEN_TYPE));
                    word.setLength(0);
                }
            }
        }
        return tokens;
    }

    /** Tells whether the dictionary knows {@code candidate} only as a prefix of a longer word. */
    private static boolean isPrefixOnly(String candidate)
    {
        Integer kind = dictionary.get(candidate);
        return kind != null && kind.intValue() == PREFIX_ONLY;
    }

    /**
     * Merges runs of adjacent single-character tokens whose character is not an
     * "other letter" (i.e. single Latin letters or digits) into one token.
     * Two tokens are adjacent when the end offset of the first equals the start
     * offset of the second. All other tokens are passed through unchanged.
     *
     * <p>Merging is done in place: the first token of a run is modified (its
     * word and end offset are extended) and reused in the result.
     *
     * @param list tokens as produced by {@link #getToken(ArrayList)}
     * @return a new list with the merged tokens, in the original order
     * @throws IOException never thrown by this implementation; declared for
     *                     interface compatibility
     */
    public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    {
        ArrayList<Token> merged = new ArrayList<Token>();
        Token pending = null; // merge candidate not yet written to `merged`
        for (Token t : list)
        {
            boolean mergeable = isMergeableSingle(t);
            if (mergeable && pending != null && pending.getEnd() == t.getStart())
            {
                // Directly follows the pending token: extend it.
                pending.setEnd(t.getEnd());
                pending.setWord(pending.getWord() + t.getWord());
                continue;
            }
            if (pending != null)
            {
                merged.add(pending);
                pending = null;
            }
            if (mergeable)
            {
                pending = t;
            }
            else
            {
                merged.add(t);
            }
        }
        if (pending != null)
        {
            merged.add(pending);
        }
        return merged;
    }

    /** Tells whether a token is a single character that is not an "other letter". */
    private static boolean isMergeableSingle(Token t)
    {
        String w = t.getWord();
        return w.length() == 1 && Character.getType(w.charAt(0)) != Character.OTHER_LETTER;
    }

    /**
     * Converts full-width digits and Latin letters to their ASCII (half-width)
     * counterparts; every other code point is returned unchanged.
     *
     * @param codePoint the character to convert
     * @return the ASCII equivalent for U+FF10-U+FF19, U+FF21-U+FF3A and
     *         U+FF41-U+FF5A, otherwise {@code codePoint} itself
     */
    public static int toAscii(int codePoint)
    {
        boolean fullWidthDigit = codePoint >= 0xFF10 && codePoint <= 0xFF19; // 0-9
        boolean fullWidthUpper = codePoint >= 0xFF21 && codePoint <= 0xFF3A; // A-Z
        boolean fullWidthLower = codePoint >= 0xFF41 && codePoint <= 0xFF5A; // a-z
        if (fullWidthDigit || fullWidthUpper || fullWidthLower)
        {
            // The full-width forms sit exactly 0xFEE0 above their ASCII originals.
            return codePoint - 0xFEE0;
        }
        return codePoint;
    }

    /**
     * Loads the dictionary from the classpath resource {@code dictionary.txt}
     * (UTF-8, one entry per line) if it has not been loaded yet.
     *
     * <p>Each line is lower-cased. Empty lines, lines containing {@code #} and
     * entries longer than {@link #WORD_MAX_LENGTH} characters are skipped. Every
     * accepted entry is stored as a full word, and each of its proper prefixes
     * of length 2 or more is stored as "prefix only" unless already present.
     *
     * <p>Loading is best-effort: if the resource is missing or unreadable the
     * problem is logged and the dictionary keeps whatever was loaded so far
     * (possibly nothing). No exception is propagated, because this method runs
     * from the static initialiser.
     */
    public static void loadDictionary()
    {
        if (dictionary != null)
        {
            return;
        }
        dictionary = new HashMap<String, Integer>();
        InputStream is = null;
        BufferedReader br = null;
        try
        {
            // getResourceAsStream also works when the dictionary is packaged in a JAR.
            is = MM2.class.getClassLoader().getResourceAsStream(DICTIONARY_RESOURCE);
            if (is == null)
            {
                log.error("Dictionary resource not found on classpath: " + DICTIONARY_RESOURCE);
                return;
            }
            br = new BufferedReader(new InputStreamReader(is, DICTIONARY_CHARSET));
            String line;
            while ((line = br.readLine()) != null)
            {
                // Fixed locale so the result does not depend on the JVM default.
                String word = line.toLowerCase(java.util.Locale.ENGLISH);
                if (word.length() == 0
                    || word.length() > WORD_MAX_LENGTH
                    || word.indexOf(COMMENT_MARKER) != -1)
                {
                    continue;
                }
                // A full word always wins over an earlier "prefix only" marking.
                dictionary.put(word, FULL_WORD);
                for (int len = word.length() - 1; len >= 2; len--)
                {
                    String prefix = word.substring(0, len);
                    if (!dictionary.containsKey(prefix))
                    {
                        dictionary.put(prefix, PREFIX_ONLY);
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Deliberately broad: nothing may escape into the static initialiser.
            log.error("Failed to load dictionary resource " + DICTIONARY_RESOURCE, e);
        }
        finally
        {
            try
            {
                if (br != null)
                {
                    br.close();
                }
                if (is != null)
                {
                    is.close();
                }
            }
            catch (IOException e)
            {
                log.error("Failed to close dictionary resource " + DICTIONARY_RESOURCE, e);
            }
        }
    }

    /**
     * Segments the given text and returns just the words.
     *
     * @param input text to segment; callers should lower-case it beforehand
     * @return the segmented words in order; empty if reading the input fails
     *         (the failure is logged, not thrown)
     */
    public static String[] segWords(Reader input)
    {
        ArrayList<String> words = new ArrayList<String>();
        try
        {
            MM2 segmenter = new MM2(input);
            ArrayList<Token> tokens =
                segmenter.getNewToken(segmenter.getToken(segmenter.getSentence()));
            for (Token t : tokens)
            {
                words.add(t.getWord());
            }
        }
        catch (IOException e)
        {
            log.error("Failed to read input while segmenting", e);
        }
        return words.toArray(new String[words.size()]);
    }

    /**
     * Small demo: segments a fixed sample string and prints one word per line.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        String[] words = MM2.segWords(new StringReader(" ibm商务机t60p ".toLowerCase()));
        for (String w : words)
        {
            System.out.println(w);
        }
    }
}