基于词典的逆向最大匹配中文分词算法,逆向分词比正向分词效果好
基于词典的逆向最大匹配中文分词算法,能实现中英文数字混合分词。比如能分出这样的词:bb霜、3室、乐phone、touch4、mp3、T恤。实际分词效果比正向分词效果好
查看第2版:逆向最大匹配分词程序,能实现中英文数字混合分词(第二版)
/**
 * Dictionary-based reverse maximum matching (RMM) Chinese word segmenter.
 *
 * <p>Every sentence is scanned from its last character towards its first. The current
 * candidate word grows to the left for as long as the dictionary knows the longer string,
 * either as a complete word or as the tail of a longer word. When it cannot grow any
 * further the candidate is trimmed back to a complete word (or a single character) and
 * emitted as a token.
 */
public class RMM
{
    private static final Log log = LogFactory.getLog(RMM.class);

    /** Dictionary value: the key is a complete word from dictionary.txt. */
    private static final int FULL_WORD = 1;

    /** Dictionary value: the key is only a suffix (two or more characters) of a longer word. */
    private static final int SUFFIX_ONLY = 2;

    /** Maps a word or word suffix to {@link #FULL_WORD} or {@link #SUFFIX_ONLY}. */
    private static HashMap<String, Integer> dictionary = null;

    /** Dictionary lines longer than this many characters are ignored when loading. */
    private static final int WORD_MAX_LENGTH = 9;

    static
    {
        loadDictionary();
    }

    /**
     * Splits sentences into words using reverse (right-to-left) maximum matching.
     *
     * <p>Sentences are visited from the last to the first and each one is scanned backwards;
     * the collected tokens are reversed before returning, so they come out in the same order
     * as the input sentences. The input list itself is not modified.
     *
     * @param list sentences to segment
     * @return the tokens, each built as {@code new Token(text, offset, offset + length, "word")}
     * @throws IOException not thrown by this body itself; kept in the signature for callers
     */
    public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        // Walk the sentences back to front instead of reversing the caller's list in place.
        for (int s = list.size() - 1; s >= 0; s--)
        {
            Sentence sen = list.get(s);
            char[] text = sen.getText();
            StringBuilder word = new StringBuilder();
            int offset = sen.getStartOffset() + text.length;
            int bufferIndex = text.length - 1;
            boolean wordDone = false;
            while (bufferIndex > -1)
            {
                offset--;
                char c = text[bufferIndex--];
                if (word.length() == 0)
                {
                    word.append(c);
                }
                else
                {
                    Integer kind = dictionary.get(c + word.toString());
                    // Grow when the longer string is a complete word, or when it is a known
                    // suffix and there are still characters left that could complete it.
                    if (kind != null && (kind == FULL_WORD || bufferIndex > -1))
                    {
                        word.insert(0, c);
                    }
                    else
                    {
                        // c does not belong to this word: hand it back to the scanner.
                        bufferIndex++;
                        offset++;
                        // Trim leading characters while the candidate is only a suffix of
                        // some longer word, handing each trimmed character back as well.
                        Integer current;
                        while (word.length() > 1
                                && (current = dictionary.get(word.toString())) != null
                                && current == SUFFIX_ONLY)
                        {
                            word.deleteCharAt(0);
                            bufferIndex++;
                            offset++;
                        }
                        wordDone = true;
                    }
                }
                if (wordDone || bufferIndex == -1)
                {
                    // offset now points at the first character of the finished word.
                    Token token = new Token(word.toString(), offset, offset + word.length(), "word");
                    word.setLength(0);
                    tokenlist.add(token);
                    wordDone = false;
                }
            }
        }
        Collections.reverse(tokenlist);
        return tokenlist;
    }

    /**
     * Loads {@code dictionary.txt} from the classpath into the dictionary, once.
     *
     * <p>Each line is lower-cased. Lines containing {@code #} or longer than
     * {@link #WORD_MAX_LENGTH} characters are skipped. Every accepted line is stored as a
     * complete word, and each of its suffixes of length two or more is stored as a
     * suffix-only entry unless that key is already present.
     *
     * <p>The resource is opened as a stream so it can also be read from inside a JAR. A
     * missing or unreadable resource is logged and leaves the dictionary empty or partially
     * filled; no exception is propagated.
     */
    public static void loadDictionary()
    {
        if (dictionary != null)
        {
            return;
        }
        dictionary = new HashMap<String, Integer>();
        InputStream is = null;
        BufferedReader br = null;
        try
        {
            is = RMM.class.getClassLoader().getResourceAsStream("dictionary.txt");
            if (is == null)
            {
                log.error("dictionary.txt was not found on the classpath; the dictionary is empty");
                return;
            }
            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String word = null;
            while ((word = br.readLine()) != null)
            {
                word = word.toLowerCase();
                if ((word.indexOf("#") == -1) && (word.length() <= WORD_MAX_LENGTH))
                {
                    dictionary.put(word, FULL_WORD);
                    for (int i = 1; i < word.length() - 1; i++)
                    {
                        String suffix = word.substring(i);
                        if (!dictionary.containsKey(suffix))
                        {
                            dictionary.put(suffix, SUFFIX_ONLY);
                        }
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Deliberately broad: this runs from the static initializer and must not make
            // class loading fail.
            log.error("Failed to load dictionary.txt", e);
        }
        finally
        {
            try
            {
                if (br != null)
                {
                    br.close();
                }
                if (is != null)
                {
                    is.close();
                }
            }
            catch (IOException e)
            {
                log.error("Failed to close dictionary.txt", e);
            }
        }
    }

    /**
     * Segments all text supplied by the reader and returns the words as strings.
     *
     * @param reader source of the text to segment
     * @return the segmented words; if an {@link IOException} occurs it is logged and whatever
     *         was collected up to that point is returned
     */
    public static String[] segWords(Reader reader)
    {
        ArrayList<String> list = new ArrayList<String>();
        try
        {
            ArrayList<Token> tlist = Util.getNewToken(getToken(Util.getSentence(reader)));
            for (Token t : tlist)
            {
                list.add(t.getWord());
            }
        }
        catch (IOException e)
        {
            log.error("Failed to segment the input text", e);
        }
        return list.toArray(new String[0]);
    }

    /**
     * Demo entry point: segments a sample sentence and prints one word per line.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        String[] cc = RMM.segWords(new StringReader("急、急、急、花里林居,二房二厅,业主诚心,出租".toLowerCase()));
        for (String c : cc)
        {
            System.out.println(c);
        }
    }
}
/**
 * Text helpers used by the segmenter: sentence splitting, merging of adjacent
 * single-character tokens, and full-width to half-width conversion.
 */
public class Util
{
    /**
     * Splits the reader's text into sentences made only of lower-case letters, decimal
     * digits and "other letters" (the Unicode category that CJK ideographs fall into).
     *
     * <p>Any other character ends the current sentence and is dropped. Full-width digits
     * and letters are converted to ASCII via {@link #toAscii(int)}. Consecutive separators
     * yield sentences with empty text. An empty input yields an empty list.
     *
     * <p>NOTE(review): upper-case letters are not in the accepted categories, so they act
     * as separators here; callers presumably lower-case their input first. Confirm that
     * this is intended.
     *
     * @param reader source of the text
     * @return the sentences in reading order, each carrying the offset of its first
     *         character counted in chars read from the reader
     * @throws IOException if reading from {@code reader} fails
     */
    public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
    {
        ArrayList<Sentence> list = new ArrayList<Sentence>();
        StringBuilder cb = new StringBuilder();
        int d = reader.read();
        int offset = 0; // index of the character currently held in d
        while (d > -1)
        {
            int type = Character.getType(d);
            boolean wordChar = type == Character.LOWERCASE_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER
                    || type == Character.OTHER_LETTER;
            if (wordChar)
            {
                cb.append((char) toAscii(d));
            }
            d = reader.read();
            if (d == -1 || !wordChar)
            {
                // Exclusive end of the sentence: one past the current character when it
                // belongs to the sentence, otherwise the separator's own index. Adding one
                // unconditionally at end of input shifted the last sentence by one whenever
                // the text ended with a separator.
                int end = wordChar ? offset + 1 : offset;
                char[] ioBuffer = new char[cb.length()];
                cb.getChars(0, cb.length(), ioBuffer, 0);
                list.add(new Sentence(ioBuffer, end - cb.length()));
                cb.setLength(0);
            }
            offset++;
        }
        return list;
    }

    /**
     * Joins runs of adjacent single-character tokens whose character is not an
     * "other letter" (for example ASCII letters and digits) into one token.
     *
     * <p>Two tokens are adjacent when the end of the first equals the start of the second.
     * The first token of a run is reused and updated in place via {@code setEnd} and
     * {@code setWord}; all other tokens are passed through unchanged.
     *
     * @param list tokens in text order
     * @return a new list with the merged tokens
     * @throws IOException not thrown by this body itself; kept in the signature for callers
     */
    public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        Token word = null; // run being built, not yet added to the result
        for (Token t : list)
        {
            String text = t.getWord();
            boolean mergeable = text.length() == 1
                    && Character.getType((int) text.charAt(0)) != Character.OTHER_LETTER;
            if (!mergeable)
            {
                // A regular word ends any pending run.
                if (word != null)
                {
                    tokenlist.add(word);
                    word = null;
                }
                tokenlist.add(t);
            }
            else if (word == null)
            {
                word = t;
            }
            else if (word.getEnd() == t.getStart())
            {
                word.setEnd(t.getEnd());
                word.setWord(word.getWord() + text);
            }
            else
            {
                // Not contiguous with the pending run: flush it and start a new one.
                tokenlist.add(word);
                word = t;
            }
        }
        if (word != null)
        {
            tokenlist.add(word);
        }
        return tokenlist;
    }

    /**
     * Converts a full-width digit or Latin letter to its ASCII counterpart.
     *
     * @param codePoint the code point to convert
     * @return the ASCII code point for U+FF10..U+FF19, U+FF21..U+FF3A and U+FF41..U+FF5A;
     *         any other value is returned unchanged
     */
    public static int toAscii(int codePoint)
    {
        if ((codePoint >= 65296 && codePoint <= 65305)     // full-width 0-9 (U+FF10..U+FF19)
            || (codePoint >= 65313 && codePoint <= 65338)  // full-width A-Z (U+FF21..U+FF3A)
            || (codePoint >= 65345 && codePoint <= 65370)) // full-width a-z (U+FF41..U+FF5A)
        {
            codePoint -= 65248; // 0xFEE0, the distance between the full-width and ASCII ranges
        }
        return codePoint;
    }
}
{
// Logger for this class.
private static final Log log = LogFactory.getLog(RMM.class);
// Word table filled by loadDictionary(): value 1 marks a complete word, value 2 marks a
// string that is only a suffix of some longer word.
private static HashMap<String, Integer> dictionary = null;
// Dictionary lines longer than this many characters are skipped when loading.
private static final int WORD_MAX_LENGTH = 9;
// Load the dictionary once, when the class is initialised.
static
{
loadDictionary();
}
//将句子切分出词,逆向最大匹配
/**
 * Splits sentences into words using reverse (right-to-left) maximum matching.
 *
 * <p>Sentences are visited from the last to the first and each one is scanned backwards;
 * the collected tokens are reversed before returning, so they come out in the same order
 * as the input sentences. The input list itself is not modified.
 *
 * @param list sentences to segment
 * @return the tokens, each built as {@code new Token(text, offset, offset + length, "word")}
 * @throws IOException not thrown by this body itself; kept in the signature for callers
 */
public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
{
    ArrayList<Token> tokenlist = new ArrayList<Token>();
    // Walk the sentences back to front instead of reversing the caller's list in place.
    for (int s = list.size() - 1; s >= 0; s--)
    {
        Sentence sen = list.get(s);
        char[] text = sen.getText();
        StringBuilder word = new StringBuilder();
        int offset = sen.getStartOffset() + text.length;
        int bufferIndex = text.length - 1;
        boolean wordDone = false;
        while (bufferIndex > -1)
        {
            offset--;
            char c = text[bufferIndex--];
            if (word.length() == 0)
            {
                word.append(c);
            }
            else
            {
                // Dictionary values: 1 = complete word, 2 = only a suffix of a longer word.
                Integer kind = dictionary.get(c + word.toString());
                // Grow when the longer string is a complete word, or when it is a known
                // suffix and there are still characters left that could complete it.
                if (kind != null && (kind == 1 || bufferIndex > -1))
                {
                    word.insert(0, c);
                }
                else
                {
                    // c does not belong to this word: hand it back to the scanner.
                    bufferIndex++;
                    offset++;
                    // Trim leading characters while the candidate is only a suffix of some
                    // longer word, handing each trimmed character back as well.
                    Integer current;
                    while (word.length() > 1
                            && (current = dictionary.get(word.toString())) != null
                            && current == 2)
                    {
                        word.deleteCharAt(0);
                        bufferIndex++;
                        offset++;
                    }
                    wordDone = true;
                }
            }
            if (wordDone || bufferIndex == -1)
            {
                // offset now points at the first character of the finished word.
                Token token = new Token(word.toString(), offset, offset + word.length(), "word");
                word.setLength(0);
                tokenlist.add(token);
                wordDone = false;
            }
        }
    }
    Collections.reverse(tokenlist);
    return tokenlist;
}
//加载词典
/**
 * Loads {@code dictionary.txt} from the classpath into the dictionary, once.
 *
 * <p>Each line is lower-cased. Lines containing {@code #} or longer than
 * {@code WORD_MAX_LENGTH} characters are skipped. Every accepted line is stored with
 * value 1 (complete word), and each of its suffixes of length two or more is stored with
 * value 2 (suffix only) unless that key is already present.
 *
 * <p>The resource is opened as a stream so it can also be read from inside a JAR. A
 * missing or unreadable resource is logged and leaves the dictionary empty or partially
 * filled; no exception is propagated.
 */
public static void loadDictionary()
{
    if (dictionary != null)
    {
        return;
    }
    dictionary = new HashMap<String, Integer>();
    InputStream is = null;
    BufferedReader br = null;
    try
    {
        is = RMM.class.getClassLoader().getResourceAsStream("dictionary.txt");
        if (is == null)
        {
            log.error("dictionary.txt was not found on the classpath; the dictionary is empty");
            return;
        }
        br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
        String word = null;
        while ((word = br.readLine()) != null)
        {
            word = word.toLowerCase();
            if ((word.indexOf("#") == -1) && (word.length() <= WORD_MAX_LENGTH))
            {
                dictionary.put(word, 1);
                for (int i = 1; i < word.length() - 1; i++)
                {
                    String suffix = word.substring(i);
                    if (!dictionary.containsKey(suffix))
                    {
                        dictionary.put(suffix, 2);
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        // Deliberately broad: this runs from the static initializer and must not make
        // class loading fail.
        log.error("Failed to load dictionary.txt", e);
    }
    finally
    {
        try
        {
            if (br != null)
            {
                br.close();
            }
            if (is != null)
            {
                is.close();
            }
        }
        catch (IOException e)
        {
            log.error("Failed to close dictionary.txt", e);
        }
    }
}
/**
 * Segments all text supplied by the reader and returns the words as strings.
 *
 * <p>The text is split into sentences, segmented with {@code getToken}, and adjacent
 * single-character tokens are merged by {@code Util.getNewToken}.
 *
 * @param reader source of the text to segment
 * @return the segmented words; if an {@link IOException} occurs it is logged with its
 *         stack trace and whatever was collected up to that point is returned
 */
public static String[] segWords(Reader reader)
{
    ArrayList<String> list = new ArrayList<String>();
    try
    {
        ArrayList<Token> tlist = Util.getNewToken(getToken(Util.getSentence(reader)));
        for (Token t : tlist)
        {
            list.add(t.getWord());
        }
    }
    catch (IOException e)
    {
        log.error("Failed to segment the input text", e);
    }
    return list.toArray(new String[0]);
}
/**
 * Demo entry point: segments a fixed sample sentence and prints one word per line.
 *
 * @param args unused
 */
public static void main(String[] args)
{
    StringReader sample = new StringReader("急、急、急、花里林居,二房二厅,业主诚心,出租".toLowerCase());
    String[] words = RMM.segWords(sample);
    for (int i = 0; i < words.length; i++)
    {
        System.out.println(words[i]);
    }
}
}
/**
 * Text helpers used by the segmenter: sentence splitting, merging of adjacent
 * single-character tokens, and full-width to half-width conversion.
 */
public class Util
{
    /**
     * Splits the reader's text into sentences made only of lower-case letters, decimal
     * digits and "other letters" (the Unicode category that CJK ideographs fall into).
     *
     * <p>Any other character ends the current sentence and is dropped. Full-width digits
     * and letters are converted to ASCII via {@link #toAscii(int)}. Consecutive separators
     * yield sentences with empty text. An empty input yields an empty list.
     *
     * <p>NOTE(review): upper-case letters are not in the accepted categories, so they act
     * as separators here; callers presumably lower-case their input first. Confirm that
     * this is intended.
     *
     * @param reader source of the text
     * @return the sentences in reading order, each carrying the offset of its first
     *         character counted in chars read from the reader
     * @throws IOException if reading from {@code reader} fails
     */
    public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
    {
        ArrayList<Sentence> list = new ArrayList<Sentence>();
        StringBuilder cb = new StringBuilder();
        int d = reader.read();
        int offset = 0; // index of the character currently held in d
        while (d > -1)
        {
            int type = Character.getType(d);
            boolean wordChar = type == Character.LOWERCASE_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER
                    || type == Character.OTHER_LETTER;
            if (wordChar)
            {
                cb.append((char) toAscii(d));
            }
            d = reader.read();
            if (d == -1 || !wordChar)
            {
                // Exclusive end of the sentence: one past the current character when it
                // belongs to the sentence, otherwise the separator's own index. Adding one
                // unconditionally at end of input shifted the last sentence by one whenever
                // the text ended with a separator.
                int end = wordChar ? offset + 1 : offset;
                char[] ioBuffer = new char[cb.length()];
                cb.getChars(0, cb.length(), ioBuffer, 0);
                list.add(new Sentence(ioBuffer, end - cb.length()));
                cb.setLength(0);
            }
            offset++;
        }
        return list;
    }

    /**
     * Joins runs of adjacent single-character tokens whose character is not an
     * "other letter" (for example ASCII letters and digits) into one token.
     *
     * <p>Two tokens are adjacent when the end of the first equals the start of the second.
     * The first token of a run is reused and updated in place via {@code setEnd} and
     * {@code setWord}; all other tokens are passed through unchanged.
     *
     * @param list tokens in text order
     * @return a new list with the merged tokens
     * @throws IOException not thrown by this body itself; kept in the signature for callers
     */
    public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        Token word = null; // run being built, not yet added to the result
        for (Token t : list)
        {
            String text = t.getWord();
            boolean mergeable = text.length() == 1
                    && Character.getType((int) text.charAt(0)) != Character.OTHER_LETTER;
            if (!mergeable)
            {
                // A regular word ends any pending run.
                if (word != null)
                {
                    tokenlist.add(word);
                    word = null;
                }
                tokenlist.add(t);
            }
            else if (word == null)
            {
                word = t;
            }
            else if (word.getEnd() == t.getStart())
            {
                word.setEnd(t.getEnd());
                word.setWord(word.getWord() + text);
            }
            else
            {
                // Not contiguous with the pending run: flush it and start a new one.
                tokenlist.add(word);
                word = t;
            }
        }
        if (word != null)
        {
            tokenlist.add(word);
        }
        return tokenlist;
    }

    /**
     * Converts a full-width digit or Latin letter to its ASCII counterpart.
     *
     * @param codePoint the code point to convert
     * @return the ASCII code point for U+FF10..U+FF19, U+FF21..U+FF3A and U+FF41..U+FF5A;
     *         any other value is returned unchanged
     */
    public static int toAscii(int codePoint)
    {
        if ((codePoint >= 65296 && codePoint <= 65305)     // full-width 0-9 (U+FF10..U+FF19)
            || (codePoint >= 65313 && codePoint <= 65338)  // full-width A-Z (U+FF21..U+FF3A)
            || (codePoint >= 65345 && codePoint <= 65370)) // full-width a-z (U+FF41..U+FF5A)
        {
            codePoint -= 65248; // 0xFEE0, the distance between the full-width and ASCII ranges
        }
        return codePoint;
    }
}