1、WhitespaceAnalyzer
仅按空格切分文本,不对字符做小写(lowercase)化处理,不支持中文;
并且不对生成的词汇单元进行其他的规范化处理。
2、SimpleAnalyzer
功能强于WhitespaceAnalyzer, 首先会通过非字母字符来分割文本信息,然后将词汇单元统一为小写形式。该分析器会去掉数字类型的字符。
3、StopAnalyzer
StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基础上增加了去除英文中常用单词(如the,a等)的功能,也可以根据自己的需要设置常用单词;不支持中文。
4、StandardAnalyzer
英文的处理能力同于StopAnalyzer。对中文的支持采用单字切分的方法。它会将词汇单元转换成小写形式,并去除停用词和标点符号。
/**
 * Demonstrates how the four basic Lucene analyzers tokenize the same input.
 *
 * <p>Each public method builds one analyzer for {@code Version.LUCENE_36},
 * runs the given text through it, and prints the resulting tokens to standard
 * output as {@code AnalyzerName->[tok1],[tok2],...}.
 */
public class AnalyzerDemo {

    /**
     * Tokenizes {@code msg} with a {@link WhitespaceAnalyzer} and prints the tokens.
     *
     * @param msg the text to analyze
     */
    public void whitespaceAnalyzer(String msg) {
        WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
        this.getTokens(analyzer, msg);
    }

    /**
     * Tokenizes {@code msg} with a {@link SimpleAnalyzer} and prints the tokens.
     *
     * @param msg the text to analyze
     */
    public void simpleAnalyzer(String msg) {
        SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
        this.getTokens(analyzer, msg);
    }

    /**
     * Tokenizes {@code msg} with a {@link StopAnalyzer} and prints the tokens.
     *
     * @param msg the text to analyze
     */
    public void stopAnalyzer(String msg) {
        StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_36);
        this.getTokens(analyzer, msg);
    }

    /**
     * Tokenizes {@code msg} with a {@link StandardAnalyzer} and prints the tokens.
     *
     * @param msg the text to analyze
     */
    public void standardAnalyzer(String msg) {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        this.getTokens(analyzer, msg);
    }

    /**
     * Opens a token stream over {@code msg} for the field {@code "content"} and
     * prints its tokens, labelled with the analyzer's simple class name.
     *
     * @param analyzer the analyzer that produces the token stream
     * @param msg      the text to analyze
     */
    private void getTokens(Analyzer analyzer, String msg) {
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(msg));
        this.printTokens(analyzer.getClass().getSimpleName(), tokenStream);
    }

    /**
     * Consumes {@code tokenStream} and prints every term on a single line in the
     * form {@code analyzerType->[term1],[term2],...}.
     *
     * <p>The stream is driven through the full consumer workflow
     * (reset, incrementToken loop, end, close). If an {@link IOException}
     * occurs, its stack trace is printed and whatever tokens were collected so
     * far are still written out.
     *
     * @param analyzerType label printed before the token list
     * @param tokenStream  the stream to consume; it is closed before this method returns
     */
    private void printTokens(String analyzerType, TokenStream tokenStream) {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        StringBuilder result = new StringBuilder();
        try {
            // The TokenStream contract requires reset() before the first incrementToken().
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                if (result.length() > 0) {
                    result.append(",");
                }
                result.append("[").append(term.toString()).append("]");
            }
            // Signals end-of-stream so the stream can finish its end-of-stream work.
            tokenStream.end();
        } catch (IOException e) {
            // Best-effort demo: report the failure and still print the partial result.
            e.printStackTrace();
        } finally {
            try {
                // Releases the reader wrapped by the stream.
                tokenStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println(analyzerType + "->" + result.toString());
    }
}
测试
public class TestAnalyizer {
private TokenizerDemo demo = null; //private String msg = "我喜欢你,我的祖国!china 中国"; private String msg = "I love you, China!B2C"; @Before public void setUp() throws Exception { demo=new TokenizerDemo(); } @Test public void testWhitespaceAnalyzer(){ demo.whitespaceAnalyzer(msg); } @Test public void testSimpleAnalyzer(){ demo.simpleAnalyzer(msg); } @Test public void testStopAnalyzer(){ demo.stopAnalyzer(msg); } @Test public void testStandardAnalyzer(){ demo.standardAnalyzer(msg); } }