Java课程设计-文档相似性检查系统-字符串中文分词类

突然说起中文分词,真是无从下手。这东西涉及中文,肯定需要引入其它类包。查了查资料,需要用到 Lucene 这个开源全文检索引擎工具包,再配合基于 Lucene 的中文分词器 IKAnalyzer(资料中提到的是 IKAnalyzer 3.0 中文分词器;IKAnalyzer 是单独的 JAR,并不包含在 Lucene 自带的包里)。

需要的JAR如下

lucene-analyzers-2.4.1.jar  下载

lucene-core-2.4.1.jar  下载

IKAnalyzer2.0.20BF.jar 下载

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.mira.lucene.analysis.MIK_CAnalyzer;


/**
 * Small demo that runs a piece of text through several Lucene analyzers and
 * shows the tokens each one produces.
 *
 * <p>The three {@code test*} methods print tokens to standard output, while
 * {@link #transJe(String, String, String)} returns the tokens produced by
 * {@code MIK_CAnalyzer} as one comma-terminated string.
 *
 * <p>All methods are best-effort: any exception raised while analyzing is
 * printed with {@code printStackTrace()} and not rethrown.
 */
public class JeAnalyzer
{

	/** Sample sentence segmented by {@link #main(String[])}. */
	private static final String SAMPLE_TEXT = "我喜欢看电视视频,不喜欢看电影。";

	/**
	 * Tokenizes {@code testString} with {@code StandardAnalyzer} and prints one
	 * token per line to standard output.
	 *
	 * @param testString text to tokenize
	 */
	public static void testStandard(String testString)
	{
		try
		{
			printTokens(new StandardAnalyzer(), testString, "=====standard analyzer====");
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Tokenizes {@code testString} with {@code CJKAnalyzer} and prints one
	 * token per line to standard output.
	 *
	 * @param testString text to tokenize
	 */
	public static void testCJK(String testString)
	{
		try
		{
			printTokens(new CJKAnalyzer(), testString, "=====cjk analyzer====");
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Tokenizes {@code testString} with {@code ChineseAnalyzer} and prints one
	 * token per line to standard output.
	 *
	 * <p>The misspelled method name is kept as-is so existing callers keep
	 * compiling.
	 *
	 * @param testString text to tokenize
	 */
	public static void testChiniese(String testString)
	{
		try
		{
			printTokens(new ChineseAnalyzer(), testString, "=====chinese analyzer====");
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Tokenizes {@code testString} with {@code MIK_CAnalyzer} and joins the
	 * tokens into a single string, each token followed by a comma
	 * (so a non-empty result always ends with {@code ","}).
	 *
	 * <p>If analysis fails part-way, the stack trace is printed and the tokens
	 * collected so far are returned; if it fails before the first token, the
	 * result is the empty string.
	 *
	 * @param testString text to tokenize
	 * @param c1 unused; retained only so existing callers keep compiling
	 * @param c2 unused; retained only so existing callers keep compiling
	 * @return the tokens, each terminated by a comma
	 */
	public static String transJe(String testString,String c1,String c2)
	{
		// StringBuilder avoids re-copying the whole result on every token.
		StringBuilder joined = new StringBuilder();
		try
		{
			Analyzer analyzer = new MIK_CAnalyzer();
			TokenStream stream = analyzer.tokenStream("", new StringReader(testString));
			try
			{
				Token token;
				while ((token = stream.next()) != null)
				{
					joined.append(token.termText()).append(',');
				}
			}
			finally
			{
				stream.close();
			}
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		return joined.toString();
	}

	/**
	 * Prints the sample sentence, then each token {@code transJe} extracts
	 * from it, one per line.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args)
	{
		try
		{
			String testString = SAMPLE_TEXT;
			System.out.println(testString);
			String[] tokens = transJe(testString,"gb2312","utf-8").split(",");
			for (String token : tokens)
			{
				System.out.println(token);
			}
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Runs {@code text} through {@code analyzer}, writes {@code banner} to
	 * standard error, then writes every token on its own line to standard
	 * output.
	 *
	 * <p>The stream is handled through the {@code TokenStream} type that
	 * {@code Analyzer.tokenStream} declares, rather than being downcast to a
	 * concrete filter class, so the method does not depend on which stream
	 * implementation a given analyzer happens to return.
	 *
	 * @param analyzer analyzer used to split the text
	 * @param text text to tokenize
	 * @param banner header line written to standard error before the tokens
	 * @throws IOException if reading from or closing the token stream fails
	 */
	private static void printTokens(Analyzer analyzer, String text, String banner) throws IOException
	{
		TokenStream stream = analyzer.tokenStream("", new StringReader(text));
		try
		{
			System.err.println(banner);
			Token token;
			while ((token = stream.next()) != null)
			{
				System.out.println(token.termText());
			}
		}
		finally
		{
			stream.close();
		}
	}

}


你可能感兴趣的:(java,编程,代码)