中文分词

// Dictionary of known words: only the keys are used (looked up via containsKey); initWordMap stores null as every value.
	private Map<String, String> map = new HashMap<String, String>();
	
	// Maximum phrase length, in characters, that cent() tries when matching against the dictionary.
	private final static int MAX_PHRASE = 5;
	
	/**
	 * Demo entry point: loads the dictionary and prints the segmentation
	 * of a fixed sample sentence.
	 *
	 * @param args command-line arguments (unused)
	 * @throws Exception if the dictionary cannot be loaded
	 */
	public static void main(String[] args) throws Exception
	{
		final String sample = "“标准”输出流。此流已打开并准备接受输出数据。通常,此流对应于显示器输出或者由主机环境或用户指定的另一个输出目标。";

		StrUtil segmenter = new StrUtil();
		segmenter.initWordMap();
		segmenter.cent(sample);
	}
	
	/**
	 * Segments {@code msg} into words and prints the result to standard
	 * output, with every token followed by a single space.
	 * <p>
	 * At each position, candidate phrases of 1 to {@code MAX_PHRASE}
	 * characters are looked up in the dictionary, shortest first; the first
	 * candidate found is emitted as a word and scanning resumes right after
	 * it. If no candidate is in the dictionary, the single character at the
	 * current position is emitted on its own.
	 *
	 * @param msg the text to segment
	 */
	public void cent(String msg)
	{
		StringBuilder ret = new StringBuilder();
		char[] c = msg.toCharArray();
		int length = c.length;
		for(int i = 0; i < length; i++)
		{
			// Never build a candidate that runs past the end of the text.
			// (Clamping the index to the last character instead would pad the
			// candidate with repeated copies of that character and could match
			// a dictionary word that does not actually occur in the input.)
			int maxLen = Math.min(MAX_PHRASE, length - i);

			// Length of the dictionary word found at position i, or 0 if none.
			int matched = 0;
			for(int len = 1; len <= maxLen; len++)
			{
				if(map.containsKey(new String(c, i, len)))
				{
					matched = len;
					break;
				}
			}

			if(matched > 0)
			{
				ret.append(c, i, matched).append(' ');
				// Skip the rest of the matched word; the loop's i++ moves past its last character.
				i += matched - 1;
			}
			else
			{
				ret.append(c[i]).append(' ');
			}
		}

		System.out.println(ret.toString());
	}
	
	/**
	 * Loads the dictionary from {@code c:/a.txt}, one word per line, and
	 * registers every line as a key of {@code map} (with a {@code null} value).
	 * <p>
	 * The file is decoded with the platform default charset. Lines may be
	 * terminated by {@code \r\n}, {@code \n} or {@code \r}.
	 *
	 * @throws IOException if the file cannot be opened or read
	 */
	private void initWordMap() throws IOException
	{
		File file = new File("c:/a.txt");
		FileInputStream in = new FileInputStream(file);
		String str;
		try
		{
			// Size the buffer from the file length rather than available(),
			// which is only an estimate of what can be read without blocking.
			byte[] b = new byte[(int) file.length()];
			int filled = 0;
			// A single read() may return fewer bytes than requested, so keep
			// reading until the buffer is full or the stream ends.
			while(filled < b.length)
			{
				int n = in.read(b, filled, b.length - filled);
				if(n < 0)
				{
					break;
				}
				filled += n;
			}
			str = new String(b, 0, filled);
		}
		finally
		{
			// Always release the file handle, even if reading fails.
			in.close();
		}

		// Accept Windows, Unix and old Mac line endings; "\r\n" is listed first
		// so that it is consumed as one separator.
		String[] word = str.split("\r\n|\r|\n");
		for(int i = 0; i < word.length; i++)
		{
			map.put(word[i], null);
		}
	}

你可能感兴趣的:(C++,c,C#,J#,Go)