中文字典树(简单而不稳定版)

做一个简单的中文字典树。它比英文字典树稍复杂一些:英文无非 26 个树杈,而且排序方便,用数组表示树杈即可。

而中文相对复杂一些:常用汉字用一个 char(2 个字节)就能表示,但每个节点的树杈个数不定,用数组存储并对树杈排序也相对复杂,因此考虑用二叉搜索树代替数组来做子节点的存储结构。

每个字典树节点链接一棵二叉搜索树,二叉搜索树的每个节点上又挂载一个字典树节点。

部分代码如下:

package com.cyj.analysis;

/**
 * Trie (prefix tree) node for Chinese text.
 *
 * <p>Each node stores one character, a word-end flag, and its children. The
 * children are kept in a {@link BSTree} keyed by the child's character instead
 * of a fixed-size array, because the number of distinct child characters is
 * not bounded the way it is for a 26-letter alphabet.
 * 
 * @author Administrator
 * 
 */
public class Trie {
	// nodeState value: no inserted word ends at this node.
	private static final int STATE_NOT_WORD_END = 0;
	// nodeState value: at least one inserted word ends at this node.
	private static final int STATE_WORD_END = 1;

	// Child nodes; stays null until the first child is inserted.
	private BSTree children;
	// Character stored in this node.
	private Character nodeValue;
	// Word-end flag: 0 = not the end of a word, 1 = end of a word.
	private int nodeState = STATE_NOT_WORD_END;

	/**
	 * Creates a node holding the given character.
	 *
	 * @param value character stored in the node; must not be null
	 * @throws IllegalArgumentException if {@code value} is null
	 */
	public Trie(Character value) {
		if (value == null) {
			throw new IllegalArgumentException("参数异常不能为空");
		}
		this.nodeValue = value;
	}

	/**
	 * Matches {@code length} characters of {@code charArray}, starting at
	 * {@code begin}, against the words stored below this node.
	 *
	 * <p>The hit is updated as follows: a hit created here gets
	 * {@code setBegin(begin)}; a hit passed in by the caller is reset with
	 * {@code setUnmatch()}. {@code setEnd(begin)} is called at every level.
	 * When the last character is found, {@code setMatch()} is called if that
	 * node is flagged as a word end, and {@code setPrefix()} plus
	 * {@code setMatchedDictSegment(node)} are called if that node has children.
	 * 
	 * @param charArray characters to match; must not be null
	 * @param begin     index of the first character to match
	 * @param length    number of characters to match
	 * @param searchHit hit object to reuse, or null to create a new one
	 * @return the hit describing the result; left in its unmatched state when
	 *         the segment is empty, out of range, or not found
	 */
	public Hit match(char[] charArray, int begin, int length, Hit searchHit) {
		if (searchHit == null) {
			searchHit = new Hit();
			searchHit.setBegin(begin);
		} else {
			searchHit.setUnmatch();
		}
		searchHit.setEnd(begin);

		// Nothing to look up: report "no match" instead of indexing out of bounds.
		if (length < 1 || begin < 0 || begin >= charArray.length) {
			return searchHit;
		}

		Trie ds = findChild(charArray[begin]);
		if (ds == null) {
			return searchHit;
		}
		if (length > 1) {
			// More characters remain: continue matching from the child node.
			return ds.match(charArray, begin + 1, length - 1, searchHit);
		}

		// Last character of the segment.
		if (ds.nodeState == STATE_WORD_END) {
			searchHit.setMatch();
		}
		if (ds.children != null) {
			searchHit.setPrefix();
			searchHit.setMatchedDictSegment(ds);
		}
		return searchHit;
	}

	/**
	 * Inserts {@code length} characters of {@code charArray}, starting at
	 * {@code begin}, as a word below this node. The node reached by the last
	 * character is flagged as a word end, whether it was created by this call
	 * or already existed because a longer word sharing the prefix was inserted
	 * earlier. An empty or out-of-range segment is ignored.
	 *
	 * <p>The method is synchronized on the node it is invoked on.
	 *
	 * @param charArray characters of the word; must not be null
	 * @param begin     index of the first character to insert
	 * @param length    number of characters to insert
	 */
	public synchronized void fillSegment(char[] charArray, int begin, int length) {
		if (length < 1 || begin < 0 || begin >= charArray.length) {
			return;
		}

		Trie child = findChild(charArray[begin]);
		if (child == null) {
			child = new Trie(Character.valueOf(charArray[begin]));
			if (children == null) {
				children = new BSTree(child);
			} else {
				children.addNode(children, child);
			}
		}

		if (length == 1) {
			// Flag the node actually stored in the tree, not a throw-away key,
			// so that e.g. "阿" inserted after "阿猫" is still recognised.
			child.nodeState = STATE_WORD_END;
		} else {
			child.fillSegment(charArray, begin + 1, length - 1);
		}
	}

	/**
	 * Looks up the direct child holding the given character.
	 *
	 * @param c character to look for
	 * @return the child node, or null if there is none
	 */
	private Trie findChild(char c) {
		if (children == null) {
			return null;
		}
		// BSTree is keyed by Trie nodes, so wrap the character in a probe node.
		BSTree found = children.findNode(children, new Trie(Character.valueOf(c)));
		return found == null ? null : found.getValue();
	}

	/**
	 * @return the tree of child nodes, or null if this node has no children
	 */
	public BSTree getChildren() {
		return children;
	}

	/**
	 * @return the character stored in this node
	 */
	public Character getNodeValue() {
		return nodeValue;
	}

	/**
	 * @return 1 if an inserted word ends at this node, otherwise 0
	 */
	public int getNodeState() {
		return nodeState;
	}

}


package com.cyj.analysis;

/**
 * Unbalanced binary search tree node whose payload is a {@link Trie} node.
 * Nodes are ordered by the character returned from
 * {@link Trie#getNodeValue()}.
 *
 * <p>Insertion and lookup are iterative: the tree is never rebalanced, so
 * inserting characters in sorted order produces a list-shaped tree, and a
 * recursive walk over it could exhaust the call stack.
 */
public class BSTree {
	private BSTree left;
	private BSTree right;
	// Number of times this node's character has been added via addNode
	// (starts at 1 for the creating insert).
	private int freqs;
	private Trie value;
	
	/**
	 * Creates a leaf node holding {@code value} with a frequency of 1.
	 *
	 * @param value trie node stored in this tree node
	 */
	public BSTree(Trie value){
		this.value = value;
		this.left = null;
		this.right = null;
		this.freqs = 1;
	}
	
	/**
	 * Adds {@code value} to the subtree rooted at {@code src}. If a node with
	 * the same character already exists, only its frequency counter is
	 * incremented and {@code value} itself is not stored.
	 *
	 * @param src   root of the subtree to insert into; may be null
	 * @param value trie node to insert
	 * @return {@code src}, or a new single-node tree when {@code src} is null
	 */
	public BSTree addNode(BSTree src,Trie value){
		if(src == null){
			return new BSTree(value);
		}
		char key = value.getNodeValue().charValue();
		BSTree current = src;
		while(true){
			char currentKey = current.value.getNodeValue().charValue();
			if(key < currentKey){
				if(current.left == null){
					current.left = new BSTree(value);
					break;
				}
				current = current.left;
			}else if(key > currentKey){
				if(current.right == null){
					current.right = new BSTree(value);
					break;
				}
				current = current.right;
			}else{
				current.freqs += 1;
				break;
			}
		}
		balance();
		return src;
	}
	
	/**
	 * Finds the node in the subtree rooted at {@code src} whose character
	 * equals that of {@code value}.
	 *
	 * @param src   root of the subtree to search; may be null
	 * @param value trie node carrying the character to look for
	 * @return the matching tree node, or null if there is none
	 */
	public BSTree findNode(BSTree src,Trie value){
		if(src == null){
			return null;
		}
		char key = value.getNodeValue().charValue();
		BSTree current = src;
		while(current != null){
			char currentKey = current.value.getNodeValue().charValue();
			if(key < currentKey){
				current = current.left;
			}else if(key > currentKey){
				current = current.right;
			}else{
				return current;
			}
		}
		return null;
	}

	/**
	 * @return the left child, or null
	 */
	public BSTree getLeft() {
		return left;
	}

	/**
	 * @return the right child, or null
	 */
	public BSTree getRight() {
		return right;
	}

	/**
	 * @return how many times this node's character has been added
	 */
	public int getFreqs() {
		return freqs;
	}

	/**
	 * @return the trie node stored in this tree node
	 */
	public Trie getValue() {
		return value;
	}
	
	/**
	 * Rebalancing hook; intentionally a no-op for now.
	 */
	private void balance(){
		
	}
}
测试类

package com.cyj.analysis.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;

import com.cyj.analysis.Hit;
import com.cyj.analysis.Trie;


public class Main {
	// Dictionary file used when no path is given on the command line.
	private static final String DEFAULT_DICT_PATH = "C:\\Users\\Administrator\\Desktop\\sougou.txt";

	/**
	 * Dictionary smoke test: loads the first whitespace-separated field of
	 * every line of the dictionary file into a trie, then prints whether the
	 * word "阿猫" is matched.
	 *
	 * @param args optional; {@code args[0]} overrides the default dictionary path
	 * @throws IOException if the dictionary file cannot be read
	 */
	public static void main(String[] args) throws IOException {
		String dictPath = args.length > 0 ? args[0] : DEFAULT_DICT_PATH;
		Trie trie = new Trie('0');

		// Decode the file as UTF-8 directly rather than reading it with the
		// platform charset and re-encoding each line afterwards, which loses
		// characters whenever the round trip is not reversible.
		// NOTE(review): assumes the dictionary file is UTF-8 encoded - confirm.
		BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(new File(dictPath)), "UTF-8"));
		try {
			String content;
			while ((content = br.readLine()) != null) {
				String[] fields = content.split("[\t ]+");
				// Blank lines and lines starting with whitespace carry no word.
				if (fields.length == 0 || fields[0].length() == 0) {
					continue;
				}
				String words = fields[0];
				trie.fillSegment(words.toCharArray(), 0, words.length());
			}
		} finally {
			// Release the file handle even if reading or inserting fails.
			br.close();
		}

		Hit searchHit = new Hit();
		searchHit = trie.match("阿猫".toCharArray(), 0, 2, searchHit);
		System.out.println(searchHit.isMatch());
	}
}


你可能感兴趣的:(闲谈)