自己理解三叉树TernarySearchTrie

花了差不多一天半的时间终于把一颗三叉树看完了,不过对于里面还有点疑惑,下面在代码里注释上了自己的理解,里面还存在一些疑问,欢迎理解的朋友们指出其中的错误,以及解答里面的疑问。

package org.apache.spell;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Stack;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
/**
 *  三叉搜索树实现:字符串排序数据结构
 *  Implementation of a Ternary Search Trie, a data structure for storing <code>String</code> objects
 *  that combines the compact size of a binary search tree with the speed of a digital search trie, and is 
 *  therefore ideal for practical use in sorting and searching data.</p> <p>
 * 
 *  This data structure is faster than hashing for many typical search problems, and supports
 *  a broader range of useful problems and operations. Ternary searches are faster than
 *  hashing and more powerful, too.</p> <p>
 * 
 *  The theory of ternary search trees was described at a symposium in 1997 (see "Fast 
 *  Algorithms for Sorting and Searching Strings," by J.L. Bentley and R. Sedgewick,
 *  Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete Algorithms, January 1997).
 *  Algorithms in C, Third Edition, by Robert Sedgewick (Addison-Wesley, 1998) provides 
 *  yet another view of ternary search trees. 
 * 
 * @author Bruno Martins
 *
 */

public class TernarySearchTrie {
  public static void main(String[] args) throws Exception {
  String triefile = "E://Java Projects//ses//src//test//lucene//dic//spell//trie.txt";
  TernarySearchTrie dictionary = new TernarySearchTrie( new File(triefile));
    //benchMark(args[0]);
   //benchMark(args[1]);
 }
  ////////////////////////////////////////////////////////////////
  // 三叉树的创建,节点创建,删除等
  ////////////////////////////////////////////////////////////////
  /**
  *  An inner class of Ternary Search Trie that represents a node in the trie.
  *  三叉树内部类代表树中的一个节点
  */

  public static final class TSTNode {
   /** Index values for accessing relatives array. */
   protected final static int PARENT = 0, LOKID = 1, EQKID = 2, HIKID = 3; //父,左,中,右(相对节点数组访问下标)
   /** The key to the node. */
   protected Object data;   //节点的值
   /** The relative nodes. */
   protected TSTNode[] relatives = new TSTNode[ 4];   //存放父,左,中,右4个相对节点
   /** The char used in the split. */
   protected char splitchar; //字符
   /**
   *  Constructor method.
   *
   *@param  splitchar  The char used in the split.  字符
   *@param  parent     The parent node. 父节点
   */

   protected TSTNode( char splitchar, TSTNode parent) {
    this.splitchar = splitchar;
   relatives[PARENT] = parent;
  }
   public String toString()
  {
    return String.valueOf(splitchar) + ":" +data;
  }
 }
 
  /**
  * 节点元素,仅仅保存当前节点值,即文件中存的一行数据:word:interger
  * key:word
  * data:interger
  * @author shentingting
  *
  */

  protected static class TSTItem {
   /** 节点的值. */
   protected Object data;
   /** 节点对应的key.即目标字符串 */
   protected String key;
   /**
   *  Constructor method.
   *
   *@param  key 当前节点索引的key.
   *@param  data 当前节点的数值.
   */

   protected TSTItem(String key, Object data) {
    this.key = key;
    this.data = data;
  }
 }
 
  /**
  *  Compares characters by alphabetical order.
  *  按字母顺序比较字符
  *@param  cCompare2  The first char in the comparison. 第一个字符
  *@param  cRef      The second char in the comparison. 第二个字符
  *@return           A negative number, 0 or a positive number if the second
  *      char is less, equal or greater.
  *      当第二个字符小于第一个字符 返回 负数
  *      当第二个字符等于第一个字符 返回 0
  *      当第二个字符大于第一个字符 返回 正数
  *      ASCII码对应值:
  *      A-Z 65-90 
  *      a-z 97-122
  * 其中忽略了a-x之间字符的大小写敏感度,经过下面的处理后其比较字符表从Ascii码转换成
  * A   a  B  b C c .... X    x  Y  Z    y  z
  * 65 66 67 68.........111 112 113 114 121 122
  * 至于为何YZyz四个字符没有作相同处理暂时还不知其原由
  */

  private static int compareCharsAlphabetically( int cCompare2, int cRef) {
   int cCompare = 0;
   if (cCompare2 > = 65) { //从A开始
    if (cCompare2 < 89) { //A-Y之间的字符(不包含Y)
    cCompare = ( 2 * cCompare2) - 65;
   } else if (cCompare2 < 97) { //在Y-a之间的字符(不包含a)
    cCompare = cCompare2 + 24;
   } else if (cCompare2 < 121) { //在a-y之间的字符(不包含y)
    cCompare = ( 2 * cCompare2) - 128;
   } else
    cCompare = cCompare2;
  } else //A之前的字符(不包含A)
   cCompare = cCompare2;
   if (cRef < 65) {
    return cCompare - cRef;  
  }
   if (cRef < 89) {
    return cCompare - (( 2 * cRef) - 65); 
  }
   if (cRef < 97) {
    return cCompare - (cRef + 24);
  }
   if (cRef < 121) {
    return cCompare - (( 2 * cRef) - 128);
  }
   return cCompare - cRef;
 }
  /**
   * The default maximum number of values returned by the <code>matchAlmost</code> and
   * <code>matchPrefix</code> methods; -1 means the result size is not limited.
   */
  private int defaultNumReturnValues = - 1;
  /**
   * The number of differences allowed in a call to the <code>matchAlmost</code> method;
   * set through <code>setMatchAlmostDiff</code> and 0 until then.
   */
  private int matchAlmostDiff;
  /** The base node in the trie; <code>null</code> while the trie is empty. */
  private TSTNode rootNode;
  /**
   * Constructs an empty Ternary Search Trie. The root node stays <code>null</code>
   * until the first key is inserted.
   */
  public TernarySearchTrie() {
 }
  /**
   * Constructs a Ternary Search Trie and loads data from an uncompressed text
   * <code>File</code> into the Trie. Each line of the file is of the form
   * <code>word : integer</code>.
   * <p>
   * Delegates to <code>TernarySearchTrie(File, boolean)</code> with compression
   * switched off.
   *
   * @param file The <code>File</code> with the data to load into the Trie.
   * @exception IOException A problem occurred while reading the data.
   */
  public TernarySearchTrie(File file) throws IOException {
   this(file,false);
 }
 
  /**
  * 从文件中载入数据到字典树
  * 一个普通文本文档每行的格式:word : integer
  *  Constructs a Ternary Search Trie and loads data from a <code>File</code> into the Trie. 
  *  The file is a normal text document, where each line is of the form " word : integer".
  *
  *@param  file              The <code>File</code> with the data to load into the Trie.
     *@param compression If true, the file is compressed with the GZIP algorithm, and if false, 
     *                                  the file is a normal text document.
     *                   true:文件根据GZIP算法压缩
     *                   false:普通的文本文档
  *@exception  IOException  A problem occured while reading the data.
  */

  public TernarySearchTrie(File file, boolean compression) throws IOException {
   this();
  BufferedReader in;
   //如果是压缩文件则通过建立解压缩输出流
   if(compression) in = new BufferedReader( new InputStreamReader( new GZIPInputStream( new FileInputStream(file))));
   else in = new BufferedReader( new InputStreamReader(( new FileInputStream(file))));
  String word;
   int pos;
   int occur;
   int numWords = 0;
   while ((word = in.readLine()) != null) { //abc:4
   numWords ++;
   pos = word.indexOf( ":");
   occur = 1;
    if (pos != - 1) {
    occur =
     ( new Integer(word.substring(pos + 1).trim())).intValue();
    word = word.substring( 0, pos);
   }
   String key = StringUtils.toLowerCase(word, false); //abc
    if (rootNode == null) {
    rootNode = new TSTNode(key.charAt( 0), null);
   }
   
    //从根节点比较获取key对应的节点
   TSTNode node = null;
    if (key.length() > 0 && rootNode != null) {
    TSTNode currentNode = rootNode;
     int charIndex = 0;
     while (true) {
      if (currentNode == null) //未找到对应的节点则跳出
       break;
      int charComp =
      compareCharsAlphabetically(
       key.charAt(charIndex),
       currentNode.splitchar);
      if (charComp == 0) {
      charIndex ++;
       if (charIndex == key.length()) { //找到key对应的节点
       node = currentNode;
        break;
      }
      currentNode = currentNode.relatives[TSTNode.EQKID];
     } else if (charComp < 0) { //左节点 
      currentNode = currentNode.relatives[TSTNode.LOKID];
     } else { //右节点
      currentNode = currentNode.relatives[TSTNode.HIKID];
     }
    }
    Integer occur2 = null;
     if (node != null)
     occur2 = ((Integer) (node.data));
     if (occur2 != null) { //如果存在多行key相同的数据,则把后面的数字累加
     occur += occur2.intValue();
    }
     //根据key获取一个TSTNode节点,当key对应的节点不存在则创建一个返回
    currentNode =
     getOrCreateNode(
      StringUtils.toLowerCase(word.trim(), false));
     //abc:4 则节点中data存的是4;当存在多行abc:*,则节点中data存的是"abc:"后面的数字之和
    currentNode.data = new Integer(occur); 
   }
  }
  in.close();
 }
  /**
  *  Deletes the node passed in as an argument. If this node
  *  has non-null data, then both the node and the data will be deleted. It also
  *  deletes any other nodes in the trie that are no longer needed after the
  *  deletion of the node.
  *
  *@param  nodeToDelete  The node to delete.
  */

  private void deleteNode(TSTNode nodeToDelete) {
   if (nodeToDelete == null) {
    return;
  }
  nodeToDelete.data = null; //设置当前节点的值为null
   while (nodeToDelete != null) {
   nodeToDelete = deleteNodeRecursion(nodeToDelete); //递归删除节点
    //deleteNodeRecursion(nodeToDelete);
  }
 }
  /**
  *  Recursivelly visits each node to be deleted.
  *  递归删除节点:
  *     还有一个疑问是当节点左右两边子树都不为null的情况下
  *  To delete a node, first set its data to null, then pass it into this method,
  *  then pass the node returned by this method into this method (make
  *  sure you don't delete the data of any of the nodes returned from this
  *  method!) and continue in this fashion until the node returned by this
  *  method is <code>null</code>.
  * 
  *  The TSTNode instance returned by this method will be next node to
  *  be operated on by <code>deleteNodeRecursion</code> (This emulates recursive 
  *  method call while avoiding the JVM overhead normally associated
  *  with a recursive method.)
  *
  *@param  currentNode  The node to delete.
  *@return   The next node to be called in deleteNodeRecursion.
  */

  private TSTNode deleteNodeRecursion(TSTNode currentNode) {
   if (currentNode == null) { return null; }
   // can't delete this node if it has a non-null eq kid or data
   //当前节点下面存在相等的非空节点,或当前节点存在数值不为null则说明存在其他key值对应该节点则不允许删除该节点
   if (currentNode.relatives[TSTNode.EQKID] != null || currentNode.data != null) {
    return null;
  }
  
  TSTNode currentParent = currentNode.relatives[TSTNode.PARENT]; //获取当前节点的父节点
   boolean lokidNull = currentNode.relatives[TSTNode.LOKID] == null; //判断当前节点左边是否为空
   boolean hikidNull = currentNode.relatives[TSTNode.HIKID] == null; //判断当前节点右边是否为空
   int childType;
   if (currentParent.relatives[TSTNode.LOKID] == currentNode) {
   childType = TSTNode.LOKID;
  } else if (currentParent.relatives[TSTNode.EQKID] == currentNode) {
   childType = TSTNode.EQKID;
  } else if (currentParent.relatives[TSTNode.HIKID] == currentNode) {
   childType = TSTNode.HIKID;
  } else {
   rootNode = null;
    return null;
  }
  
   //下面实现了删除当前节点的
   if (lokidNull && hikidNull) { //当前节点的左边为null,右边为null
   currentParent.relatives[childType] = null; //设置父节点指向的当前节点为null
    return currentParent; //返回父节点
  }
   if (lokidNull) { //当前节点的左边为null,右边不为null
   currentParent.relatives[childType] =
    currentNode.relatives[TSTNode.HIKID];
   currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT] =
    currentParent;
    return currentParent; 
  }
   if (hikidNull) { //当前节点的右边为null,左边不为null,
   currentParent.relatives[childType] =
    currentNode.relatives[TSTNode.LOKID];
   currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT] =
    currentParent;
    return currentParent;
  }
  
   //当前节点两边都不为null
   int deltaHi =
   currentNode.relatives[TSTNode.HIKID].splitchar
     - currentNode.splitchar;   //当前节点右边节点字符与它的字符的差值
   int deltaLo =
   currentNode.splitchar
     - currentNode.relatives[TSTNode.LOKID].splitchar;   //当前节点左边节点字符与它的字符的差值
   int movingKid;
  TSTNode targetNode;
   if (deltaHi == deltaLo) {
    if (Math.random() < 0. 5) {
    deltaHi ++;
   } else {
    deltaLo ++;
   }
  }
   if (deltaHi > deltaLo) {
   movingKid = TSTNode.HIKID;
   targetNode = currentNode.relatives[TSTNode.LOKID];
  } else {
   movingKid = TSTNode.LOKID;
   targetNode = currentNode.relatives[TSTNode.HIKID]; //
  }
   while (targetNode.relatives[movingKid] != null) {
   targetNode = targetNode.relatives[movingKid];
  }
  targetNode.relatives[movingKid] = currentNode.relatives[movingKid];
   //下面的指向语句一直让我不明白甚至开始怀疑自己是否真的看懂了三叉树构建的那个方法
   //不明白的是为何指向的是目标节点的最后一个节点,而其他节点全部被去掉了???(刚看了一下代码不知这是否和要求插入树的节点按顺序排列依次添加有关的原因,某一个节点左右分支有其特殊性,所以我的顾虑其实根本不是问题)
  currentParent.relatives[childType] = targetNode; 
  targetNode.relatives[TSTNode.PARENT] = currentParent;
   if ( !lokidNull) {
   currentNode.relatives[TSTNode.LOKID] = null;
  }
   if ( !hikidNull) {
   currentNode.relatives[TSTNode.HIKID] = null;
  }
   return currentParent;
 }
  /**
  *  Retrieve the object indexed by a key.
  *  返回key对应的节点的数值data
  *@param      key  A <code>String</code> index.
  *@return      The object retrieved from the Ternary Search Trie.
  */

  public Object get(String key) {
  TSTNode node = getNode(StringUtils.toLowerCase(key.trim(), false));
   if (node == null) { return null; }
   return node.data;
 }
  /**
  *  Retrieve the <code>Integer</code> indexed by key, increment it by one unit
  *  and store the new <code>Integer</code>.
  *  获取并且节点中的data值自动加1
  *@param  key  A <code>String</code> index.
  *@return   The <code>integer</code> retrieved from the Ternary Search Trie.
  */

  public Integer getAndIncrement(String key) {
  String key2 = StringUtils.toLowerCase(key.trim(), false);
  TSTNode node = getNode(key2);
   if (node == null) {
    return null;
  }
  Integer aux = (Integer) (node.data);
   if (aux == null) {
   aux = new Integer( 1);
  } else {
   aux = new Integer(aux.intValue() + 1);
  }
  put(key2, aux);
   return aux;
 }
  /**
  *  Returns the key that indexes the node argument.
  *  返回索引参数节点的key
  *@param  node  The node whose index is to be calculated.
  *@return  The <code>String</code> that indexes the node argument.
  *            a
  *            |   
  *            a    
  *            |     /   
  *            a(50)  b (12) 
  *             /     |
  *              b(6) a(8)
  *               /     /
  *                f(6)   c(11)
  * 从上面树从左到右叶子节点的key分别为:  
  *   aaa:50
  *   aab:6
  *   aaf:6
  *   ab:12
  *   aba:8
  *   abc:11  */

  protected String getKey(TSTNode node) {
  StringBuffer getKeyBuffer = new StringBuffer();
  getKeyBuffer.setLength( 0);
  getKeyBuffer.append( "" + node.splitchar);
  TSTNode currentNode;
  TSTNode lastNode;
  currentNode = node.relatives[TSTNode.PARENT]; //当前节点指向其父节点
  lastNode = node; //最后一个节点指向当前节点
   while (currentNode != null) {
    if (currentNode.relatives[TSTNode.EQKID] == lastNode) { //当前节点==最后指针指向的节点
    getKeyBuffer.append( "" + currentNode.splitchar);
   }
   lastNode = currentNode;
   currentNode = currentNode.relatives[TSTNode.PARENT];
  }
  getKeyBuffer.reverse(); //反转顺序
   return getKeyBuffer.toString();
 }
  /**
   * Returns the node indexed by key, or <code>null</code> if that node doesn't exist.
   * The search begins at the root node; it delegates to
   * <code>getNode(String, TSTNode)</code>.
   *
   * @param key A <code>String</code> that indexes the node that is returned.
   * @return The node object indexed by key. This object is an instance of an inner
   *         class named <code>TernarySearchTrie.TSTNode</code>.
   */
  public TSTNode getNode(String key) {
   return getNode(key, rootNode);
 }
  /**
  *  Returns the node indexed by key, or <code>null</code> if that node doesn't exist.
  *  The search begins at root node.
  *  从某节点开始获取key2对应的节点
  *@param  key2        A <code>String</code> that indexes the node that is returned.
  *@param  startNode  The top node defining the subtrie to be searched.
  *@return            The node object indexed by key. This object is
  *      an instance of an inner class named <code>TernarySearchTrie.TSTNode</code>.
  */

  protected TSTNode getNode(String key2, TSTNode startNode) {
  String key = StringUtils.toLowerCase(key2.trim(), false);
   if (key == null || startNode == null || key.length() == 0) {
    return null;
  }
  TSTNode currentNode = startNode;
   int charIndex = 0;
   while (true) {
    if (currentNode == null) {
     return null;
   }
    int charComp =
    compareCharsAlphabetically(
     key.charAt(charIndex),
     currentNode.splitchar);
    if (charComp == 0) {
    charIndex ++;
     if (charIndex == key.length()) {
      return currentNode;
    }
    currentNode = currentNode.relatives[TSTNode.EQKID];
   } else if (charComp < 0) {
    currentNode = currentNode.relatives[TSTNode.LOKID];
   } else {
    currentNode = currentNode.relatives[TSTNode.HIKID];
   }
  }
 }
  /**
  *  根据key获取一个TSTNode节点
  *      当key对应的节点不存在时则创建
  *  Returns the node indexed by key, creating that node if it doesn't exist,
  *  and creating any required intermediate nodes if they don't exist.
  *
  *@param  key                           A <code>String</code> that indexes the node that is returned.
  *@return                                  The node object indexed by key. This object is an
  *                                               instance of an inner class named <code>TernarySearchTrie.TSTNode</code>.
  *@exception  NullPointerException      If the key is <code>null</code>.
  *@exception  IllegalArgumentException  If the key is an empty <code>String</code>.
  */

  protected TSTNode getOrCreateNode(String key)
   throws NullPointerException, IllegalArgumentException {
   if (key == null) {
    throw new NullPointerException( "attempt to get or create node with null key");
  }
   if (key.length() == 0) {
    throw new IllegalArgumentException( "attempt to get or create node with key of zero length");
  }
   if (rootNode == null) {
   rootNode = new TSTNode(key.charAt( 0), null);
  }
  TSTNode currentNode = rootNode;
   int charIndex = 0;
   while (true) {
    int charComp =
    compareCharsAlphabetically(
     key.charAt(charIndex),
     currentNode.splitchar);
    if (charComp == 0) {
    charIndex ++;
     if (charIndex == key.length()) { //找到key对应的节点
      return currentNode;
    }
     if (currentNode.relatives[TSTNode.EQKID] == null) { //key对应的节点不存在则创建
     currentNode.relatives[TSTNode.EQKID] =
       new TSTNode(key.charAt(charIndex), currentNode);
    }
    currentNode = currentNode.relatives[TSTNode.EQKID]; //当前节点指向新创建的key对应的节点
   } else if (charComp < 0) {
     if (currentNode.relatives[TSTNode.LOKID] == null) {
     currentNode.relatives[TSTNode.LOKID] =
       new TSTNode(key.charAt(charIndex), currentNode);
    }
    currentNode = currentNode.relatives[TSTNode.LOKID];
   } else {
     if (currentNode.relatives[TSTNode.HIKID] == null) {
     currentNode.relatives[TSTNode.HIKID] =
       new TSTNode(key.charAt(charIndex), currentNode);
    }
    currentNode = currentNode.relatives[TSTNode.HIKID];
   }
  }
 }
 
  /**
   * Removes the value indexed by key. Also removes all nodes that are rendered
   * unnecessary by the removal of this data. The key is trimmed and normalized
   * with <code>StringUtils.toLowerCase</code> before the lookup; nothing happens
   * when no node exists for the key.
   *
   * @param key A <code>String</code> that indexes the object to be removed from the Trie.
   */
  public void remove(String key) {
  deleteNode(getNode(StringUtils.toLowerCase(key.trim(), false)));
 }

  /**
   * Stores a value in the trie. The value may be retrieved using the key.
   * The key is trimmed and normalized with <code>StringUtils.toLowerCase</code>;
   * the node for the key is created when it does not exist yet, and any value
   * already stored there is replaced.
   *
   * @param key   A <code>String</code> that indexes the object to be stored.
   * @param value The object to be stored in the Trie.
   */
  public void put(String key, Object value) {
  getOrCreateNode(StringUtils.toLowerCase(key.trim(), false)).data =
   value;
 }
   //////////////////////////////////////////////////////////
   // 计算树的节点数
   //////////////////////////////////////////////////////////
 
  /**
   * Returns the number of nodes in the trie that have non-null data.
   *
   * @return The number of nodes in the trie that have non-null data.
   */
  public int numDataNodes() {
   return numDataNodes(rootNode);
 }
  /**
   * Returns the number of nodes in the subtrie below and including the
   * starting node. The method counts only nodes that have non-null data.
   *
   * @param startingNode The top node of the subtrie, i.e. the node that defines the subtrie.
   * @return The number of nodes with non-null data in the subtrie.
   */
  protected int numDataNodes(TSTNode startingNode) {
   return recursiveNodeCalculator(startingNode, true, 0);
 }
  /**
   * Returns the total number of nodes in the trie. The method counts nodes whether
   * or not they have data.
   *
   * @return The total number of nodes in the trie.
   */
  public int numNodes() {
   return numNodes(rootNode);
 }
  /**
   * Returns the total number of nodes in the subtrie below and including the
   * starting Node. The method counts nodes whether or not they have data.
   *
   * @param startingNode The top node of the subtrie, i.e. the node that defines the subtrie.
   * @return The total number of nodes in the subtrie.
   */
  protected int numNodes(TSTNode startingNode) {
   return recursiveNodeCalculator(startingNode, false, 0);
 }
  /**
  *  Recursivelly visists each node to calculate the number of nodes.
  *  计算节点数
  *@param  currentNode  The current node. 当前开始节点
  *@param  checkData    If true we check the data to be different of <code>null</code>. true:排除data值为null的节点
  *@param  numNodes2    The number of nodes so far.  到目前为止的计数,主要用于内部递归传参
  *@return              The number of nodes accounted.
  */

  private int recursiveNodeCalculator(
  TSTNode currentNode,
   boolean checkData,
   int numNodes2) {
   if (currentNode == null) {
    return numNodes2;
  }
   //左边树节点数
   int numNodes =
   recursiveNodeCalculator(
    currentNode.relatives[TSTNode.LOKID],
    checkData,
    numNodes2);
   //中间树节点数
  numNodes =
   recursiveNodeCalculator(
    currentNode.relatives[TSTNode.EQKID],
    checkData,
    numNodes);
   //右边树节点数
  numNodes =
   recursiveNodeCalculator(
    currentNode.relatives[TSTNode.HIKID],
    checkData,
    numNodes);
   //下面才是真正的计算代码
   if (checkData) {
    if (currentNode.data != null) {
    numNodes ++;
   }
  } else {
   numNodes ++;
  }
   return numNodes;
 }
  ///////////////////////////////////////////////////////
  // 获取相匹配节点列表的方法,(差距匹配、前缀匹配)
  ///////////////////////////////////////////////////////
  /**
   * Returns a <code>List</code> of keys that almost match the argument key. Keys returned
   * will have exactly diff characters that do not match the target key,
   * where diff is equal to the last value passed in as an argument to the
   * <code>setMatchAlmostDiff</code> method.
   * <p>
   * If the <code>matchAlmost</code> method is called before the <code>setMatchAlmostDiff</code>
   * method has been called for the first time, then diff = 0.
   * <p>
   * The number of results is limited by the default set through
   * <code>setNumReturnValues</code>.
   *
   * @param key The target key.
   * @return A <code>List</code> with the results.
   */
  public List matchAlmost(String key) {
   return matchAlmost(key, defaultNumReturnValues);
 }
  /**
  *  Returns a <code>List</code> of keys that almost match the argument key. Keys returned
  *  will have exactly diff characters that do not match the target key,
  *  where diff is equal to the last value passed in as an argument to the
  *  <code>setMatchAlmostDiff</code> method.
  * <p>
  *  If the <code>matchAlmost</code> method is called before the <code>setMatchAlmostDiff</code> method has
  * been called for the first time, then diff = 0.
  *
  *@param  key              The target key.
  *@param  numReturnValues  The maximum number of values returned by this method.
  *@return                  A <code>List</code> with the results
  */

  protected List matchAlmost(String key, int numReturnValues) {
   return matchAlmostRecursion(
   rootNode,
    0,
   matchAlmostDiff,
   key,
   ((numReturnValues < 0) ? - 1 : numReturnValues),
    new Vector(),
   false);
 }

  /**
  *  Recursivelly vists the nodes in order to find the ones that almost match a given key.
  *  递归找出与给定的key相匹配的节点集合
  *@param  currentNode                 The current node. 当前节点 即递归开始节点
  *@param  charIndex                     The current char. 当前字符下标
  *@param  d                                 The number of differences so far. 到目前为止匹配到的不同数
  *@param  matchAlmostNumReturnValues  The maximum number of values in the result <code>List</code>. 匹配的最大结果个数
  *@param  matchAlmostResult2       The results so far. 目前为止匹配的结果列表
  *@param  upTo                             If true all keys having up to and including matchAlmostDiff
  *                                                   mismatched letters will be included in the result (including
  *                                                   a key that is exactly the same as the target string) otherwise
  *                                                   keys will be included in the result only if they have exactly
  *                                                   matchAlmostDiff number of mismatched letters.
  *                                                   true:包含与key完全相同的目标串,否则仅仅返回差距不同个数为matchAlmostDiff的匹配字符串
  *@param  matchAlmostKey           The key being searched. 被搜索的key
  *@return                                      A <code>List</code> with the results.
  */

  private List matchAlmostRecursion(
  TSTNode currentNode,
   int charIndex,
   int d,
  String matchAlmostKey,
   int matchAlmostNumReturnValues,
  List matchAlmostResult2,
   boolean upTo) {
   if ((currentNode == null) //到了叶子节点
    || (matchAlmostNumReturnValues != - 1
     && matchAlmostResult2.size() > = matchAlmostNumReturnValues) //matchAlmostNumReturnValues不等于-1并且大于最大返回结果数
    || (d < 0) //超出了给定的差距不同的个数
    || (charIndex > = matchAlmostKey.length())) { //长度超出了key的长度
    return matchAlmostResult2;
  }
   int charComp =
   compareCharsAlphabetically(
    matchAlmostKey.charAt(charIndex),
    currentNode.splitchar); //开始key取出一个字符与当前节点字符比较
  List matchAlmostResult = matchAlmostResult2;
   if ((d > 0) || (charComp < 0)) { //差距个数范围内或者比较字符小于当前节点字符
   matchAlmostResult =
    matchAlmostRecursion(
     currentNode.relatives[TSTNode.LOKID],    //左子树
     charIndex,
     d,
     matchAlmostKey,
     matchAlmostNumReturnValues,
     matchAlmostResult,
     upTo);
  }
   int nextD = (charComp == 0) ? d : d - 1;   //如果字符相同则d不变,否则d减1
   boolean cond = (upTo) ? (nextD > = 0) : (nextD == 0); //
   if ((matchAlmostKey.length() == charIndex + 1)
    && cond
    && (currentNode.data != null)) { //找到匹配的key
   matchAlmostResult.add(getKey(currentNode)); //获取当前节点的key放入集合中
  }
  matchAlmostResult =
   matchAlmostRecursion(
    currentNode.relatives[TSTNode.EQKID],   //中子树递归匹配
    charIndex + 1,
    nextD,
    matchAlmostKey,
    matchAlmostNumReturnValues,
    matchAlmostResult,
    upTo);
   if ((d > 0) || (charComp > 0)) { //右子树递归匹配
   matchAlmostResult =
    matchAlmostRecursion(
     currentNode.relatives[TSTNode.HIKID],
     charIndex,
     d,
     matchAlmostKey,
     matchAlmostNumReturnValues,
     matchAlmostResult,
     upTo);
  }
   return matchAlmostResult;
 }
  /**
   * Returns an alphabetical <code>List</code> of the entries in the trie whose key begins
   * with a given prefix. Only nodes having non-null data are included in the <code>List</code>.
   * <p>
   * The number of results is limited by the default set through
   * <code>setNumReturnValues</code>; the work is done by
   * <code>matchPrefix(String, int)</code>.
   *
   * @param prefix Each key returned from this method will begin with the characters in prefix.
   * @return A <code>List</code> with the results.
   */
  public List matchPrefix(String prefix) {
   return matchPrefix(prefix, defaultNumReturnValues);
 }
  /**
  *  Returns an alphabetical <code>List</code> of all keys in the trie that begin with a
  *  given prefix. Only keys for nodes having non-null data are included in the <code>List</code>.
  *  返回前缀匹配的所有data为非null节点的key
  *@param  prefix           Each key returned from this method will begin with the characters in prefix.
  *@param  numReturnValues  The maximum number of values returned from this method.
  *@return                  A <code>List</code> with the results
  */

  public List matchPrefix(String prefix, int numReturnValues) {
  Vector sortKeysResult = new Vector();
  TSTNode startNode = getNode(prefix);
   if (startNode == null) {
    return sortKeysResult;
  }
   if (startNode.data != null) {
   sortKeysResult.addElement(getKey(startNode));
  }
   return sortKeysRecursion(
   startNode.relatives[TSTNode.EQKID],  
   ((numReturnValues < 0) ? - 1 : numReturnValues),
   sortKeysResult);
 }
  /**
  *  Sets the number of characters by which words can differ from target word
  *  when calling the <code>matchAlmost</code> method.
  * <p>
  *  Arguments less than 0 will set the char difference to 0, and arguments greater 
  *  than 3 will set the char difference to 3.
  *  匹配差距值0-3之间
  *@param  diff  The number of characters by which words can differ from target word.
  */

  public void setMatchAlmostDiff( int diff) {
   if (diff < 0) {
   matchAlmostDiff = 0;
  } else if (diff > 3) {
   matchAlmostDiff = 3;
  } else {
   matchAlmostDiff = diff;
  }
 }
 
  /**
  *  Sets the default maximum number of values returned from the <code>matchPrefix</code> and
  *  <code>matchAlmost</code> methods.
  *  <p>
  *  The value should be set this to -1 to get an unlimited number of return
  *  values. note that the methods mentioned above provide overloaded
  *  versions that allow you to specify the maximum number of return
  *  values, in which case this value is temporarily overridden.
  *
  **@param  num  The number of values that will be returned when calling the
  *                       methods above.
  */

  public void setNumReturnValues( int num) {
  defaultNumReturnValues = (num < 0) ? - 1 : num;
 }

  /**
  *  Returns keys sorted in alphabetical order. This includes the start Node and all
  *  nodes connected to the start Node. 
  *  <p>
  *  返回安装字母表排序的keys
  *  The number of keys returned is limited to numReturnValues. To get a list that
  *  isn't limited in size, set numReturnValues to -1.
  *  返回keys的个数不多于numReturnValues,但当numReturnValues=-1时则不受限制
  *@param  startNode        The top node defining the subtrie to be searched. 开始节点
  *@param  numReturnValues  The maximum number of values returned from this method. 返回个数
  *@return                  A <code>List</code> with the results.
  */

  protected List sortKeys(TSTNode startNode, int numReturnValues) {
   return sortKeysRecursion(
   startNode,
   ((numReturnValues < 0) ? - 1 : numReturnValues),
    new Vector());
 }
  /**
  *  Returns keys sorted in alphabetical order. This includes the current Node and all
  *  nodes connected to the current Node.
  *  <p>
  *  Sorted keys will be appended to the end of the resulting <code>List</code>. The result may be
  *  empty when this method is invoked, but may not be <code>null</code>.
  *
  *@param  currentNode              The current node.
  *@param  sortKeysNumReturnValues  The maximum number of values in the result. 返回最大个数
  *@param  sortKeysResult2           The results so far. 到目前为止已排序的结果
  *@return   A <code>List</code> with the results.
  */

  private List sortKeysRecursion(
  TSTNode currentNode,
   int sortKeysNumReturnValues,
  List sortKeysResult2) {
   if (currentNode == null) {
    return sortKeysResult2;
  }
   //下面从左节点开始递归,接着是中节点递归,最后右节点递归,这样就保证了key按字母顺序排序
   //从左节点开始递归
  List sortKeysResult =
   sortKeysRecursion(
    currentNode.relatives[TSTNode.LOKID],
    sortKeysNumReturnValues,
    sortKeysResult2);
  
   //已经达到返回个数,则返回结果
   if (sortKeysNumReturnValues != - 1
    && sortKeysResult.size() > = sortKeysNumReturnValues) {
    return sortKeysResult;
  }
   //节点顺序不为null则把当前节点的key和data实例化一个TSTItem元素插入到列表中
   if (currentNode.data != null) {
   sortKeysResult.add( new TSTItem(getKey(currentNode),currentNode.data));
  }
   //从中节点开始递归
  sortKeysResult =
   sortKeysRecursion(
    currentNode.relatives[TSTNode.EQKID],
    sortKeysNumReturnValues,
    sortKeysResult);
   //从右节点开始递归
   return sortKeysRecursion(
   currentNode.relatives[TSTNode.HIKID],
   sortKeysNumReturnValues,
   sortKeysResult);
 }
  /**
   * Returns an enumeration over the trie. Despite the method name, the elements
   * handed out are the <code>TSTNode</code> objects that carry non-null data, not
   * key strings. Call <code>hasMoreElements()</code> before each call to
   * <code>nextElement()</code>.
   *
   * @return a new <code>Iterator</code> positioned before the first element.
   */
     public Enumeration keys() {
         return new Iterator();
    }
     public class Iterator implements Enumeration {
   private Stack stack;
   private TSTNode currentNode;
  
         public Iterator() {
    this.currentNode = null;
    this.stack = null;
        }
         public Object nextElement() {
          if (currentNode ==null)
           throw new java.util.NoSuchElementException( "out of range");
          return currentNode;
        }
         public boolean hasMoreElements() {
    // we are at the beginning
    if (stack ==null)
   {
    stack = new Stack();
    currentNode =null;
     if (rootNode !=null)
     stack.push(rootNode); //把根节点放入堆栈里
   }
    // we are at the end node, finished
    else if (currentNode ==null)
     return false;
   
    if (stack.size() == 0)
    currentNode =null;
   
    while (stack.size() > 0) //从根节点开始,依次从右节点,中间节点,左节点判断若不为null则加入堆栈中
   {
    currentNode = (TSTNode)stack.pop();    
     if (currentNode.relatives[TSTNode.HIKID] !=null)
     stack.push(currentNode.relatives[TSTNode.HIKID]);
     if (currentNode.relatives[TSTNode.EQKID] !=null)
     stack.push(currentNode.relatives[TSTNode.EQKID]);
     if (currentNode.relatives[TSTNode.LOKID] !=null)
     stack.push(currentNode.relatives[TSTNode.LOKID]);   
    
     //if (currentNode.IsKey)
     if (currentNode.data != null)
      break;
   }
     
    return currentNode != null;
        }
    }
     /////////////////////////////////////////////////////////////////
     // 根据三叉树生成字典写入文件
     /////////////////////////////////////////////////////////////////
     /**
     * Recursively writes the median entry of the given range first, then does
     * the same for the lower half and for the upper half of the range.
     * Presumably this is done so that re-inserting the entries in file order
     * yields a balanced tree (confirm against the loading code). The list of
     * keys is assumed to be sorted in ascending order.
     * @param fp BufferedWriter
     * @param k  key列表
     * @param offset 开始偏移量
     * @param n 总记录数
     * @throws Exception
     */

     protected void outputBalanced(BufferedWriter  fp,List k, int offset, int n)
         throws Exception
 {
         int m;
         if (n < 1) { //总个数小于1
             return;
        }
        m = n >> 1; //右移
        
        TSTItem item = (TSTItem)(k.get(m + offset));
  
  fp.write(item.key + " : " +item.data);
  fp.write( '/n');
  
  outputBalanced(fp,k,offset, m);
  outputBalanced(fp,k, offset + m + 1, n - m - 1);
    }
    
     /**
     * Writes the entries of the tree to a file, emitting them in the
     * median-first order produced by <code>outputBalanced</code>.
     * @param sFilename
     * @throws Exception
     */

     public void save(String sFilename) throws Exception
 {
     BufferedWriter  fp = new BufferedWriter( new FileWriter(sFilename));
  
        List ret = sortKeys(rootNode, - 1); //从根节点开始获取所有data不为null的节点的key,并且所有key是按字母顺序排序
         int n = ret.size();
        
         //out put in balance order
        outputBalanced(fp,ret, 0, n);
        
  fp.close();
 }
     /////////////////////////////////////////////////////////////////
     // 从文件生成三叉树,并读取文件里的内容
     /////////////////////////////////////////////////////////////////
     /**
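     * Loads a dictionary file into a trie, then measures the time needed to
     * look up every key read from that same file.
     * @param dicFile path of the dictionary file; the text before the first ':' of each line is used as the key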
     * @throws Exception
     */

  public static void benchMark(String dicFile) throws Exception{
  String word;
  TernarySearchTrie dic = 
    new TernarySearchTrie( new File(dicFile));
  
  System.out.println( "count nodes num:" +dic.numNodes());
   //dic.balance();
   //System.out.println("count nodes num:"+dic.numNodes());
  
  String sParagraph;
  ArrayList table = new ArrayList();
   BufferedReader fpSource;
   
      try{
         fpSource = new BufferedReader( new FileReader(dicFile));
         
       while( true )
      {
       sParagraph = fpSource.readLine();
        if (sParagraph == null )
         break;
       
       StringTokenizer st = new StringTokenizer(sParagraph, ":" );
       table.add(st.nextToken()); //加入所有key
      }
      fpSource.close();
     }
      catch(Exception e)
  {
      e.printStackTrace();
     }
     System.out.println( "begin test dic:" +dicFile + " table size:" +table.size());
     
   long start = System.currentTimeMillis();
   for ( int i = 0;i <table.size(); ++i)
  {
   word = (String)(table.get(i));
   dic.get(word); //获取key对应的data
    //System.out.println("test:"+word);
    //if (dic.get(word)== false)
    // System.out.println("test:"+word);
  }
   long end = System.currentTimeMillis();
  System.out.println( "time cost:" +(end - start));
  
 }
}
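注意:通过调试代码,我发现自己一直有一个理解上的误区:以为中间子节点(下标 TSTNode.EQKID)存放的是与其父节点相同的字符。这其实是错误的理解。看下面的代码:参与比较的是 key.charAt(charIndex) 与当前节点的 splitchar;比较相等后 charIndex 先自增,新建的 EQKID 节点存放的是 key 的下一个字符。也就是说,被比较的字符与新建节点的字符不是同一个——节点是先创建,到下一轮循环才被比较: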
int charComp =
    compareCharsAlphabetically(
     key.charAt( charIndex),
     currentNode.splitchar);
    if (charComp == 0) {
     charIndex++;
     if (charIndex == key.length()) { //找到key对应的节点
      return currentNode;
    }
     if (currentNode.relatives[TSTNode.EQKID] == null) { //key对应的节点不存在则创建
     currentNode.relatives[TSTNode.EQKID] =
       new TSTNode(key.charAt( charIndex), currentNode);
    }
    currentNode = currentNode.relatives[TSTNode.EQKID]; //当前节点指向新创建的key对应的节点
对于其中的疑问已发帖子在:http://topic.csdn.net/u/20101204/21/31ee4e48-6447-4d94-b55e-d422c24b6fc4.html

你可能感兴趣的:(自己理解三叉树TernarySearchTrie)