花了差不多一天半的时间,终于把这棵三叉搜索树(Ternary Search Trie)的实现看完了,不过对其中一些地方还有疑惑。下面在代码里用注释写上了自己的理解,其中仍存在一些疑问,欢迎了解的朋友指出其中的错误,并解答这些疑问。
package org.apache.spell;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Stack;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
/**
* 三叉搜索树实现:字符串排序数据结构
* Implementation of a Ternary Search Trie, a data structure for storing String
objects
* that combines the compact size of a binary search tree with the speed of a digital search trie, and is
* therefore ideal for practical use in sorting and searching data.
*
* This data structure is faster than hashing for many typical search problems, and supports
* a broader range of useful problems and operations. Ternary searches are faster than
* hashing and more powerful, too.
*
* The theory of ternary search trees was described at a symposium in 1997 (see "Fast
* Algorithms for Sorting and Searching Strings," by J.L. Bentley and R. Sedgewick,
* Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete Algorithms, January 1997).
* Algorithms in C, Third Edition, by Robert Sedgewick (Addison-Wesley, 1998) provides
* yet another view of ternary search trees.
*
* @author Bruno Martins
*
*/
public
class TernarySearchTrie {
/**
 * Command-line entry point: loads a dictionary file into a trie.
 *
 * The dictionary path may be given as the first argument; when no argument
 * is supplied the original hard-coded development path is used, so existing
 * invocations keep working. For timing lookups see {@code benchMark}.
 *
 * @param args optional; {@code args[0]} is the path of the dictionary file.
 * @throws Exception if the dictionary file cannot be read or parsed.
 */
public static void main(String[] args) throws Exception {
    String triefile = "E://Java Projects//ses//src//test//lucene//dic//spell//trie.txt";
    if (args != null && args.length > 0) {
        triefile = args[0];
    }
    // Constructing the trie is the whole job here: it reads and indexes the file.
    new TernarySearchTrie(new File(triefile));
}
////////////////////////////////////////////////////////////////
// 三叉树的创建,节点创建,删除等
////////////////////////////////////////////////////////////////
/**
 * A node of the ternary search trie.
 *
 * Each node holds one character, an optional value, and a four-slot array of
 * related nodes (parent, low kid, equal kid, high kid).
 */
public static final class TSTNode {
    /** Index of the parent node in {@link #relatives}. */
    protected static final int PARENT = 0;
    /** Index of the low (left) kid in {@link #relatives}. */
    protected static final int LOKID = 1;
    /** Index of the equal (middle) kid in {@link #relatives}. */
    protected static final int EQKID = 2;
    /** Index of the high (right) kid in {@link #relatives}. */
    protected static final int HIKID = 3;

    /** The value stored at this node; {@code null} until a value is assigned. */
    protected Object data;

    /** Related nodes, indexed by PARENT, LOKID, EQKID and HIKID. */
    protected TSTNode[] relatives = new TSTNode[4];

    /** The character this node splits on. */
    protected char splitchar;

    /**
     * Creates a node with no kids and no data.
     *
     * @param splitchar the character this node splits on.
     * @param parent    the parent node, or {@code null} for a root.
     */
    protected TSTNode(char splitchar, TSTNode parent) {
        this.splitchar = splitchar;
        this.relatives[PARENT] = parent;
    }

    /** Returns {@code "<splitchar>:<data>"}, mainly useful when debugging. */
    public String toString() {
        return splitchar + ":" + data;
    }
}
/**
 * A key/value pair taken from one data-bearing node of the trie.
 *
 * For tries loaded from a dictionary file this corresponds to one
 * "word : integer" line: {@code key} is the word and {@code data} the integer.
 *
 * @author shentingting
 */
protected
static
class TSTItem {
/** The value stored at the node. */
protected Object data;
/** The full key (target string) that indexes the node. */
protected String key;
/**
 * Constructor method.
 *
 * @param key  the key that indexes the node.
 * @param data the value stored at the node.
 */
protected TSTItem(String key, Object data) {
this.key
= key;
this.data
= data;
}
}
/**
* Compares characters by alphabetical order.
* 按字母顺序比较字符
*@param cCompare2 The first char in the comparison. 第一个字符
*@param cRef The second char in the comparison. 第二个字符
*@return A negative number, 0 or a positive number if the second
* char is less, equal or greater.
* 当第二个字符小于第一个字符 返回 负数
* 当第二个字符等于第一个字符 返回 0
* 当第二个字符大于第一个字符 返回 正数
* ASCII码对应值:
* A-Z 65-90
* a-z 97-122
* 其中忽略了a-x之间字符的大小写敏感度,经过下面的处理后其比较字符表从Ascii码转换成
* A a B b C c .... X x Y Z y z
* 65 66 67 68.........111 112 113 114 121 122
* 至于为何YZyz四个字符没有作相同处理暂时还不知其原由
*/
private
static
int compareCharsAlphabetically(
int cCompare2,
int cRef) {
int cCompare
=
0;
if (cCompare2
>
=
65) {
//从A开始
if (cCompare2
<
89) {
//A-Y之间的字符(不包含Y)
cCompare
= (
2
* cCompare2)
-
65;
}
else
if (cCompare2
<
97) {
//在Y-a之间的字符(不包含a)
cCompare
= cCompare2
+
24;
}
else
if (cCompare2
<
121) {
//在a-y之间的字符(不包含y)
cCompare
= (
2
* cCompare2)
-
128;
}
else
cCompare
= cCompare2;
}
else
//A之前的字符(不包含A)
cCompare
= cCompare2;
if (cRef
<
65) {
return cCompare
- cRef;
}
if (cRef
<
89) {
return cCompare
- ((
2
* cRef)
-
65);
}
if (cRef
<
97) {
return cCompare
- (cRef
+
24);
}
if (cRef
<
121) {
return cCompare
- ((
2
* cRef)
-
128);
}
return cCompare
- cRef;
}
/** The default maximum number of values returned by the matchAlmost and
matchPrefix methods; -1 means no limit. */
private
int defaultNumReturnValues
=
-
1;
// default result limit
/** The number of differences allowed in a call to the matchAlmost
method. */
private
int matchAlmostDiff;
// defaults to 0; set through setMatchAlmostDiff
/** The base node in the trie. */
private TSTNode rootNode;
// root node; null while the trie is empty
/**
 * Constructs an empty Ternary Search Trie. No root node exists until the
 * first key is stored.
 */
public TernarySearchTrie() {
}
/**
 * Constructs a Ternary Search Trie and loads data from an uncompressed
 * text file into the trie. Each line of the file is of the form
 * "word : integer". Delegates to the two-argument constructor with
 * compression set to false.
 *
 * @param file The File with the data to load into the Trie.
 * @exception IOException A problem occurred while reading the data.
 */
public TernarySearchTrie(File file)
throws IOException {
this(file,false);
}
/**
 * Constructs a Ternary Search Trie and loads data from a file into it.
 *
 * Each line has the form "word : integer"; a line without a colon counts as
 * one occurrence of the word. Words are trimmed and lower-cased before being
 * stored. When the same word appears on several lines the integers are
 * summed. Lines whose word part is empty are skipped.
 *
 * The file is decoded with the platform default charset, as in the original
 * implementation. The underlying stream is always closed, even when reading
 * or parsing fails.
 *
 * @param file        The File with the data to load into the Trie.
 * @param compression If true, the file is compressed with the GZIP
 *                    algorithm; if false, it is a normal text document.
 * @exception IOException           A problem occurred while reading the data.
 * @exception NumberFormatException If the text after the colon is not an integer.
 */
public TernarySearchTrie(File file, boolean compression) throws IOException {
    this();
    FileInputStream raw = new FileInputStream(file);
    BufferedReader in = null;
    try {
        if (compression) {
            in = new BufferedReader(new InputStreamReader(new GZIPInputStream(raw)));
        } else {
            in = new BufferedReader(new InputStreamReader(raw));
        }
        String line;
        while ((line = in.readLine()) != null) {
            int occur = 1;
            String word = line;
            int pos = line.indexOf(":");
            if (pos != -1) {
                occur = Integer.parseInt(line.substring(pos + 1).trim());
                word = line.substring(0, pos);
            }
            // Normalize once so that the duplicate lookup and the insertion
            // below use exactly the same key.
            String key = StringUtils.toLowerCase(word.trim(), false);
            if (key == null || key.length() == 0) {
                continue; // blank line or empty word: nothing to index
            }
            TSTNode existing = getNode(key);
            if (existing != null && existing.data != null) {
                // The word was seen before: accumulate its count.
                occur += ((Integer) existing.data).intValue();
            }
            getOrCreateNode(key).data = Integer.valueOf(occur);
        }
    } finally {
        // Closing the reader closes the whole chain; fall back to the raw
        // stream if building the reader itself failed (e.g. not a GZIP file).
        if (in != null) {
            in.close();
        } else {
            raw.close();
        }
    }
}
/**
 * Deletes the node passed in as an argument: its data is cleared, and then
 * the node and any ancestors that are no longer needed are pruned by
 * repeatedly calling deleteNodeRecursion until it returns null.
 *
 * @param nodeToDelete The node to delete; null is ignored.
 */
private void deleteNode(TSTNode nodeToDelete) {
    if (nodeToDelete != null) {
        nodeToDelete.data = null;
        TSTNode cursor = nodeToDelete;
        do {
            // Each call returns the next node to examine, or null when done.
            cursor = deleteNodeRecursion(cursor);
        } while (cursor != null);
    }
}
/**
* Recursivelly visits each node to be deleted.
* 递归删除节点:
* 还有一个疑问是当节点左右两边子树都不为null的情况下
* To delete a node, first set its data to null, then pass it into this method,
* then pass the node returned by this method into this method (make
* sure you don't delete the data of any of the nodes returned from this
* method!) and continue in this fashion until the node returned by this
* method is null
.
*
* The TSTNode instance returned by this method will be next node to
* be operated on by deleteNodeRecursion
(This emulates recursive
* method call while avoiding the JVM overhead normally associated
* with a recursive method.)
*
*@param currentNode The node to delete.
*@return The next node to be called in deleteNodeRecursion.
*/
private TSTNode deleteNodeRecursion(TSTNode currentNode) {
if (currentNode
== null) {
return null; }
// can't delete this node if it has a non-null eq kid or data
//当前节点下面存在相等的非空节点,或当前节点存在数值不为null则说明存在其他key值对应该节点则不允许删除该节点
if (currentNode.relatives[TSTNode.EQKID]
!= null
|| currentNode.data
!= null) {
return null;
}
TSTNode currentParent
= currentNode.relatives[TSTNode.PARENT];
//获取当前节点的父节点
boolean lokidNull
= currentNode.relatives[TSTNode.LOKID]
== null;
//判断当前节点左边是否为空
boolean hikidNull
= currentNode.relatives[TSTNode.HIKID]
== null;
//判断当前节点右边是否为空
int childType;
if (currentParent.relatives[TSTNode.LOKID]
== currentNode) {
childType
= TSTNode.LOKID;
}
else
if (currentParent.relatives[TSTNode.EQKID]
== currentNode) {
childType
= TSTNode.EQKID;
}
else
if (currentParent.relatives[TSTNode.HIKID]
== currentNode) {
childType
= TSTNode.HIKID;
}
else {
rootNode
= null;
return null;
}
//下面实现了删除当前节点的
if (lokidNull
&& hikidNull) {
//当前节点的左边为null,右边为null
currentParent.relatives[childType]
= null;
//设置父节点指向的当前节点为null
return currentParent;
//返回父节点
}
if (lokidNull) {
//当前节点的左边为null,右边不为null
currentParent.relatives[childType]
=
currentNode.relatives[TSTNode.HIKID];
currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT]
=
currentParent;
return currentParent;
}
if (hikidNull) {
//当前节点的右边为null,左边不为null,
currentParent.relatives[childType]
=
currentNode.relatives[TSTNode.LOKID];
currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT]
=
currentParent;
return currentParent;
}
//当前节点两边都不为null
int deltaHi
=
currentNode.relatives[TSTNode.HIKID].splitchar
- currentNode.splitchar;
//当前节点右边节点字符与它的字符的差值
int deltaLo
=
currentNode.splitchar
- currentNode.relatives[TSTNode.LOKID].splitchar;
//当前节点左边节点字符与它的字符的差值
int movingKid;
TSTNode targetNode;
if (deltaHi
== deltaLo) {
if (Math.random()
<
0.
5) {
deltaHi
++;
}
else {
deltaLo
++;
}
}
if (deltaHi
> deltaLo) {
movingKid
= TSTNode.HIKID;
targetNode
= currentNode.relatives[TSTNode.LOKID];
}
else {
movingKid
= TSTNode.LOKID;
targetNode
= currentNode.relatives[TSTNode.HIKID];
//
}
while (targetNode.relatives[movingKid]
!= null) {
targetNode
= targetNode.relatives[movingKid];
}
targetNode.relatives[movingKid]
= currentNode.relatives[movingKid];
//下面的指向语句一直让我不明白甚至开始怀疑自己是否真的看懂了三叉树构建的那个方法
//不明白的是为何指向的是目标节点的最后一个节点,而其他节点全部被去掉了???(刚看了一下代码不知这是否和要求插入树的节点按顺序排列依次添加有关的原因,某一个节点左右分支有其特殊性,所以我的顾虑其实根本不是问题)
currentParent.relatives[childType]
= targetNode;
targetNode.relatives[TSTNode.PARENT]
= currentParent;
if (
!lokidNull) {
currentNode.relatives[TSTNode.LOKID]
= null;
}
if (
!hikidNull) {
currentNode.relatives[TSTNode.HIKID]
= null;
}
return currentParent;
}
/**
 * Retrieves the object indexed by a key. The key is trimmed and lower-cased
 * before the lookup.
 *
 * @param key A String index.
 * @return The data stored at the key's node, or null when there is no such
 *         node.
 */
public Object get(String key) {
    String normalized = StringUtils.toLowerCase(key.trim(), false);
    TSTNode found = getNode(normalized);
    return (found != null) ? found.data : null;
}
/**
 * Increments the Integer indexed by key by one, stores the new value and
 * returns it. A node that exists but holds no data is treated as zero, so
 * its new value is 1.
 *
 * @param key A String index; it is trimmed and lower-cased first.
 * @return The new (already incremented) Integer, or null when no node
 *         exists for the key.
 */
public Integer getAndIncrement(String key) {
    String normalized = StringUtils.toLowerCase(key.trim(), false);
    TSTNode node = getNode(normalized);
    if (node == null) {
        return null;
    }
    Integer previous = (Integer) (node.data);
    int next = (previous == null) ? 1 : previous.intValue() + 1;
    Integer updated = new Integer(next);
    put(normalized, updated);
    return updated;
}
/**
 * Returns the key that indexes the node argument.
 *
 * The key is rebuilt by walking parent links up to the root. The node's own
 * character is always included; an ancestor's character is included only
 * when the walk arrives from that ancestor's equal kid. Ancestors reached
 * from a low or high kid contribute nothing.
 *
 * Example: in a trie holding aaa:50, aab:6, aaf:6, ab:12, aba:8 and abc:11,
 * the node carrying 11 yields the key "abc".
 *
 * @param node The node whose index is to be calculated.
 * @return The String that indexes the node argument.
 */
protected String getKey(TSTNode node) {
    StringBuffer reversedKey = new StringBuffer();
    reversedKey.append(node.splitchar);
    TSTNode child = node;
    for (TSTNode ancestor = node.relatives[TSTNode.PARENT];
            ancestor != null;
            ancestor = ancestor.relatives[TSTNode.PARENT]) {
        if (ancestor.relatives[TSTNode.EQKID] == child) {
            reversedKey.append(ancestor.splitchar);
        }
        child = ancestor;
    }
    // Characters were collected leaf-to-root, so flip them.
    return reversedKey.reverse().toString();
}
/**
 * Returns the node indexed by key, or null if that node doesn't exist.
 * The search begins at the root node.
 *
 * @param key A String that indexes the node that is returned.
 * @return The node object indexed by key. This object is an instance of
 *         the inner class TernarySearchTrie.TSTNode.
 */
public TSTNode getNode(String key) {
return getNode(key, rootNode);
}
/**
 * Returns the node indexed by key, or null if that node doesn't exist.
 * The search begins at startNode, and the first character of the key is
 * compared against startNode itself.
 *
 * The key is trimmed and lower-cased before searching. A null key, a null
 * start node or an empty key all yield null.
 *
 * @param key2      A String that indexes the node that is returned.
 * @param startNode The top node defining the subtrie to be searched.
 * @return The node object indexed by key, or null. The node is an instance
 *         of the inner class TernarySearchTrie.TSTNode.
 */
protected TSTNode getNode(String key2, TSTNode startNode) {
    // Check for null before touching the key: trimming a null key would throw.
    if (key2 == null || startNode == null) {
        return null;
    }
    String key = StringUtils.toLowerCase(key2.trim(), false);
    if (key == null || key.length() == 0) {
        return null;
    }
    TSTNode currentNode = startNode;
    int charIndex = 0;
    while (currentNode != null) {
        int charComp = compareCharsAlphabetically(key.charAt(charIndex), currentNode.splitchar);
        if (charComp == 0) {
            charIndex++;
            if (charIndex == key.length()) {
                return currentNode; // every character of the key matched
            }
            currentNode = currentNode.relatives[TSTNode.EQKID];
        } else if (charComp < 0) {
            currentNode = currentNode.relatives[TSTNode.LOKID];
        } else {
            currentNode = currentNode.relatives[TSTNode.HIKID];
        }
    }
    return null; // ran off the trie before the key was fully matched
}
/**
 * Returns the node indexed by key, creating that node if it doesn't exist,
 * and creating any required intermediate nodes (including the root) if they
 * don't exist. The key is used as given; it is not trimmed or lower-cased.
 *
 * @param key A String that indexes the node that is returned.
 * @return The node object indexed by key. This object is an instance of
 *         the inner class TernarySearchTrie.TSTNode.
 * @exception NullPointerException     If the key is null.
 * @exception IllegalArgumentException If the key is an empty String.
 */
protected TSTNode getOrCreateNode(String key)
        throws NullPointerException, IllegalArgumentException {
    if (key == null) {
        throw new NullPointerException("attempt to get or create node with null key");
    }
    if (key.length() == 0) {
        throw new IllegalArgumentException("attempt to get or create node with key of zero length");
    }
    if (rootNode == null) {
        rootNode = new TSTNode(key.charAt(0), null);
    }
    TSTNode cursor = rootNode;
    int index = 0;
    while (true) {
        int cmp = compareCharsAlphabetically(key.charAt(index), cursor.splitchar);
        int slot;
        if (cmp == 0) {
            index++;
            if (index == key.length()) {
                return cursor; // reached the node for the last key character
            }
            slot = TSTNode.EQKID;
        } else {
            slot = (cmp < 0) ? TSTNode.LOKID : TSTNode.HIKID;
        }
        // A missing kid is created for the character still to be matched
        // (after an equal match that is already the next key character).
        if (cursor.relatives[slot] == null) {
            cursor.relatives[slot] = new TSTNode(key.charAt(index), cursor);
        }
        cursor = cursor.relatives[slot];
    }
}
/**
 * Removes the value indexed by key. Also removes all nodes that are rendered
 * unnecessary by the removal of this data. The key is trimmed and
 * lower-cased first; an unknown key is ignored.
 *
 * @param key A String that indexes the object to be removed from the Trie.
 */
public void remove(String key) {
    String normalized = StringUtils.toLowerCase(key.trim(), false);
    TSTNode target = getNode(normalized);
    deleteNode(target);
}
/**
 * Stores a value in the trie, replacing any value already stored under the
 * key. The value may be retrieved using the key. The key is trimmed and
 * lower-cased first, and missing nodes are created.
 *
 * @param key   A String that indexes the object to be stored.
 * @param value The object to be stored in the Trie.
 */
public void put(String key, Object value) {
    String normalized = StringUtils.toLowerCase(key.trim(), false);
    TSTNode node = getOrCreateNode(normalized);
    node.data = value;
}
//////////////////////////////////////////////////////////
// 计算树的节点数
//////////////////////////////////////////////////////////
/**
 * Returns the number of nodes in the trie that have non-null data.
 *
 * @return The number of nodes in the trie that have non-null data.
 */
public
int numDataNodes() {
return numDataNodes(rootNode);
}
/**
 * Returns the number of nodes in the subtrie below and including the
 * starting node. The method counts only nodes that have non-null data.
 *
 * @param startingNode The top node of the subtrie to count.
 * @return The number of nodes with non-null data in the subtrie.
 */
protected
int numDataNodes(TSTNode startingNode) {
return recursiveNodeCalculator(startingNode, true,
0);
}
/**
 * Returns the total number of nodes in the trie. The method counts nodes
 * whether or not they have data.
 *
 * @return The total number of nodes in the trie.
 */
public
int numNodes() {
return numNodes(rootNode);
}
/**
 * Returns the total number of nodes in the subtrie below and including the
 * starting node. The method counts nodes whether or not they have data.
 *
 * @param startingNode The top node of the subtrie to count.
 * @return The total number of nodes in the subtrie.
 */
protected
int numNodes(TSTNode startingNode) {
return recursiveNodeCalculator(startingNode, false,
0);
}
/**
 * Recursively visits each node of a subtrie to count its nodes.
 *
 * @param currentNode The root of the subtrie being counted.
 * @param checkData   If true, only nodes whose data is not null are counted.
 * @param numNodes2   The number of nodes counted so far (the running total
 *                    threaded through the recursion).
 * @return The running total after adding the nodes of this subtrie.
 */
private int recursiveNodeCalculator(TSTNode currentNode, boolean checkData, int numNodes2) {
    if (currentNode == null) {
        return numNodes2;
    }
    int total = numNodes2;
    // LOKID, EQKID and HIKID are the consecutive indices 1..3, so this visits
    // the low, equal and high subtrees in that order.
    for (int kid = TSTNode.LOKID; kid <= TSTNode.HIKID; kid++) {
        total = recursiveNodeCalculator(currentNode.relatives[kid], checkData, total);
    }
    // Finally count the current node itself.
    if (!checkData || currentNode.data != null) {
        total++;
    }
    return total;
}
///////////////////////////////////////////////////////
// 获取相匹配节点列表的方法,(差距匹配、前缀匹配)
///////////////////////////////////////////////////////
/**
 * Returns a List of keys that almost match the argument key. Keys returned
 * will have exactly diff characters that do not match the target key,
 * where diff is equal to the last value passed in as an argument to the
 * setMatchAlmostDiff method.
 *
 * If the matchAlmost method is called before the setMatchAlmostDiff method
 * has been called for the first time, then diff = 0.
 *
 * The number of results is bounded by the default limit configured through
 * setNumReturnValues.
 *
 * @param key The target key.
 * @return A List with the results.
 */
public List matchAlmost(String key) {
return matchAlmost(key, defaultNumReturnValues);
}
/**
 * Returns a List of keys that almost match the argument key. Keys returned
 * will have exactly diff characters that do not match the target key,
 * where diff is equal to the last value passed in as an argument to the
 * setMatchAlmostDiff method.
 *
 * If the matchAlmost method is called before the setMatchAlmostDiff method
 * has been called for the first time, then diff = 0.
 *
 * @param key             The target key.
 * @param numReturnValues The maximum number of values returned by this
 *                        method; any negative value means no limit.
 * @return A List with the results.
 */
protected List matchAlmost(String key, int numReturnValues) {
    int limit = numReturnValues;
    if (limit < 0) {
        limit = -1; // normalize every negative value to "unlimited"
    }
    List results = new Vector();
    return matchAlmostRecursion(rootNode, 0, matchAlmostDiff, key, limit, results, false);
}
/**
* Recursivelly vists the nodes in order to find the ones that almost match a given key.
* 递归找出与给定的key相匹配的节点集合
*@param currentNode The current node. 当前节点 即递归开始节点
*@param charIndex The current char. 当前字符下标
*@param d The number of differences so far. 到目前为止匹配到的不同数
*@param matchAlmostNumReturnValues The maximum number of values in the result List
. 匹配的最大结果个数
*@param matchAlmostResult2 The results so far. 目前为止匹配的结果列表
*@param upTo If true all keys having up to and including matchAlmostDiff
* mismatched letters will be included in the result (including
* a key that is exactly the same as the target string) otherwise
* keys will be included in the result only if they have exactly
* matchAlmostDiff number of mismatched letters.
* true:包含与key完全相同的目标串,否则仅仅返回差距不同个数为matchAlmostDiff的匹配字符串
*@param matchAlmostKey The key being searched. 被搜索的key
*@return A List
with the results.
*/
private List matchAlmostRecursion(
TSTNode currentNode,
int charIndex,
int d,
String matchAlmostKey,
int matchAlmostNumReturnValues,
List matchAlmostResult2,
boolean upTo) {
if ((currentNode
== null)
//到了叶子节点
|| (matchAlmostNumReturnValues
!=
-
1
&& matchAlmostResult2.size()
>
= matchAlmostNumReturnValues)
//matchAlmostNumReturnValues不等于-1并且大于最大返回结果数
|| (d
<
0)
//超出了给定的差距不同的个数
|| (charIndex
>
= matchAlmostKey.length())) {
//长度超出了key的长度
return matchAlmostResult2;
}
int charComp
=
compareCharsAlphabetically(
matchAlmostKey.charAt(charIndex),
currentNode.splitchar);
//开始key取出一个字符与当前节点字符比较
List matchAlmostResult
= matchAlmostResult2;
if ((d
>
0)
|| (charComp
<
0)) {
//差距个数范围内或者比较字符小于当前节点字符
matchAlmostResult
=
matchAlmostRecursion(
currentNode.relatives[TSTNode.LOKID],
//左子树
charIndex,
d,
matchAlmostKey,
matchAlmostNumReturnValues,
matchAlmostResult,
upTo);
}
int nextD
= (charComp
==
0)
? d
: d
-
1;
//如果字符相同则d不变,否则d减1
boolean cond
= (upTo)
? (nextD
>
=
0)
: (nextD
==
0);
//
if ((matchAlmostKey.length()
== charIndex
+
1)
&& cond
&& (currentNode.data
!= null)) {
//找到匹配的key
matchAlmostResult.add(getKey(currentNode));
//获取当前节点的key放入集合中
}
matchAlmostResult
=
matchAlmostRecursion(
currentNode.relatives[TSTNode.EQKID],
//中子树递归匹配
charIndex
+
1,
nextD,
matchAlmostKey,
matchAlmostNumReturnValues,
matchAlmostResult,
upTo);
if ((d
>
0)
|| (charComp
>
0)) {
//右子树递归匹配
matchAlmostResult
=
matchAlmostRecursion(
currentNode.relatives[TSTNode.HIKID],
charIndex,
d,
matchAlmostKey,
matchAlmostNumReturnValues,
matchAlmostResult,
upTo);
}
return matchAlmostResult;
}
/**
 * Returns the entries of the trie whose keys begin with a given prefix,
 * in the trie's alphabetical order. Only nodes having non-null data are
 * included in the List. The number of results is bounded by the default
 * limit configured through setNumReturnValues.
 *
 * @param prefix Each key returned from this method will begin with the
 *               characters in prefix.
 * @return A List with the results.
 */
public List matchPrefix(String prefix) {
return matchPrefix(prefix, defaultNumReturnValues);
}
/**
 * Returns the entries of the trie whose keys begin with a given prefix, in
 * the trie's alphabetical order. Only nodes having non-null data are
 * included in the List.
 *
 * Every element of the returned List is a TSTItem (key plus data). This
 * includes the entry for the prefix itself when the prefix is a stored key,
 * so the list has a single element type, matching what the sorted traversal
 * of the remaining keys produces.
 *
 * @param prefix          Each key returned from this method will begin with
 *                        the characters in prefix.
 * @param numReturnValues The maximum number of values returned from this
 *                        method; any negative value means no limit.
 * @return A List of TSTItem with the results; empty when nothing matches.
 */
public List matchPrefix(String prefix, int numReturnValues) {
    int limit = (numReturnValues < 0) ? -1 : numReturnValues;
    Vector sortKeysResult = new Vector();
    TSTNode startNode = getNode(prefix);
    if (startNode == null) {
        return sortKeysResult;
    }
    // The prefix itself comes first when it is a stored key (and the limit
    // leaves room for at least one result).
    if (startNode.data != null && limit != 0) {
        sortKeysResult.addElement(new TSTItem(getKey(startNode), startNode.data));
    }
    // Everything below the equal kid extends the prefix.
    return sortKeysRecursion(startNode.relatives[TSTNode.EQKID], limit, sortKeysResult);
}
/**
 * Sets the number of characters by which words can differ from the target
 * word when calling the matchAlmost method.
 *
 * The value is clamped to the range 0..3: arguments less than 0 set the
 * difference to 0, and arguments greater than 3 set it to 3.
 *
 * @param diff The number of characters by which words can differ from the
 *             target word.
 */
public void setMatchAlmostDiff(int diff) {
    matchAlmostDiff = Math.max(0, Math.min(3, diff));
}
/**
 * Sets the default maximum number of values returned from the matchPrefix
 * and matchAlmost methods.
 *
 * Set this to -1 (or any negative value) to get an unlimited number of
 * return values. The overloaded versions of those methods that take an
 * explicit maximum ignore this default for that call.
 *
 * @param num The number of values that will be returned when calling the
 *            methods above.
 */
public void setNumReturnValues(int num) {
    if (num < 0) {
        defaultNumReturnValues = -1;
    } else {
        defaultNumReturnValues = num;
    }
}
/**
 * Returns the data-bearing entries of a subtrie sorted in the trie's
 * alphabetical order. This covers the start node and all nodes connected
 * below it.
 *
 * The number of entries returned is limited to numReturnValues. To get a
 * list that isn't limited in size, pass -1 (any negative value works).
 *
 * @param startNode       The top node defining the subtrie to be searched.
 * @param numReturnValues The maximum number of values returned from this
 *                        method.
 * @return A List with the results.
 */
protected List sortKeys(TSTNode startNode, int numReturnValues) {
    int limit = numReturnValues;
    if (limit < 0) {
        limit = -1;
    }
    List collected = new Vector();
    return sortKeysRecursion(startNode, limit, collected);
}
/**
* Returns keys sorted in alphabetical order. This includes the current Node and all
* nodes connected to the current Node.
*
* Sorted keys will be appended to the end of the resulting List
. The result may be
* empty when this method is invoked, but may not be null
.
*
*@param currentNode The current node.
*@param sortKeysNumReturnValues The maximum number of values in the result. 返回最大个数
*@param sortKeysResult2 The results so far. 到目前为止已排序的结果
*@return A List
with the results.
*/
private List sortKeysRecursion(
TSTNode currentNode,
int sortKeysNumReturnValues,
List sortKeysResult2) {
if (currentNode
== null) {
return sortKeysResult2;
}
//下面从左节点开始递归,接着是中节点递归,最后右节点递归,这样就保证了key按字母顺序排序
//从左节点开始递归
List sortKeysResult
=
sortKeysRecursion(
currentNode.relatives[TSTNode.LOKID],
sortKeysNumReturnValues,
sortKeysResult2);
//已经达到返回个数,则返回结果
if (sortKeysNumReturnValues
!=
-
1
&& sortKeysResult.size()
>
= sortKeysNumReturnValues) {
return sortKeysResult;
}
//节点顺序不为null则把当前节点的key和data实例化一个TSTItem元素插入到列表中
if (currentNode.data
!= null) {
sortKeysResult.add(
new TSTItem(getKey(currentNode),currentNode.data));
}
//从中节点开始递归
sortKeysResult
=
sortKeysRecursion(
currentNode.relatives[TSTNode.EQKID],
sortKeysNumReturnValues,
sortKeysResult);
//从右节点开始递归
return sortKeysRecursion(
currentNode.relatives[TSTNode.HIKID],
sortKeysNumReturnValues,
sortKeysResult);
}
/**
 * Returns an enumeration over the trie, backed by the inner Iterator class.
 *
 * @return a new Iterator positioned before the first element.
 */
public Enumeration keys() {
return
new Iterator();
}
/**
 * Enumeration over the trie's nodes that carry non-null data.
 *
 * Elements are TSTNode instances. Nodes are visited depth-first using an
 * explicit stack: a node first, then its low, equal and high subtrees.
 *
 * The enumeration follows the usual Enumeration contract: hasMoreElements()
 * may be called any number of times without skipping elements, and
 * nextElement() advances on its own, throwing NoSuchElementException once
 * the enumeration is exhausted.
 */
public class Iterator implements Enumeration {
    /** Nodes still to be visited; created lazily on first use. */
    private Stack stack;
    /** The look-ahead element that nextElement() will return, or null. */
    private TSTNode currentNode;

    public Iterator() {
        this.currentNode = null;
        this.stack = null;
    }

    public Object nextElement() {
        if (!hasMoreElements()) {
            throw new java.util.NoSuchElementException("out of range");
        }
        TSTNode result = currentNode;
        currentNode = null; // consumed: forces the next call to advance
        return result;
    }

    public boolean hasMoreElements() {
        if (stack == null) {
            // We are at the beginning: start the traversal from the root.
            stack = new Stack();
            if (rootNode != null) {
                stack.push(rootNode);
            }
        }
        if (currentNode == null) {
            currentNode = advance();
        }
        return currentNode != null;
    }

    /** Pops nodes until one with non-null data is found; null when exhausted. */
    private TSTNode advance() {
        while (!stack.isEmpty()) {
            TSTNode node = (TSTNode) stack.pop();
            // Push high, equal, low so that low is popped (visited) first.
            if (node.relatives[TSTNode.HIKID] != null) {
                stack.push(node.relatives[TSTNode.HIKID]);
            }
            if (node.relatives[TSTNode.EQKID] != null) {
                stack.push(node.relatives[TSTNode.EQKID]);
            }
            if (node.relatives[TSTNode.LOKID] != null) {
                stack.push(node.relatives[TSTNode.LOKID]);
            }
            if (node.data != null) {
                return node;
            }
        }
        return null;
    }
}
/////////////////////////////////////////////////////////////////
// 根据三叉树生成字典写入文件
/////////////////////////////////////////////////////////////////
/**
* Recursively insert the median first and then the median of the
* lower and upper halves, and so on in order to get a balanced
* tree. The array of keys is assumed to be sorted in ascending
* order.
* 递归首先插入中间然后插入上半部分,下半部分,最后得到一个平衡树
* keys数组假装是按升序排序
* @param fp BufferedWriter
* @param k key列表
* @param offset 开始偏移量
* @param n 总记录数
* @throws Exception
*/
protected
void outputBalanced(BufferedWriter fp,List k,
int offset,
int n)
throws Exception
{
int m;
if (n
<
1) {
//总个数小于1
return;
}
m
= n
>>
1;
//右移
TSTItem item
= (TSTItem)(k.get(m
+ offset));
fp.write(item.key
+
" : "
+item.data);
fp.write(
'/n');
outputBalanced(fp,k,offset, m);
outputBalanced(fp,k, offset
+ m
+
1, n
- m
-
1);
}
/**
 * Writes all data-bearing entries of the trie to a file, one
 * "key : data" line per entry, in balanced order (see outputBalanced).
 * The writer is closed even when writing fails.
 *
 * @param sFilename path of the file to write.
 * @throws Exception if the file cannot be opened or written.
 */
public void save(String sFilename) throws Exception {
    BufferedWriter fp = new BufferedWriter(new FileWriter(sFilename));
    try {
        // All entries with non-null data, sorted in the trie's order.
        List ret = sortKeys(rootNode, -1);
        // Output in balance order.
        outputBalanced(fp, ret, 0, ret.size());
    } finally {
        fp.close();
    }
}
/////////////////////////////////////////////////////////////////
// 从文件生成三叉树,并读取文件里的内容
/////////////////////////////////////////////////////////////////
/**
 * Loads a dictionary file into a trie, then looks up every key of that file
 * and prints the node count, the number of keys and the elapsed lookup time
 * in milliseconds to standard output.
 *
 * Keys are taken as the text before the first ':' of each line; lines that
 * yield no token are skipped. A failure while collecting the keys is
 * reported with a stack trace and the benchmark continues with the keys
 * read so far.
 *
 * @param dicFile path of the dictionary file.
 * @throws Exception if the dictionary cannot be loaded into the trie.
 */
public static void benchMark(String dicFile) throws Exception {
    TernarySearchTrie dic = new TernarySearchTrie(new File(dicFile));
    System.out.println("count nodes num:" + dic.numNodes());

    ArrayList table = new ArrayList();
    try {
        BufferedReader fpSource = new BufferedReader(new FileReader(dicFile));
        try {
            String sParagraph;
            while ((sParagraph = fpSource.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(sParagraph, ":");
                if (st.hasMoreTokens()) {
                    table.add(st.nextToken()); // collect the key
                }
            }
        } finally {
            fpSource.close();
        }
    } catch (Exception e) {
        // Best effort: benchmark whatever keys were collected.
        e.printStackTrace();
    }

    System.out.println("begin test dic:" + dicFile + " table size:" + table.size());
    long start = System.currentTimeMillis();
    for (int i = 0; i < table.size(); ++i) {
        String word = (String) (table.get(i));
        dic.get(word); // look up the data stored for the key
    }
    long end = System.currentTimeMillis();
    System.out.println("time cost:" + (end - start));
}
}
注意:通过调试代码,我发现自己一直有一个理解上的误区:一直以为中间节点(relatives[TSTNode.EQKID])存放的是与其父节点字符相同的节点,但这其实是一种错误的理解。看下面的代码,它比较的字符与创建节点时使用的字符并不是同一个——它是先创建后比较: