链表
每个元素都是一个对象,每个对象称为一个节点,里面有两个属性:自身的值,以及指向下一个节点的指针 next。各个节点通过指针相互连接,所以要想知道某一个节点的值,必须从头节点开始沿着 next 逐个查找过去。而插入和删除时只需改变该节点及其相邻节点的指针即可,其特点概括起来就是:查找慢,但插入、删除快
简单例子
class Node(object):
    """A singly linked list node: a payload plus a link to the next node."""

    def __init__(self, item):
        self.next = None  # stays None until the node is linked to a successor
        self.item = item


# Create the three nodes first, then chain them as a -> b -> c.
a, b, c = Node(10), Node(20), Node(30)
a.next, b.next = b, c  # a's successor is b, b's successor is c

# Every value is reached by following `next` starting from the head node.
print(a.item)  # 10
print(a.next.item)  # 20
print(a.next.next.item)  # 30
遍历链表
class Node(object):
    """A singly linked list node: a payload plus a link to the next node."""

    def __init__(self, item):
        self.next = None
        self.item = item


def traversal(head):
    """Print the item of every node reachable from ``head``, one per line."""
    node = head  # the walk starts at the head node
    while True:
        if node is None:
            return
        print(node.item)
        node = node.next  # step to the successor


# Build 10 -> 20 -> 30 and walk it.
head = Node(10)
second = head.next = Node(20)
second.next = Node(30)
traversal(head)
建立链表
- 头插法
class Node(object):
    """A singly linked list node: a payload plus a link to the next node."""

    def __init__(self, item):
        self.next = None
        self.item = item


def createLinkListF(li):
    """Build a linked list from ``li`` by head insertion.

    A dummy head node holding 0 is created first and every new node is
    spliced in directly behind it, so the values end up in reverse order.
    The dummy head is what gets returned.
    """
    head = Node(0)
    for value in li:
        fresh = Node(value)
        # Splice `fresh` between the dummy head and the current first node.
        fresh.next, head.next = head.next, fresh
    return head


def traversal(head):
    """Print the item of every node reachable from ``head``, one per line."""
    node = head
    while True:
        if node is None:
            return
        print(node.item)
        node = node.next


li = [1, 2, 3, 4, 5, 6]
l = createLinkListF(li)
traversal(l)  # 0,6,5,4,3,2,1
- 尾插法
class Node(object):
    """A singly linked list node: a payload plus a link to the next node."""

    def __init__(self, item):
        self.next = None
        self.item = item


def createLinkListR(li):
    """Build a linked list from ``li`` by tail insertion.

    A dummy head node holding 0 is created first and every new node is
    appended behind the current last node, so the input order is kept.
    The dummy head is what gets returned.
    """
    head = Node(0)
    tail = head
    for value in li:
        tail.next = Node(value)  # append behind the current last node
        tail = tail.next  # the new node is now the last one
    return head


def traversal(head):
    """Print the item of every node reachable from ``head``, one per line."""
    node = head
    while True:
        if node is None:
            return
        print(node.item)
        node = node.next


li = [1, 2, 3, 4, 5, 6]
l = createLinkListR(li)
traversal(l)  # 0,1,2,3,4,5,6
基于单链表实现的数组
class List:
    """Array-like container implemented on top of a singly linked list.

    ``first`` is a sentinel head node that never stores user data; the real
    elements start at ``first.next``.  Public indexes are 0-based.
    """

    class Node:
        """A single cell of the linked list."""

        def __init__(self, element, next):
            self.element = element
            self.next = next

    def __init__(self):
        self.first = self.Node(None, None)  # sentinel head
        self.size = 0  # number of stored elements

    def _get_node(self, index):
        """Return the node at internal position ``index``.

        Position 0 is the sentinel and positions 1..size are the elements, so
        ``_get_node(i)`` is the predecessor of the element with public index i.

        Raises:
            Exception: if ``index`` is outside ``0..size``.
        """
        if not 0 <= index <= self.size:
            raise Exception("index out of list")
        node = self.first
        for _ in range(index):
            node = node.next
        return node

    def _check_index(self, index):
        """Raise Exception unless ``index`` addresses an existing element."""
        if not 0 <= index < self.size:
            raise Exception("index out of list")

    def _elements(self):
        """Yield the stored elements from first to last."""
        node = self.first.next
        while node is not None:
            yield node.element
            node = node.next

    def index_of(self, index):
        """Return the element stored at ``index``.

        Raises:
            Exception: if ``index`` is not in ``0..size-1``.
        """
        # Without this check index -1 would read the sentinel and return None.
        self._check_index(index)
        return self._get_node(index + 1).element

    def add(self, element):
        """Append ``element`` at the end."""
        last = self._get_node(self.size)
        last.next = self.Node(element, None)
        self.size += 1

    def insert(self, index, element):
        """Insert ``element`` so that it ends up at position ``index``.

        Raises:
            Exception: if ``index`` is not in ``0..size``.
        """
        before = self._get_node(index)
        before.next = self.Node(element, before.next)
        self.size += 1

    def delete(self, index):
        """Remove the element at ``index``.

        Raises:
            Exception: if ``index`` is not in ``0..size-1``.
        """
        self._check_index(index)
        before = self._get_node(index)
        before.next = before.next.next
        self.size -= 1

    def update(self, index, element):
        """Overwrite the element at ``index`` with ``element``.

        Raises:
            Exception: if ``index`` is not in ``0..size-1``.
        """
        # Without this check index -1 would overwrite the sentinel's element.
        self._check_index(index)
        self._get_node(index + 1).element = element

    def get_size(self):
        """Return the number of stored elements."""
        return self.size

    def pop(self):
        """Remove the last element and return it.

        Raises:
            Exception: if the list is empty.
        """
        if self.size == 0:
            raise Exception("index out of list")
        # Fetch the second-to-last node so the last one can be unlinked.
        before = self._get_node(self.size - 1)
        element = before.next.element
        before.next = None
        self.size -= 1
        return element

    def clear(self):
        """Drop every element by detaching the chain from the sentinel."""
        self.first.next = None
        self.size = 0

    def reverse(self):
        """Reverse the list in place.

        Runs iteratively in one pass by flipping every ``next`` pointer, so an
        empty list is a no-op and long lists cannot exhaust the recursion limit.
        """
        previous = None
        node = self.first.next
        while node is not None:
            following = node.next
            node.next = previous
            previous = node
            node = following
        self.first.next = previous

    def sort(self):
        """Sort ascending with bubble sort.

        Element values are swapped rather than the nodes themselves, because
        swapping nodes means relinking their predecessors as well (see the
        commented-out variant below).
        """
        for _ in range(self.size):
            swapped = False
            node = self.first.next
            while node is not None and node.next is not None:
                if node.element > node.next.element:
                    node.element, node.next.element = node.next.element, node.element
                    swapped = True
                node = node.next
            if not swapped:
                break  # a pass without swaps means the list is already sorted

    # Alternative kept from the original notes: bubble sort that swaps the
    # nodes themselves, which has to treat the first node and the inner nodes
    # separately.  NOTE(review): this sketch has not been verified.
    # def sort(self):
    #     for i in range(self.size):
    #         node = self.first.next
    #         if node.next:
    #             if node.element > node.next.element:
    #                 head = node.next
    #                 node.next = node.next.next
    #                 head.next = node
    #                 self.first.next = head
    #         while node.next:
    #             if node.next.next:
    #                 if node.next.element > node.next.next.element:
    #                     head = node.next.next
    #                     node.next.next = head.next
    #                     head.next = node.next
    #                     node.next = head
    #             node = node.next

    def __str__(self):
        """Render the elements as ``[a, b, c]``."""
        return "[" + ", ".join(str(element) for element in self._elements()) + "]"
if __name__ == '__main__':
    # Smoke test that walks through every public operation of List.
    l = List()
    for value in (1, 2, 3, 4):
        l.add(value)
    l.insert(1, 5)
    l.insert(0, 6)
    print(l.index_of(5))
    print(l, l.size)

    l.delete(5)
    print(l, l.size)

    l.update(2, 100)
    print(l, l.size)

    print(l.pop())
    print(l, l.size)

    # The remaining operations take no arguments: run each and show the result.
    for operation in (l.reverse, l.sort, l.clear):
        operation()
        print(l, l.size)
双链表
和单链表相比,多了个指向前一个节点的指针,如下:
class Node(object):
    """A doubly linked list node: a payload plus links in both directions."""

    def __init__(self, item):
        # Both links start unset; `prior` is the predecessor, `next` the successor.
        self.prior = None
        self.next = None
        self.item = item
栈
后进先出的数据结构,实现起来很简单
简单示例
class Stack:
    """LIFO stack backed by a Python list; the top is the list's last item."""

    def __init__(self):
        self.stack = list()

    def push(self, data):
        """Put ``data`` on top of the stack."""
        self.stack.append(data)

    def pop(self):
        """Remove the top item and return it (IndexError when empty)."""
        top = self.stack.pop()
        return top

    def get(self):
        """Return the top item without removing it (IndexError when empty)."""
        top = self.stack[-1]
        return top
if __name__ == '__main__':
    # Push 1, 2, 3, pop twice, then peek at what is left on top.
    s = Stack()
    for value in (1, 2, 3):
        s.push(value)
    for _ in range(2):
        print(s.pop())
    print(s.get())
更加原始的实现示例
前面的实现是基于列表自带的api实现的,如果希望更加接近原始的实现方式,则看如下示例:
class Stack:
    """LIFO stack on a manually grown fixed-size array.

    Attributes:
        size: current capacity of the backing array.
        stack: the backing array; slots at ``index`` and beyond are unused.
        index: number of stored elements (the top lives at ``index - 1``).
    """

    def __init__(self):
        self.size = 8
        self.stack = [None] * self.size
        self.index = 0

    def push(self, v):
        """Put ``v`` on top, growing the backing array first when it is full."""
        self._size_check()
        self.stack[self.index] = v
        self.index += 1

    def pop(self):
        """Remove the top element and return it.

        Raises:
            Exception: if the stack is empty.
        """
        self._empty_check()
        self.index -= 1
        v = self.stack[self.index]
        self.stack[self.index] = None  # do not keep the popped object alive
        return v

    def get(self):
        """Return the top element without removing it.

        Raises:
            Exception: if the stack is empty (previously this returned
                whatever happened to be in the last array slot).
        """
        self._empty_check()
        return self.stack[self.index - 1]

    def _size_check(self):
        """Grow the backing array by about 1.5x when there is no free slot."""
        if self.index < self.size:
            return
        old = self.stack
        # max() guarantees progress even if the capacity were ever tiny.
        self.size = max(int(1.5 * self.size), self.size + 1)
        self.stack = [None] * self.size
        self.stack[:self.index] = old[:self.index]

    def _empty_check(self):
        """Raise Exception when there is nothing on the stack."""
        if self.index < 1:
            raise Exception("当前栈为空!")

    def __str__(self):
        """Render the stored elements from bottom to top."""
        return str(self.stack[:self.index])
if __name__ == '__main__':
    # Fill the stack, peek, empty it again and verify it is still usable.
    s = Stack()
    for value in (5, 3, 2):
        s.push(value)
    print(s.get())
    for _ in range(3):
        s.pop()
    s.push(1)
    print(s)
队列
先进先出的数据结构,和栈差不多
简单示例
class Queue:
    """FIFO queue backed by a Python list; the front is the list's first item."""

    def __init__(self):
        self.queue = list()

    def put(self, data):
        """Append ``data`` at the back of the queue."""
        self.queue.append(data)

    def poll(self):
        """Remove the front item and return it (IndexError when empty)."""
        front = self.queue.pop(0)
        return front

    def get(self):
        """Return the front item without removing it (IndexError when empty)."""
        front = self.queue[0]
        return front
if __name__ == "__main__":
    # Enqueue 1, 2, 3, dequeue twice, then peek at the new front.
    q = Queue()
    for value in (1, 2, 3):
        q.put(value)
    for _ in range(2):
        print(q.poll())
    print(q.get())
上面的是基于动态数组实现的,每当删除第一个元素时,都需要将后面的元素往前挪,效率偏低,而使用链表实现则能够很好地提升效率,示例如下:
class Queue:
    """FIFO queue on a singly linked chain with a sentinel head.

    ``head`` is a sentinel that never stores data; ``tail`` is the last node,
    or ``head`` itself while the queue is empty.
    """

    class Node:
        """Queue cell holding value ``v``."""

        def __init__(self, v):
            self.next = None
            self.prev = None  # declared but not maintained by Queue
            self.v = v

    def __init__(self):
        self.size = 0
        self.head = self.Node(None)
        self.tail = self.head

    def add(self, v):
        """Append ``v`` behind the current tail."""
        fresh = self.Node(v)
        self.tail.next = fresh
        self.tail = fresh
        self.size += 1

    def pop(self):
        """Remove the front value and return it; returns None when empty."""
        if self.size <= 0:
            return
        front = self.head.next
        self.head.next = front.next
        if front is self.tail:
            # The last element is leaving: fall back to the sentinel as tail.
            self.tail = self.head
        self.size -= 1
        return front.v

    def get_size(self):
        """Return the number of queued values."""
        return self.size
树
二叉搜索树
对任意节点而言,其左子树中所有节点的值都比它小,右子树中所有节点的值都比它大
简单实现
class BST:
    """Binary search tree over int/float values, duplicates ignored.

    Every value in a node's left subtree is smaller than the node's value and
    every value in its right subtree is larger.  ``root`` always exists; while
    the tree is empty it is a placeholder whose ``val`` is None.
    """

    class Node:
        """Tree node with links to its parent and both children."""

        def __init__(self, val, parent):
            self.val = val
            self.parent = parent
            self.left = None
            self.right = None

        def __repr__(self):
            return f"{self.val}(left:({self.left})right:({self.right}))"

    def __init__(self, tree=None):
        """Create a tree, optionally inserting every value of ``tree`` in order.

        The iterable is only read; the caller's list is no longer emptied.
        """
        self.root = self.Node(None, None)
        self.size = 0
        for val in tree or ():
            self.add(val)

    def add(self, val):
        """Insert ``val`` and return it; an already stored value is left alone.

        Raises:
            Exception: if ``val`` is not an int or float.
        """
        self._val_check(val)
        parent = self._find_parent(val)
        if parent is None:
            # Empty tree: the placeholder root takes the value.
            self.root.val = val
        elif val == parent.val:
            return val
        elif val > parent.val:
            parent.right = self.Node(val, parent)
        else:
            parent.left = self.Node(val, parent)
        self.size += 1
        return val

    def _find_parent(self, val):
        """Locate where ``val`` lives or would be attached.

        Returns the node holding ``val`` if present, otherwise the node that
        would become its parent, or None when the tree is empty.
        """
        if self.size == 0:
            return None
        parent = None
        node = self.root
        # Compare against None explicitly so a stored 0 is not mistaken for "no node".
        while node is not None:
            if val == node.val:
                return node
            parent = node
            node = node.right if val > node.val else node.left
        return parent

    def remove(self, val):
        """Remove ``val`` if it is stored; do nothing otherwise.

        Raises:
            Exception: if ``val`` is not an int or float.
        """
        self._val_check(val)
        node = self._find_parent(val)
        if node is None or node.val != val:
            return
        if self._has_two_child(node):
            # Degree 2: overwrite with the in-order predecessor and remove that
            # node instead; the predecessor has at most one child.
            prev = self._find_prev(node)
            node.val = prev.val
            node = prev
        # Degree 0 or 1: splice the (possibly missing) child into the node's place.
        child = node.left if node.left is not None else node.right
        self._replace_node(node, child)
        self.size -= 1

    def _replace_node(self, node, new_node):
        """Put ``new_node`` (may be None) where ``node`` hangs in the tree."""
        parent = node.parent
        if new_node is not None:
            new_node.parent = parent
        if parent is None:
            # `node` is the root, which has no parent slot to rewrite.
            if new_node is None:
                node.val = None  # the tree is empty again: keep a placeholder root
            else:
                self.root = new_node
        elif parent.left is node:
            parent.left = new_node
        elif parent.right is node:
            parent.right = new_node

    def _find_prev(self, node):
        """Return the in-order predecessor of ``node``, or None if it is first."""
        if node.left is not None:
            # Rightmost node of the left subtree.
            prev = node.left
            while prev.right is not None:
                prev = prev.right
            return prev
        # Otherwise the nearest ancestor that has `node` in its right subtree.
        while node.parent is not None and node.parent.left is node:
            node = node.parent
        return node.parent

    def get_height(self):
        """Return the height of the tree; an empty tree has height 0."""
        if self.size == 0:
            return 0
        return self._get_node_height(self.root)

    def _get_node_height(self, node):
        """Return the height of ``node``: tallest child height plus one."""
        if node is None:
            return 0
        return max(self._get_node_height(node.left), self._get_node_height(node.right)) + 1

    def constains(self, val):
        """Return True if ``val`` is stored in the tree.

        Raises:
            Exception: if ``val`` is not an int or float.
        """
        self._val_check(val)
        node = self._find_parent(val)
        return node is not None and node.val == val

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    contains = constains

    def is_empty(self):
        """Return True if the tree stores no values."""
        return self.size == 0

    def get_size(self):
        """Return the number of stored values."""
        return self.size

    def clear(self):
        """Remove every value, leaving only the placeholder root."""
        self.root.val = None
        self.root.left = None
        self.root.right = None
        self.size = 0

    def _report(self, res, output):
        """Optionally print the values of ``res`` and return the node list."""
        if output:
            print([node.val for node in res])
        return res

    def pre_order(self, output=True):
        """Return the nodes in pre-order (node, left, right); print values if ``output``."""
        def travel(node):
            if node is None:
                return
            res.append(node)
            travel(node.left)
            travel(node.right)

        res = []
        if self.size:
            travel(self.root)
        return self._report(res, output)

    def mid_order(self, output=True):
        """Return the nodes in in-order (left, node, right); print values if ``output``."""
        def travel(node):
            if node is None:
                return
            travel(node.left)
            res.append(node)
            travel(node.right)

        res = []
        if self.size:
            travel(self.root)
        return self._report(res, output)

    def next_order(self, output=True):
        """Return the nodes in post-order (left, right, node); print values if ``output``."""
        def travel(node):
            if node is None:
                return
            travel(node.left)
            travel(node.right)
            res.append(node)

        res = []
        if self.size:
            travel(self.root)
        return self._report(res, output)

    def layer_order(self, output=True):
        """Return the nodes level by level; print values if ``output``."""
        res = [self.root] if self.size else []
        # `res` doubles as the BFS queue: iterating a list while appending to
        # it visits the appended nodes too, without any O(n) pop(0).
        for node in res:
            if node.left is not None:
                res.append(node.left)
            if node.right is not None:
                res.append(node.right)
        return self._report(res, output)

    def _val_check(self, val):
        """Raise Exception unless ``val`` is an int or float (bool is rejected)."""
        if isinstance(val, bool) or not isinstance(val, (int, float)):
            raise Exception(f"传入类型必须为int/float型,你传入的是:{type(val)}")

    def _is_root(self, node):
        """Return True if ``node`` is the root node."""
        return node is self.root

    def _has_one_child(self, node):
        """Return True if ``node`` has exactly one child."""
        return self._has_child(node) and not self._has_two_child(node)

    def _has_two_child(self, node):
        """Return True if ``node`` has both children."""
        return self._has_left_child(node) and self._has_right_child(node)

    def _has_child(self, node):
        """Return True if ``node`` has at least one child."""
        return self._has_left_child(node) or self._has_right_child(node)

    def _has_left_child(self, node):
        """Return True if ``node`` exists and has a left child."""
        return node is not None and node.left is not None

    def _has_right_child(self, node):
        """Return True if ``node`` exists and has a right child."""
        return node is not None and node.right is not None

    def __repr__(self):
        return f"{self.root}"
if __name__ == '__main__':
    # Build a tree, inspect it, remove the root value, then refill it after clear().
    t = BST([12, 16, 1, 4, 8, 11, 14, 21, 17, 16, 15, 18])
    print(t)
    print(t.get_height())
    print(t.get_size())
    for probe in (2, 23, 12, 15, 19):
        print(t.constains(probe))
    for walk in (t.layer_order, t.pre_order, t.mid_order, t.next_order):
        walk()

    t.remove(12)
    t.layer_order()

    t.clear()
    print(t.is_empty())
    for value in (5, 9, 3, 10, 6):
        t.add(value)
    t.layer_order()
哈希表
一般是将数组和红黑树(或者链表之类的)进行结合
- 添加元素时,首先通过哈希函数进行计算后,将键和值添加到对应地址的红黑树(链表)上,假如发生hash冲突时,则在对应地址的红黑树(链表)上再添加一个新的节点
- 查询时查找对应hash值地址上的红黑树(链表)节点,找到指定值时返回
- 删除时查找对应hash值地址上的红黑树(链表)节点,将该节点删除
简单实现
这里基于动态数组+双向链表实现一个简单的hashmap:
class HashMap:
    """Simple hash map: a dynamic array of buckets, each a doubly linked chain.

    Every array slot holds a sentinel root node; entries hang behind it.
    ``size`` counts the buckets currently in use (not the entries), and it is
    this count that drives growing and shrinking of the array.
    """

    class Node:
        """Chain node holding a key, a value and links to both neighbours."""

        def __init__(self, k, v, prev):
            self.k = k
            self.v = v
            self.prev = prev
            self.next = None

        def has_next(self):
            """Return True if another node follows this one."""
            return self.next is not None

    def __init__(self):
        """Start with 16 buckets, each holding only its sentinel root."""
        self.max_size = 2 << 3
        self.size = 0
        self.map = self._new_buckets()

    def _new_buckets(self):
        """Return ``max_size`` fresh sentinel roots."""
        return [self.Node(None, None, None) for _ in range(self.max_size)]

    def put(self, k, v):
        """Store ``v`` under ``k``, overwriting an existing entry for ``k``."""
        self._max_size_check()
        node = self._find(k, add=True)
        node.k = k
        node.v = v

    def get(self, k):
        """Return the value stored under ``k``, or None if there is none."""
        node = self._find(k)
        if node is None:
            return None
        return node.v

    def remove(self, k):
        """Delete the entry for ``k``; return True if one existed."""
        node = self._find(k)
        if node is None:
            return False
        node.prev.next = node.next
        if node.next is not None:
            # Keep the back link consistent, otherwise a later removal of the
            # successor would relink the already removed node instead of the chain.
            node.next.prev = node.prev
        self._after_remove(k)
        return True

    def _after_remove(self, k):
        """Account for a bucket that became empty and shrink if appropriate."""
        root = self.map[self._get_hash(k)]
        if not root.has_next():
            self.size -= 1
            self._min_size_check()

    def _find(self, k, add=False):
        """Return the chain node for ``k``.

        With ``add=False`` a missing key yields None and nothing is modified.
        With ``add=True`` a missing key gets a fresh, still empty node appended
        to its bucket (counting the bucket as used if it was empty before).
        """
        node = self.map[self._get_hash(k)]
        if add and not node.has_next():
            self.size += 1  # this bucket is about to become used
        while node.has_next():
            node = node.next
            if node.k == k:
                return node
        if not add:
            return None
        node.next = self.Node(None, None, node)
        return node.next

    def _get_hash(self, k):
        """Map ``k`` to its bucket index."""
        return hash(k) % self.max_size

    def _max_size_check(self):
        """Double the array once more than 2/3 of the buckets are in use."""
        if self.size <= self.max_size / 3 * 2:
            return
        self.max_size <<= 1
        self._copy_map()

    def _min_size_check(self):
        """Halve the array once fewer than 1/5 of the buckets are in use."""
        # With the power-of-two capacities produced here this stops at 4,
        # because 4 // 5 == 0 can never exceed ``size``.
        if self.size >= self.max_size // 5:
            return
        self.max_size >>= 1
        self._copy_map()

    def _copy_map(self):
        """Rebuild the bucket array for the current ``max_size``.

        Entries are re-inserted directly rather than through ``put`` so that a
        rehash can never trigger another resize while it is running.
        """
        old_buckets = self.map
        self.size = 0
        self.map = self._new_buckets()
        for root in old_buckets:
            node = root.next
            while node is not None:
                fresh = self._find(node.k, add=True)
                fresh.k = node.k
                fresh.v = node.v
                node = node.next

    def __str__(self):
        """Render one line per bucket, framed by two rules of dashes."""
        width = len(str(self.max_size))
        lines = ["-" * 100]
        for index, root in enumerate(self.map):
            parts = [f"{index:>0{width}}"]
            node = root.next
            while node is not None:
                parts.append(f"{node.k}:{node.v}")
                node = node.next
            lines.append(" -> ".join(parts))
        lines.append("-" * 100)
        return "\n".join(lines)
if __name__ == "__main__":
    # Insert a few entries, look one up, remove it, then grow and shrink the map.
    hashmap = HashMap()
    for key, value in (("a", 1), ("ac", 2), ("d", 100), ("aaa", 15)):
        hashmap.put(key, value)
    print(hashmap)

    print(hashmap.get("aaa"))
    print(hashmap.remove("aaa"))
    print(hashmap.get("aaa"))
    print(hashmap.remove("add"))

    later = (
        ("acd", 21),
        ("dcsa", 13),
        ("ddascsa", 53),
        ("dcgfdsa", 34),
        ("dgfdgdcsa", 33),
        ("dgfdgdcsa", 35),  # same key again: overwrites 33
    )
    for key, value in later:
        hashmap.put(key, value)
    print(hashmap)

    for key in ("acd", "dcsa", "ddascsa", "dcgfdsa"):
        hashmap.remove(key)
    print(hashmap)
堆
一种树形数据结构,其中较为常见的就是二叉堆,其根据自身特性主要又分为大顶堆和小顶堆(大顶堆:所有节点的值都不小于其子节点的值;小顶堆:所有节点的值都不大于其子节点的值)。由于堆的性质,很适合在一些取最值的场景当中使用(堆顶必然是最大值/最小值)
大顶堆简单实现
class MaxHeap:
    """Binary max-heap stored in a flat array.

    To turn this into a min-heap only the comparison in ``_compare`` has to
    be changed.  ``_hsize`` is the capacity of the backing array ``_heap`` and
    ``_size`` the number of elements actually stored.
    """

    def __init__(self, eles=None):
        """Create a heap, optionally building it from the iterable ``eles``.

        The elements are copied, so the caller's sequence is never modified.
        """
        self._hsize = 8
        self._set_size(0)
        self._heap = [None] * self._hsize
        if eles:
            # Three ways to build a heap in bulk:
            #   1. add the elements one by one (slowest);
            #   2. sift up from the top; 3. sift down from the bottom (used here).
            self._heap = list(eles)
            self._hsize = len(self._heap)
            self._set_size(self._hsize)
            self.heapify()

    def heapify(self):
        """Restore the heap property over the whole array.

        Sifts down every non-leaf node from the last one up to the root,
        which is cheaper than sifting every node up from the top.
        """
        # Parentheses matter: `size >> 1 - 1` would parse as `size >> 0`.
        for i in range((self.get_size() >> 1) - 1, -1, -1):
            self._sift_down(i)

    def get(self):
        """Return the largest element without removing it.

        Raises:
            Exception: if the heap is empty.
        """
        self._empty_check()
        return self._heap[self._root]

    def add(self, v):
        """Insert ``v``: place it at the end, then sift it up.

        Raises:
            Exception: if ``v`` is not an int or float.
        """
        self._is_comparable(v)
        self._set_size(self.get_size() + 1)
        self._size_check()
        last = self.get_size() - 1
        self._heap[last] = v
        self._sift_up(last)

    def remove(self):
        """Remove and return the largest element.

        The last element is moved to the root and sifted down.

        Raises:
            Exception: if the heap is empty.
        """
        self._empty_check()
        v = self._heap[self._root]
        last = self.get_size() - 1
        self._heap[self._root] = self._heap[last]
        self._heap[last] = None  # do not keep a stale copy in the free slot
        self._set_size(last)
        self._sift_down(self._root)
        return v

    def replace(self, v):
        """Swap the largest element for ``v`` and return the old maximum.

        Overwriting the root and sifting down once is cheaper than a
        ``remove()`` followed by an ``add()``.

        Raises:
            Exception: if the heap is empty.
        """
        self._empty_check()
        old = self._heap[self._root]
        self._heap[self._root] = v
        self._sift_down(self._root)
        return old

    def _sift_up(self, cur):
        """Move the element at ``cur`` up while its parent is smaller."""
        while cur > self._root:
            parent = self._get_parent(cur)
            if self._compare(parent, cur):
                break
            self._swap(parent, cur)
            cur = parent

    def _sift_down(self, cur):
        """Move the element at ``cur`` down while a child is larger."""
        while cur < self.get_size():
            max_child = self._get_left(cur)
            if max_child is None:
                break  # no left child means no children at all
            right_child = self._get_right(cur)
            if right_child is not None and self._compare(right_child, max_child):
                max_child = right_child
            if self._compare(cur, max_child):
                break
            self._swap(cur, max_child)
            cur = max_child

    def clear(self):
        """Drop every element and reset the capacity to 8."""
        self._hsize = 8
        self._heap = [None] * self._hsize
        self._set_size(0)

    def is_empty(self):
        """Return True if the heap stores no elements."""
        return self.get_size() == 0

    def _empty_check(self):
        """Raise Exception when the heap is empty."""
        if self.is_empty():
            raise Exception("当前堆为空!")

    def _size_check(self):
        """Grow the backing array (about 1.5x) until it can hold ``get_size()`` items."""
        if self.get_size() <= self._hsize:
            return
        # max() covers tiny capacities where int(1.5 * n) would not grow at all.
        self._hsize = max(int(1.5 * self._hsize), self.get_size())
        self._heap.extend([None] * (self._hsize - len(self._heap)))

    def _is_comparable(self, v):
        """Raise Exception unless ``v`` is an int or float (bool is rejected)."""
        if isinstance(v, bool) or not isinstance(v, (int, float)):
            raise Exception(f"传入类型必须为int/float型,你传入的是:{type(v)}")

    def _compare(self, parent, children):
        """Return True if the element at index ``parent`` may sit above ``children``.

        For a max-heap that means "not smaller"; use ``<=`` for a min-heap.
        """
        return self._heap[parent] >= self._heap[children]

    def _swap(self, e1, e2):
        """Exchange the elements at indexes ``e1`` and ``e2``."""
        self._heap[e1], self._heap[e2] = self._heap[e2], self._heap[e1]

    def _set_size(self, size):
        """Set the number of stored elements."""
        self._size = size

    def get_size(self):
        """Return the number of stored elements."""
        return self._size

    def get_height(self):
        """Return the number of levels of the heap (0 when empty)."""
        # Equivalent to counting how often the size can be halved before reaching 0.
        return self.get_size().bit_length()

    @property
    def _root(self):
        """Index of the root element, always 0."""
        return 0

    def _get_parent(self, cur):
        """Return the parent index of ``cur``: floor((i - 1) / 2)."""
        return (cur - 1) // 2

    def _get_left(self, cur):
        """Return the left child index (2*i + 1), or None if there is none."""
        left = 2 * cur + 1
        return left if left < self.get_size() else None

    def _get_right(self, cur):
        """Return the right child index (2*i + 2), or None if there is none."""
        right = 2 * cur + 2
        return right if right < self.get_size() else None

    def __str__(self):
        """Render the stored elements in array order."""
        return str(self._heap[:self.get_size()])

    def graph(self):
        """Print the heap as a tree, one level per line, between two rules.

        Prints nothing for an empty heap.
        """
        size = self.get_size()
        if size == 0:
            return
        height = self.get_height()

        def get_cur_layer(layer):
            # Number of gap units in front of the first element of a level.
            return 2 ** (height - layer) - 1

        live = self._heap[:size]  # ignore unused slots behind the last element
        max_len = max(len(str(ele)) for ele in live)
        gap = " " * max_len
        rule = "-" * (get_cur_layer(1) * 5)
        layer = 1
        print(rule)
        print(gap * get_cur_layer(layer), end="")
        for i, ele in enumerate(live, start=1):
            print(f"{ele:>{max_len}}", end=gap * ((2 ** (height - layer + 1)) - 1))
            if 2 ** layer - 1 == i:
                # Last element of this level: finish the line, indent the next.
                print(gap * get_cur_layer(layer))
                layer += 1
                if height >= layer:
                    print(gap * get_cur_layer(layer), end="")
        if size != 2 ** height - 1:
            print()  # the bottom level is only partly filled: end its line
        print(rule)
if __name__ == '__main__':
    # Build a heap in bulk, then draw it after each kind of modification.
    h = MaxHeap([5, 7, 3, 8, 6, 12, 54, 3, 0, 9, 32, 41])
    print(h)
    h.graph()

    h.remove()
    h.graph()

    h.replace(10)
    h.graph()

    for value in (30, 100):
        h.add(value)
    h.graph()

    h.clear()
    for _ in range(2):
        print(h.is_empty())
    h.add(1)
    print(h.get_size())
    print(h)
Trie
又称字典树、前缀树,其搜索效率只和字符串的长度有关,因此能够十分高效地实现前缀搜索功能。其每个节点只存放对应长度位置的字符,例如存放单词 ["abc", "acc", "bca"],那么就按如下方式存储:
root
/ \
a b
/ \ |
b c c
| | |
c c a
此时假如搜索单词 abc,那么就会先搜索根下是否存在值为 a 的节点,如果存在则进入 a 节点,继续寻找其下是否存在值为 b 的节点,以此类推;期间只要有一次检索失败,就返回 False
简单实现
class Trie:
    """Prefix tree over non-empty strings.

    ``size`` is the number of complete words currently stored.
    """

    class Node:
        """Trie node.

        Attributes:
            value: the prefix spelled by the path from the root to this node
                (None for the root).
            parent: the parent node (None for the root).
            childrens: child nodes keyed by their single next character.
            word: True if ``value`` is a complete stored word.
        """

        def __init__(self, value, parent):
            self.value = value
            self.childrens = {}
            self.parent = parent
            self.word = False

        def get_childrens(self):
            """Return the dict of child nodes."""
            return self.childrens

        def __repr__(self):
            return f"{self.value}"

    def __init__(self):
        self.size = 0
        self.root = self.Node(None, None)

    def is_empty(self):
        """Return True if no word is stored."""
        return self.size == 0

    def clear(self):
        """Remove every word by dropping all children of the root."""
        self.size = 0
        self.root.get_childrens().clear()

    def _get_v(self, v):
        """Return the ``value`` of the node reached by ``v``."""
        return self._find(v).value

    def constains(self, v):
        """Return True if ``v`` was added as a complete word.

        Raises:
            Exception: if ``v`` is not a non-empty string.
        """
        node = self._find(v)
        return node is not None and node.word

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    contains = constains

    def _find(self, v):
        """Return the node reached by spelling ``v`` from the root, or None.

        Raises:
            Exception: if ``v`` is not a non-empty string.
        """
        self._v_check(v)
        node = self.root
        for s in v:
            node = node.get_childrens().get(s)
            if node is None:
                return None
        return node

    def _v_check(self, v):
        """Raise Exception unless ``v`` is a non-empty string."""
        if not (isinstance(v, str) and v != ""):
            raise Exception("传入的内容必须为非空字符串")

    def add(self, v):
        """Store the word ``v``, creating the missing nodes along its path.

        Raises:
            Exception: if ``v`` is not a non-empty string.
        """
        self._v_check(v)
        node = self.root
        for i, s in enumerate(v):
            children = node.get_childrens()
            child = children.get(s)
            if child is None:
                child = self.Node(v[:i + 1], node)
                children[s] = child
            node = child
        if not node.word:
            # Only a word that was not stored yet changes the word count.
            node.word = True
            self.size += 1

    def remove(self, v):
        """Delete the word ``v``; do nothing if it is not stored.

        The word's node is unmarked, then every node on the path that is
        neither a word nor has children left is pruned bottom-up.

        Raises:
            Exception: if ``v`` is not a non-empty string.
        """
        node = self._find(v)
        if node is None or not node.word:
            return
        node.word = False
        self.size -= 1
        while node.parent is not None and not node.word and not node.get_childrens():
            # Children are keyed by the single character that extends the
            # parent's prefix, i.e. the last character of this node's value.
            del node.parent.get_childrens()[node.value[-1]]
            node = node.parent

    def start_with(self, v):
        """Return True if any stored word starts with ``v``.

        Raises:
            Exception: if ``v`` is not a non-empty string.
        """
        return self._find(v) is not None

    def __str__(self):
        """Render the trie level by level, one line per depth."""
        pieces = []
        layer = 0
        queue = [(0, self.root)]
        # The list doubles as the BFS queue: entries appended while iterating
        # are visited too, in insertion order.
        for depth, node in queue:
            if depth > layer:
                pieces.append("\n")
                layer = depth
            for c, child in node.get_childrens().items():
                pieces.append(f'{c}(parent: {node} | word:{child.word})\t')
                queue.append((depth + 1, child))
        return "".join(pieces)
if __name__ == '__main__':
    # Fill the trie, query it, then check that removing "a" keeps longer words.
    t = Trie()
    for word in ("a", "gs", "acd", "ab", "cba", "aaaa", "bcd"):
        t.add(word)
    print(t)
    print(t.constains("a"))
    print(t.constains("ab"))
    print(t.start_with("ccc"))

    t.remove("a")
    print(t.start_with("ab"))
    for word in ("acd", "a"):
        print(t.constains(word))
图
有向图简单实现
class Graph:
    """Directed, weighted graph.

    Vertices live in a dict keyed by their value, edges in a set.  Two edges
    are the same edge when source, target and weight all match.
    """

    class Vertex:
        """A vertex: its value plus its incoming and outgoing edges."""

        def __init__(self, v):
            self.val = v
            self.in_edges = []
            self.out_edges = []

        def __str__(self):
            in_e = ["{}({})".format(e.fro, e.w) for e in self.in_edges]
            out_e = ["{}({})".format(e.to, e.w) for e in self.out_edges]
            return f"{in_e} -> {self.val} -> {out_e}".replace("'", "")

    class Edge:
        """A directed edge from ``fro`` to ``to`` with weight ``w``."""

        def __init__(self, fro, to, w=0):
            self.fro = fro
            self.to = to
            self.w = w

        def __eq__(self, edge):
            if not isinstance(edge, Graph.Edge):
                # Let Python fall back to its default instead of raising
                # AttributeError when compared with an unrelated object.
                return NotImplemented
            return (self.fro, self.to, self.w) == (edge.fro, edge.to, edge.w)

        def __hash__(self):
            return hash((self.fro, self.to, self.w))

    def __init__(self, vertexs=(), edges=()):
        """Create a graph from optional vertices and edges.

        Args:
            vertexs: iterable of vertex values.
            edges: iterable of ``(fro, to)`` or ``(fro, to, w)`` tuples.
        """
        self._vertexs = {}
        self._edges = set()
        for vertex in vertexs or ():
            self.add_vertex(vertex)
        for edge in edges or ():
            self.add_edge(*edge)

    @property
    def vertex_size(self):
        """Number of vertices."""
        return len(self._vertexs)

    @property
    def edge_size(self):
        """Number of edges."""
        return len(self._edges)

    def add_vertex(self, v):
        """Add vertex ``v`` unless it exists already; return its Vertex object."""
        if v not in self._vertexs:
            self._vertexs[v] = self.Vertex(v)
        return self._vertexs[v]

    def add_edge(self, fro, to, w=0):
        """Add the edge ``fro -> to`` with weight ``w``.

        Missing endpoints are created first.  An identical edge (same
        endpoints and weight) is not added twice.
        """
        edge = self.Edge(fro, to, w)
        if edge in self._edges:
            return
        # Register the edge on both endpoints as outgoing / incoming.
        self.add_vertex(fro).out_edges.append(edge)
        self.add_vertex(to).in_edges.append(edge)
        self._edges.add(edge)

    def remove_vertex(self, v):
        """Remove vertex ``v`` together with every edge touching it.

        Each outgoing edge is deleted from its target's incoming list and each
        incoming edge from its source's outgoing list.  Does nothing if ``v``
        is not a vertex.
        """
        rv = self._vertexs.pop(v, None)
        if rv is None:
            return
        for e in rv.out_edges:
            if e.to != v:  # a self-loop has no other endpoint left to clean up
                self._vertexs[e.to].in_edges.remove(e)
            self._edges.discard(e)
        for e in rv.in_edges:
            if e.fro != v:
                self._vertexs[e.fro].out_edges.remove(e)
            self._edges.discard(e)

    def remove_edge(self, fro, to, w=0):
        """Remove the edge ``fro -> to`` with weight ``w`` if it exists."""
        edge = self.Edge(fro, to, w)
        if edge not in self._edges:
            return
        # list.remove() compares with ==, so the probe edge finds the stored one.
        self._vertexs[fro].out_edges.remove(edge)
        self._vertexs[to].in_edges.remove(edge)
        self._edges.remove(edge)

    def graph(self):
        """Print every vertex with its incoming and outgoing edges."""
        print("-" * 100)
        for v in self._vertexs.values():
            print(v)
        print("-" * 100)
if __name__ == '__main__':
    # Build a small graph, then add and remove edges and vertices, drawing it in between.
    vertixs = (0, 1, 2, 3, 4, 5)
    edges = ((0, 1, 10), (0, 2, 3), (1, 5), (1, 3), (2, 5, 1))
    g = Graph(vertixs, edges)
    g.graph()

    for new_edge in ((1, 2, 3), (0, 1, 10), (1, 0, 10), (2, 8, 1)):
        g.add_edge(*new_edge)
    g.add_vertex(10)
    g.graph()

    for weight in (10, 9):
        g.remove_edge(0, 1, weight)
    g.graph()

    g.remove_vertex(0)
    g.graph()
    print(g.vertex_size, g.edge_size)
广度优先搜索
参考树的层序遍历
深度优先搜索
参考树的前序遍历
并查集
适合多个集合,并判断某个元素是否在某个集合的场景
快查型实现
每个元素都指向其根节点(相当于只有一层),所以查询很快(直接返回对应索引即可,O(1));合并时将所有根节点相同的都更新,所以更新时需要遍历元素(O(n))
class UnionFind(object):
    """Quick-find disjoint set: ``uset[e]`` holds the set id of element ``e``."""

    def __init__(self, l):
        # Elements are 0..l inclusive; each one starts out as its own set.
        self.uset = list(range(l + 1))

    def find(self, e):
        """Return the id of the set containing ``e``."""
        return self.uset[e]

    def union(self, e1, e2):
        """Merge the set of ``e1`` into the set of ``e2``."""
        old_id, new_id = self.find(e1), self.find(e2)
        if old_id != new_id:
            # Relabel every member of e1's set, which needs a full scan.
            for position in range(len(self.uset)):
                if self.uset[position] == old_id:
                    self.uset[position] = new_id

    def judge(self, e1, e2):
        """Return True if ``e1`` and ``e2`` are in the same set."""
        first, second = self.find(e1), self.find(e2)
        return first == second
if __name__ == "__main__":
    # Form three groups, query them, then join two groups and query again.
    uf = UnionFind(10)
    for left, right in ((1, 2), (1, 3), (1, 5), (6, 7), (6, 8), (4, 9), (4, 10)):
        uf.union(left, right)
    for left, right in ((4, 9), (10, 9), (1, 9)):
        print(uf.judge(left, right))

    uf.union(1, 9)
    for left, right in ((1, 9), (10, 9)):
        print(uf.judge(left, right))
    print(uf.uset)
快合并型
查询时直接找到元素的根节点(父元素为自身);合并时找到两个元素的根节点进行判断,如果根节点不同则仅将根节点进行替换,所以合并复杂度基于查询复杂度
class UnionFind(object):
    """Quick-union disjoint set: ``uset[e]`` holds the parent of element ``e``.

    A root is an element that is its own parent.
    """

    def __init__(self, l):
        # Elements are 0..l inclusive; each one starts out as its own root.
        self.uset = list(range(l + 1))

    def find(self, e):
        """Return the root of the tree containing ``e``."""
        current = e
        while True:
            parent = self.uset[current]
            if parent == current:
                return current
            current = parent

    def union(self, e1, e2):
        """Merge both sets by hanging the root of ``e2`` under the root of ``e1``."""
        root1, root2 = self.find(e1), self.find(e2)
        if root1 != root2:
            # Only the root's parent pointer changes.
            self.uset[root2] = root1

    def judge(self, e1, e2):
        """Return True if ``e1`` and ``e2`` share the same root."""
        first, second = self.find(e1), self.find(e2)
        return first == second
if __name__ == "__main__":
    # Form three groups, query them, then join two groups and query again.
    uf = UnionFind(10)
    for left, right in ((1, 2), (1, 3), (1, 5), (6, 7), (6, 8), (4, 9), (4, 10)):
        uf.union(left, right)
    for left, right in ((4, 9), (10, 9), (1, 9)):
        print(uf.judge(left, right))

    uf.union(1, 9)
    for left, right in ((1, 9), (10, 9)):
        print(uf.judge(left, right))
    print(uf.uset)
布隆过滤器
一个能够提供高效存储和查询的数据结构,其能够告诉你某个数据一定不存在或者可能存在
优缺点
优点
- 优化了空间上的利用率
- 能够保持高效的查询和添加效率
缺点
- 不能保证结果一定准确(返回 True 未必存在,但是返回 False 一定不存在)
适合场景
海量数据,并且允许有一定的容错率,例如大批量分布式爬虫。如果使用 hash 表进行存储,虽然时间复杂度为 O(1) 级别,但是 hash 表为了减少 hash 冲突,必然会有存储稀疏的特点,此时如果数据量过大,则可能导致内存不足的问题。而布隆过滤器则可以在一定程度上优化空间问题,通过设计一个没那么密集的数据结构来存储数据,并且保证了查询和添加的效率不会太低(取决于 hash 函数的数量以及 hash 函数的复杂度)
原理
存储方式
通过设计一个十分长的二进制字符,并假设有n个hash函数,那么对每个添加的数,都会算出几个hash函数对应的值,然后在二进制字符上将这些值的对应位置置为1
校验方式
检查是否存在时,就会检查这个添加的数的几个hash函数值位置是否都为1,是则存在,不是则不存在。由此可以得出:只要有一个位置不为1,肯定是不存在,但是全部为1也未必存在,因为可能别的算出来的hash函数值存在一样的,即hash冲突造成的位置为1
简单实现
import math
import random
from bitarray import bitarray
# 二进制数据类
class BloomFilter:
    """Bloom filter sized from an expected item count and a target error rate.

    Attributes:
        size: number of bits in the filter.
        bit: the bit array, initially all 0.
        hash_size: number of hash functions.
        hash_funs: the hash functions applied to every value.
    """

    def __init__(self, n, p):
        """Create a filter for ``n`` items with false-positive rate ``p``.

        Args:
            n: expected number of items, must be > 0.
            p: acceptable false-positive rate, strictly between 0 and 1.

        Raises:
            Exception: if ``n`` or ``p`` is out of range.
        """
        if n <= 0 or not (0 < p < 1):
            raise Exception("请保证数据规模大于0,且误判率在0~1之间")
        self.size, self.hash_size = self._optimal_params(n, p)
        self.bit = bitarray(self.size)
        self.bit.setall(0)
        self.hash_funs = self.get_hash_funs(self.hash_size)
        print(f"初始化完成!(数据规模:{n},容错率:{p*100}%,尺寸:{self.size},hash函数个数:{self.hash_size})")

    @staticmethod
    def _optimal_params(n, p):
        """Return ``(bit count, hash function count)`` for ``n`` items at rate ``p``.

        Uses m = -n*ln(p) / ln(2)^2 and k = m/n * ln(2), truncated to ints.
        Both are clamped to at least 1: a zero-bit filter would divide by
        zero, and with zero hash functions every lookup would report a hit.
        """
        size = max(1, int(-1 * (n * math.log(p)) / (math.log(2) ** 2)))
        hash_size = max(1, int(size / n * math.log(2)))
        return size, hash_size

    def add(self, v):
        """Add ``v`` and return the bit positions that were set.

        Raises:
            Exception: if ``v`` is None.
        """
        self.check(v)
        indexs = self.getIndex(v)
        for index in indexs:
            self.setOne(index)
        return indexs

    def contain(self, v):
        """Return True if ``v`` may be present, False if it is definitely absent.

        Raises:
            Exception: if ``v`` is None.
        """
        self.check(v)
        return all(self.bit[index] == 1 for index in self.getIndex(v))

    def get_hash_funs(self, n):
        """Return ``n`` hash functions.

        Function number i hashes the text of ``{"n": i, "value": value}``, so
        the same value gets a different hash from every function.
        NOTE(review): built on ``hash()`` of a str, which Python salts per
        process unless PYTHONHASHSEED is fixed, so bit positions are only
        meaningful within one process.
        """
        def hash_fun(n):
            def my_hash(value):
                s = str({"n": n, "value": value})
                return hash(s)
            return my_hash

        return [hash_fun(ni) for ni in range(n)]

    def check(self, v):
        """Raise Exception if ``v`` is None."""
        if v is None:
            raise Exception("传入数据不能为空")

    def getIndex(self, v):
        """Return the bit position produced by every hash function for ``v``."""
        return [hash_fun(v) % self.size for hash_fun in self.hash_funs]

    def setOne(self, index):
        """Set the bit at ``index`` to 1."""
        self.bit[index] = 1
def count_acc(n, rate):
    """Measure the false-positive rate of a BloomFilter and print it.

    ``2 * n`` distinct random integers are drawn; the first ``n`` are added to
    a filter built for ``n`` items at error rate ``rate`` and the remaining
    ``n``, which were never added, are looked up.  Every hit among them is a
    false positive.
    """
    bloom = BloomFilter(n, rate)
    upper = n * 100
    wanted = n * 2
    samples = set()
    while len(samples) < wanted:
        samples.add(random.randint(0, upper))
    samples = list(samples)
    inserted, unseen = samples[:n], samples[n:]
    for data in inserted:
        bloom.add(data)
    false_hits = sum([bloom.contain(data) for data in unseen])
    print(f"错误率:{false_hits / n * 100}%")


if __name__ == '__main__':
    count_acc(1000, 0.01)
    # Sample output:
    # 初始化完成!(数据规模:1000,容错率:1.0%,尺寸:9585,hash函数个数:6)
    # 错误率:0.8%
运行后会发现错误率基本在设定的值左右,如果结果偏差较大,可能是因为以下原因:
- hash函数设计的不够好(要尽可能的达到每个hash函数算出的值都不冲突)
- ...
其他
删除功能
布隆过滤器一般不提供删除功能,因为如果将指定位置的值置零,而别的数据的 hash 值也可能指向这里,就容易出问题。如果一定要实现,可以通过引用计数的方式(不是用 0/1 进行存储,而是每当有一个数据指向该位置就加 1,删除时减 1 即可),但是这样每个位置都变成 int 型,也就失去了空间小的优势了
一致性哈希
理论参考:https://www.jianshu.com/p/735a3d4789fc
class HashCircle:
    """Consistent-hashing ring with virtual nodes.

    Attributes:
        vnode_num: number of virtual nodes created per real node.
        sort_vnodes: hashes of all virtual nodes in ascending order.
        vnode_map: virtual-node hash -> real node.
    """

    def __init__(self, vnode_num=3):
        self.vnode_num = vnode_num  # previously hard-coded to 3, ignoring the argument
        self.sort_vnodes = []
        self.vnode_map = {}

    def add(self, node):
        """Place ``node`` on the ring through ``vnode_num`` virtual nodes.

        Adding a node that is already on the ring leaves the ring unchanged.

        Raises:
            AssertionError: if ``node`` is not a Node.
        """
        self.check_node(node)
        for i in range(self.vnode_num):
            vnode = self.get_hash(str(node) + str(i))
            if vnode not in self.vnode_map:
                # Only new hashes go onto the ring, so it never holds duplicates
                # that delete() could leave behind without a mapping.
                self.sort_vnodes.append(vnode)
            self.vnode_map[vnode] = node
        # The ring has to stay sorted for lookups.
        self.sort_vnodes.sort()

    def delete(self, node):
        """Take ``node`` and all of its virtual nodes off the ring.

        Raises:
            AssertionError: if ``node`` is not a Node.
        """
        self.check_node(node)
        doomed = {vn for vn, owner in self.vnode_map.items() if owner is node}
        if not doomed:
            return
        self.sort_vnodes = [vn for vn in self.sort_vnodes if vn not in doomed]
        for vn in doomed:
            del self.vnode_map[vn]

    @staticmethod
    def check_node(node):
        """Assert that ``node`` is a Node (or subclass) instance."""
        assert isinstance(node, Node), "必须传入Node类或其子类节点!"

    @staticmethod
    def get_hash(key):
        """Return the ring position of ``key``.

        NOTE(review): built on ``hash()``, which Python salts per process for
        str keys unless PYTHONHASHSEED is fixed, so positions are only stable
        within one process.
        """
        return hash(key)

    def find_node(self, key):
        """Return the real node responsible for ``key``, or None if the ring is empty.

        That is the owner of the first virtual node whose hash is greater than
        the key's hash, wrapping around to the first virtual node.
        """
        if not self.sort_vnodes:
            return None
        khash = self.get_hash(key)
        for vn in self.sort_vnodes:
            if vn > khash:
                return self.vnode_map[vn]
        return self.vnode_map[self.sort_vnodes[0]]

    def get_key(self, key):
        """Look up ``key`` on its node, print the lookup and return the value.

        Returns None if the ring is empty or the key is not stored.
        """
        knode = self.find_node(key)
        if not knode:
            return
        value = knode.get(key, None)
        print(f"查询key:{key},所在节点:{knode},value:{value}")
        return value

    def set_key(self, key, value):
        """Store ``value`` under ``key`` on its node; do nothing if the ring is empty."""
        knode = self.find_node(key)
        if not knode:
            return
        knode.set(key, value)


class Node:
    """Simulated storage node identified by its IP address."""

    def __init__(self, ip):
        self.ip = ip
        self.map = {}  # the key/value pairs held by this node

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        self.map[key] = value

    def get(self, key, default=None):
        """Return the value stored under ``key``, or ``default``."""
        return self.map.get(key, default)

    def __str__(self):
        return str(self.ip)
def test():
    """Demo: spread ten keys over three nodes, then drop one node and look them up again."""
    hc = HashCircle()

    # Create the three nodes and put them on the hash ring.
    node1 = Node("192.168.0.1")
    node2 = Node("192.168.0.2")
    node3 = Node("192.168.0.3")
    for node in (node1, node2, node3):
        hc.add(node)

    keys = [f"k{i}" for i in range(10)]

    # Store the data on whichever node each key maps to.
    for i, key in enumerate(keys):
        hc.set_key(key, i)

    # Show where every key ended up.
    for key in keys:
        hc.get_key(key)

    # Remove one node and show the lookups again.
    print("------------------------------------------")
    hc.delete(node3)
    for key in keys:
        hc.get_key(key)


if __name__ == "__main__":
    test()