I think a lot of new programmers like to use advanced data structures and advanced language features as a way of demonstrating their ability. I call it the lion-tamer syndrome. Such demonstrations are impressive, but unless they actually translate into real wins for the project, avoid them. — Glyn Williams
最近重新复习下常用算法和数据结构,下边是看该书大致记的笔记和代码,纯手敲,有兴趣的可以参考下,都是基于python3.5 class实现,所以也算是复习了一下OOP。虽然思想都是通用的,但是使用的语言也会影响到我们的思维,我会结合一下使用python的经验来稍微分析下各种数据结构和其操作的复杂度,以便灵活选用。实现一个数据结构后最好写一些单元测试用例,否则没人知道你写的究竟对不对。坑爹的是本书有很多代码错误甚至实现错误,调试花了我很多时间。同时你还会发现很多坑爹的网络算法教程文章代码直接拷贝根本不能用,没有单元测试证明算法正确性的都是扯淡。
1章:ADT抽象数据类型,定义数据和其操作
什么是ADT: 抽象数据类型,学过数据结构的应该都知道。
How to select datastructures for ADT
- Does the data structure provide for the storage requirements as specified by the domain of the ADT?
- Does the data structure provide the data access and manipulation functionality to fully implement the ADT?
- Efficient implementation? Based on complexity analysis.
下边代码是个简单的示例,比如实现一个简单的Bag类,先定义其具有的操作,然后我们再用类的magic method来实现这些方法:
class Bag:
    """A simple unordered container that allows duplicates (Bag ADT).

    Supported operations:
        Bag()         -- create an empty bag
        len(bag)      -- number of stored items
        item in bag   -- membership test
        add(item)     -- store one more item
        remove(item)  -- drop one occurrence of an item that is present
        iter(bag)     -- iterate over the stored items
    """

    def __init__(self):
        # A plain Python list holds the items in insertion order.
        self._items = []

    def __len__(self):
        return len(self._items)

    def __contains__(self, item):
        return item in self._items

    def add(self, item):
        """Store ``item`` in the bag."""
        self._items.append(item)

    def remove(self, item):
        """Remove one occurrence of ``item``; the item must be in the bag."""
        assert item in self._items, 'item must in the bag'
        self._items.remove(item)

    def __iter__(self):
        return _BagIterator(self._items)
class _BagIterator:
    """Iterator that walks a bag's underlying sequence by position."""

    def __init__(self, seq):
        self._bag_items = seq
        self._cur_item = 0  # index of the next item to hand out

    def __iter__(self):
        return self

    def __next__(self):
        # Guard clause: stop as soon as the cursor runs past the last item.
        if not self._cur_item < len(self._bag_items):
            raise StopIteration
        position = self._cur_item
        self._cur_item = position + 1
        return self._bag_items[position]
b = Bag()
for value in (1, 2):
    b.add(value)
for i in b:  # the for statement gets an iterator via __iter__ and advances it with __next__
    print(i)
"""
# for 语句等价于
i = b.__iter__()
while True:
try:
item = i.__next__()
print(item)
except StopIteration:
break
"""
2章:array vs list
array: 定长,操作有限,但是节省内存;貌似我的生涯中还没用过,不过python3.5中我试了确实有array类,可以用import array直接导入
list: 会预先分配内存,操作丰富,但是耗费内存。我用sys.getsizeof做了实验。我个人理解很类似C++ STL里的vector,是使用最频繁的数据结构。
- list.append: 如果之前没有分配够内存,会重新开辟新区域,然后复制之前的数据,复杂度退化
- list.insert: 会移动被插入区域后所有元素,O(n)
- list.pop: pop不同位置需要的复杂度不同,pop()弹出末尾元素是O(1)复杂度,pop(0)弹出首位元素需要移动其后所有元素,是O(n)复杂度
- list[]: slice操作copy数据(预留空间)到另一个list
来实现一个array的ADT:
import ctypes
class Array:
    """Fixed-size one-dimensional array built on a ctypes ``py_object`` array.

    Every slot starts out as ``None``. Indexing is bounds-checked with
    assertions, and iteration is delegated to ``_ArrayIterator``.
    """

    def __init__(self, size):
        assert size > 0, 'array size must be > 0'
        self._size = size
        # ``py_object * size`` creates the array type; calling it allocates storage.
        self._elements = (ctypes.py_object * size)()
        self.clear(None)

    def __len__(self):
        return self._size

    def __getitem__(self, index):
        assert 0 <= index < len(self), 'out of range'
        return self._elements[index]

    def __setitem__(self, index, value):
        assert 0 <= index < len(self), 'out of range'
        self._elements[index] = value

    def clear(self, value):
        """Set every element to ``value``."""
        for position in range(len(self)):
            self._elements[position] = value

    def __iter__(self):
        return _ArrayIterator(self._elements)
class _ArrayIterator:
    """Index-based iterator over the storage of an ``Array``.

    Args:
        items: any object that supports ``len()`` and integer indexing.
    """

    def __init__(self, items):
        self._items = items
        self._idx = 0  # position of the next element to return

    def __iter__(self):
        return self

    def __next__(self):
        # The original read and incremented a misspelled ``_idex`` attribute,
        # which raised AttributeError on the first call.
        if self._idx >= len(self._items):
            raise StopIteration
        val = self._items[self._idx]
        self._idx += 1
        return val
Two-Dimensional Arrays
class Array2D:
    """Two-dimensional array stored as an array of row arrays.

    Operations:
        Array2D(nrows, ncols)  -- constructor
        numRows                -- number of rows (property)
        numCols                -- number of columns (property; ``NumCols`` is
                                  kept as an alias for existing callers)
        clear(value)           -- set every cell to ``value``
        arr[i, j]              -- read a cell
        arr[i, j] = val        -- write a cell
    """

    def __init__(self, numrows, numcols):
        self._the_rows = Array(numrows)  # an array whose elements are row arrays
        for i in range(numrows):
            self._the_rows[i] = Array(numcols)

    @property
    def numRows(self):
        return len(self._the_rows)

    @property
    def numCols(self):
        return len(self._the_rows[0])

    # Original (inconsistently capitalised) spelling, kept for compatibility.
    NumCols = numCols

    def clear(self, value):
        """Set every cell to ``value``."""
        # The original iterated over range() and called .clear() on the int
        # index; look each row up by index instead.
        for r in range(self.numRows):
            self._the_rows[r].clear(value)

    def _check_index(self, ndx_tuple):
        """Validate a ``(row, col)`` subscript and return it as two ints."""
        assert len(ndx_tuple) == 2
        row, col = ndx_tuple[0], ndx_tuple[1]
        assert 0 <= row < self.numRows and 0 <= col < self.numCols
        return row, col

    def __getitem__(self, ndx_tuple):  # ndx_tuple: (row, col)
        row, col = self._check_index(ndx_tuple)
        return self._the_rows[row][col]

    def __setitem__(self, ndx_tuple, value):
        row, col = self._check_index(ndx_tuple)
        self._the_rows[row][col] = value
The Matrix ADT, m行,n列。这个最好用还是用pandas处理矩阵,自己实现比较*疼
class Matrix:
    """Numeric m-by-n matrix built on ``Array2D``.

    For real work prefer a numeric library; this is a teaching implementation.

    Implemented operations:
        Matrix(nrows, ncols)   -- constructor, all cells start at 0
        numRows / numCols      -- dimensions (properties; ``NumCols`` is kept
                                  as an alias)
        m[row, col]            -- read a cell
        m[row, col] = val      -- write a cell
        scaleBy(scalar)        -- multiply every cell by ``scalar`` in place
        a + b                  -- element-wise sum, sizes must match

    The ADT also lists transpose(), subtract() and multiply(); they are not
    implemented here.
    """

    def __init__(self, numRows, numCols):
        self._theGrid = Array2D(numRows, numCols)
        # Zero every cell through item assignment so that initialisation
        # relies only on the grid's indexing support.
        for r in range(numRows):
            for c in range(numCols):
                self._theGrid[r, c] = 0

    @property
    def numRows(self):
        # The grid exposes its dimensions as properties; the original called
        # them and wrapped the result in len(), which raised TypeError.
        return self._theGrid.numRows

    @property
    def numCols(self):
        return self._theGrid.NumCols

    # Original spelling, kept for compatibility with existing callers.
    NumCols = numCols

    def __getitem__(self, ndxTuple):
        return self._theGrid[ndxTuple[0], ndxTuple[1]]

    def __setitem__(self, ndxTuple, scalar):
        self._theGrid[ndxTuple[0], ndxTuple[1]] = scalar

    def scaleBy(self, scalar):
        """Multiply every element by ``scalar`` in place."""
        for r in range(self.numRows):
            for c in range(self.numCols):
                self[r, c] *= scalar

    def __add__(self, rhsMatrix):
        """Return a new matrix holding the element-wise sum."""
        assert (rhsMatrix.numRows == self.numRows and
                rhsMatrix.numCols == self.numCols)
        newMatrix = Matrix(self.numRows, self.numCols)
        for r in range(self.numRows):
            for c in range(self.numCols):
                newMatrix[r, c] = self[r, c] + rhsMatrix[r, c]
        return newMatrix  # the original built the result but never returned it
3章:Sets and Maps
除了list之外,最常用的应该就是python内置的set和dict了。
sets ADT
A set is a container that stores a collection of unique values over a given comparable domain in which the stored values have no particular ordering.
class Set:
    """Set ADT implemented on top of a Python list.

    Operations:
        Set()               -- create an empty set
        len(s)              -- number of elements
        element in s        -- membership test
        add(element)        -- insert if not already present
        remove(element)     -- delete an element that is present
        s == other          -- same elements, in any order
        isSubsetOf(setB)    -- True if every element is also in setB
        union(setB)         -- new set with the elements of both
        intersect(setB)     -- new set with the elements common to both
        difference(setB)    -- new set with elements of s that are not in setB
        iter(s)             -- iterate over the elements
    """

    def __init__(self):
        self._theElements = list()

    def __len__(self):
        return len(self._theElements)

    def __contains__(self, element):
        return element in self._theElements

    def __iter__(self):
        # Required by isSubsetOf/union/__eq__, which loop over a set; without
        # it ``for element in self`` raised TypeError.
        return iter(self._theElements)

    def add(self, element):
        if element not in self:
            self._theElements.append(element)

    def remove(self, element):
        assert element in self, 'The element must be set'
        self._theElements.remove(element)

    def __eq__(self, setB):
        # Equal sizes plus the subset relation imply equality.
        if len(self) != len(setB):
            return False
        return self.isSubsetOf(setB)

    def isSubsetOf(self, setB):
        return all(element in setB for element in self)

    def union(self, setB):
        newSet = Set()
        newSet._theElements.extend(self._theElements)
        for element in setB:
            if element not in self:
                newSet._theElements.append(element)
        return newSet

    def intersect(self, setB):
        newSet = Set()
        newSet._theElements = [e for e in self._theElements if e in setB]
        return newSet

    def difference(self, setB):
        newSet = Set()
        newSet._theElements = [e for e in self._theElements if e not in setB]
        return newSet
Maps or Dict: 键值对,python内部采用hash实现。
class Map:
    """Map ADT implemented with a list of key/value entries.

    Operations:
        Map()             -- create an empty map
        len(m)            -- number of entries
        key in m          -- True if the key is stored
        add(key, value)   -- insert or overwrite; returns True when the key is new
        remove(key)       -- delete an existing key
        valueOf(key)      -- value stored under an existing key
        iter(m)           -- iterate via ``_MapIterator``
    """

    def __init__(self):
        self._entryList = []

    def __len__(self):
        return len(self._entryList)

    def __contains__(self, key):
        return self._findPosition(key) is not None

    def add(self, key, value):
        position = self._findPosition(key)
        if position is None:
            # Unknown key: append a fresh entry.
            self._entryList.append(_MapEntry(key, value))
            return True
        # Known key: overwrite the stored value in place.
        self._entryList[position].value = value
        return False

    def valueOf(self, key):
        position = self._findPosition(key)
        assert position is not None, 'Invalid map key'
        return self._entryList[position].value

    def remove(self, key):
        position = self._findPosition(key)
        assert position is not None, 'Invalid map key'
        del self._entryList[position]

    def __iter__(self):
        return _MapIterator(self._entryList)

    def _findPosition(self, key):
        """Return the list index of ``key``, or None when it is not stored."""
        for position, entry in enumerate(self._entryList):
            if entry.key == key:
                return position
        return None
class _MapEntry:
    """Mutable key/value record stored by the list-based map.

    A ``collections.namedtuple`` is not a drop-in replacement, because the
    map overwrites ``value`` in place.
    """

    def __init__(self, key, value):
        self.key, self.value = key, value
The multiArray ADT, 多维数组,一般是使用一个一维数组模拟,然后通过计算下标获取元素
class MultiArray:
    """N-dimensional array stored in one 1-D array, in row-major order.

    Operations:
        MultiArray(d1, d2, ..., dn)  -- constructor, n >= 2
        numDims                      -- number of dimensions (property)
        length(dim)                  -- size of dimension ``dim`` (1-based)
        clear(value)                 -- set every element to ``value``
        arr[i1, i2, ..., in]         -- read / write one element

    Index equation:
        index(i1, ..., in) = i1*f1 + i2*f2 + ... + i(n-1)*f(n-1) + in*1
    where f_j is the product of the sizes of all dimensions after j,
    e.g. for three dimensions index(i1, i2, i3) = i1*(d2*d3) + i2*d3 + i3.
    """

    def __init__(self, *dimensions):
        assert len(dimensions) > 1, 'The array must have 2 or more dimensions'
        self._dims = dimensions
        # Total number of elements is the product of all dimension sizes.
        size = 1
        for d in dimensions:
            assert d > 0, 'Dimensions must be > 0'
            size *= d
        # 1-D storage for the elements and for the index-equation factors.
        self._elements = Array(size)
        self._factors = Array(len(dimensions))
        self._computeFactors()

    @property
    def numDims(self):
        return len(self._dims)

    def length(self, dim):
        """Return the size of the 1-based dimension ``dim``."""
        # The original used ``dim < len(...)``, which rejected the last dimension.
        assert 1 <= dim <= len(self._dims), 'Dimension component out of range'
        return self._dims[dim - 1]

    def clear(self, value):
        self._elements.clear(value)

    def __getitem__(self, ndxTuple):
        assert len(ndxTuple) == self.numDims, 'Invalid # of array subscripts'
        index = self._computeIndex(ndxTuple)
        assert index is not None, 'Array subscript out of range'
        return self._elements[index]

    def __setitem__(self, ndxTuple, value):
        assert len(ndxTuple) == self.numDims, 'Invalid # of array subscripts'
        index = self._computeIndex(ndxTuple)
        assert index is not None, 'Array subscript out of range'
        self._elements[index] = value

    def _computeIndex(self, ndxTuple):
        """Map an n-tuple of subscripts to a 1-D offset, or None if out of range."""
        offset = 0
        for j in range(len(ndxTuple)):
            if ndxTuple[j] < 0 or ndxTuple[j] >= self._dims[j]:
                return None
            offset += ndxTuple[j] * self._factors[j]  # was the typo ``ndexTuple``
        return offset

    def _computeFactors(self):
        """Fill ``_factors`` with the row-major multiplier of each dimension.

        This helper was called by the constructor but never defined.
        """
        factor = 1
        # Walk from the last dimension (factor 1) back to the first.
        for j in range(len(self._dims) - 1, -1, -1):
            self._factors[j] = factor
            factor *= self._dims[j]
4章:Algorithm Analysis
一般使用大O标记法来衡量算法的平均时间复杂度, 1 < log(n) < n < nlog(n) < n^2 < n^3 < a^n。
了解常用数据结构操作的平均时间复杂度有利于使用更高效的数据结构,当然有时候需要在时间和空间上进行衡量,有些操作甚至还会退化,比如list的append操作,如果list空间不够,会去开辟新的空间,操作复杂度退化到O(n),有时候还需要使用均摊分析(amortized)
5章:Searching and Sorting
排序和查找是最基础和频繁的操作,python内置了in操作符和bisect二分操作模块实现查找,内置了sorted方法来实现排序操作。二分和快排也是面试中经常考到的,本章讲的是基本的排序和查找。
def binary_search(sorted_seq, val):
    """Return the leftmost insertion point for ``val``, like ``bisect.bisect_left``.

    Args:
        sorted_seq: a sequence sorted in ascending order.
        val: the value to locate.

    Returns:
        The smallest index ``i`` such that every element before ``i`` is
        less than ``val``. If ``val`` is present this is the index of its
        first occurrence; otherwise it is where ``val`` would be inserted
        to keep the sequence sorted.
    """
    # Half-open search window [low, high). The original returned ``mid`` on
    # the first equal element it met, which is not the leftmost one when the
    # sequence contains duplicates.
    low, high = 0, len(sorted_seq)
    while low < high:
        mid = (low + high) // 2
        if sorted_seq[mid] < val:
            low = mid + 1
        else:
            high = mid
    return low
def bubble_sort(seq):
    """Sort ``seq`` in place in ascending order with bubble sort.

    Worst case O(n^2): n(n-1)/2 comparisons. Returns None.
    """
    n = len(seq)
    for i in range(n - 1):
        swapped = False
        # After pass i the largest i elements are already at the end, so only
        # the first n-1-i adjacent pairs need comparing. The original used
        # range(i+n-1), which ran past the end of the sequence.
        for j in range(n - 1 - i):
            if seq[j] > seq[j + 1]:
                seq[j], seq[j + 1] = seq[j + 1], seq[j]
                swapped = True
        # A pass without swaps means the sequence is already sorted.
        if not swapped:
            break
def select_sort(seq):
    """Sort ``seq`` in place in ascending order with selection sort.

    Each pass finds the smallest remaining element and swaps it into place,
    so at most one swap happens per pass. Returns None.
    """
    n = len(seq)
    for i in range(n - 1):
        min_idx = i  # assume the i-th element is the smallest so far
        for j in range(i + 1, n):
            if seq[j] < seq[min_idx]:
                min_idx = j
        if min_idx != i:
            # Swap; the original only assigned seq[i] = seq[min_idx], which
            # overwrote seq[i] and duplicated the minimum.
            seq[i], seq[min_idx] = seq[min_idx], seq[i]
def insertion_sort(seq):
    """Sort ``seq`` in place by growing a sorted prefix one element at a time.

    The prefix starts as the first element alone; each later element is
    inserted at its proper place within the prefix.
    """
    for boundary in range(1, len(seq)):
        current = seq[boundary]  # the element to position
        hole = boundary
        # Shift larger prefix elements one step right until the hole sits
        # where ``current`` belongs.
        while hole > 0 and current < seq[hole - 1]:
            seq[hole] = seq[hole - 1]
            hole -= 1
        seq[hole] = current
def merge_sorted_list(listA, listB):
    """Merge two ascending sequences into one new ascending list."""
    merged = []
    i = j = 0
    size_a, size_b = len(listA), len(listB)
    # Take the smaller head element until one input is exhausted.
    while i < size_a and j < size_b:
        if listA[i] < listB[j]:
            merged.append(listA[i])
            i += 1
        else:
            merged.append(listB[j])
            j += 1
    # At most one of the two tails still has elements; copy what is left.
    merged.extend(listA[k] for k in range(i, size_a))
    merged.extend(listB[k] for k in range(j, size_b))
    return merged
6章:Linked Structure
list是最常用的数据结构,但是list在中间增减元素的时候效率会很低,这时候linked list会更适合,缺点就是获取元素的平均时间复杂度变成了O(n)
# Singly linked list implementation
class ListNode:
    """One node of a singly linked list: a payload plus a link to the next node."""

    def __init__(self, data):
        self.next = None  # no successor until the node is linked in
        self.data = data
def travsersal(head, callback):
    """Call ``callback(node.data)`` for every node, from ``head`` to the end."""
    node = head
    while True:
        if node is None:
            return
        callback(node.data)
        node = node.next
def unorderdSearch(head, target):
    """Return True if some node of the unsorted list holds ``target``."""
    node = head
    while node is not None:
        if node.data != target:
            node = node.next
            continue
        return True
    return False
# Given the head pointer, prepend an item to an unsorted linked list.
def prepend(head, item):
    """Insert ``item`` in front of ``head`` and return the new head node.

    Rebinding the ``head`` parameter inside the function is invisible to the
    caller, so the new head is returned; use ``head = prepend(head, item)``.
    The original returned nothing, which made the new node unreachable.
    """
    newNode = ListNode(item)
    newNode.next = head
    return newNode
# Given the head reference, remove a target from a linked list
def remove(head, target):
    """Unlink the first node holding ``target`` and return the list's head.

    Args:
        head: first node of the list, or None for an empty list.
        target: the value to remove.

    Returns:
        The head of the list after removal. It differs from ``head`` only
        when the first node was removed; the list is unchanged when
        ``target`` is absent. Use ``head = remove(head, target)``.
    """
    predNode = None
    curNode = head
    # Look for the target, remembering the predecessor for relinking.
    while curNode is not None and curNode.data != target:
        predNode = curNode
        curNode = curNode.next  # the original advanced with ``curNode.data``
    if curNode is not None:
        if curNode is head:
            head = curNode.next
        else:
            predNode.next = curNode.next
    return head
7章:Stacks
栈也是计算机里用得比较多的数据结构,栈是一种后进先出的数据结构,可以理解为往一个桶里放盘子,先放进去的会被压在底下,拿盘子的时候,后放的会被先拿出来。
class Stack:
    """Stack ADT (last in, first out) backed by a Python list.

    Operations:
        Stack()      -- create an empty stack
        isEmpty()    -- True when the stack holds no items
        len(stack)   -- number of items
        push(item)   -- put an item on top
        pop()        -- remove and return the top item; stack must not be empty
        peek()       -- return the top item without removing it; stack must
                        not be empty
    """

    def __init__(self):
        self._items = []  # the end of the list is the top of the stack

    def __len__(self):
        return len(self._items)

    def isEmpty(self):
        return not len(self)

    def push(self, item):
        self._items.append(item)

    def peek(self):
        assert not self.isEmpty()
        top_index = len(self._items) - 1
        return self._items[top_index]

    def pop(self):
        assert not self.isEmpty()
        return self._items.pop()
class Stack:
    """Stack ADT backed by a singly linked list.

    A list-based stack is simpler, but when the list runs out of reserved
    space a push degrades to O(n). Linking nodes keeps push and pop at O(1)
    even in the worst case.

    Operations: isEmpty(), len(stack), peek(), pop(), push(item).
    """

    def __init__(self):
        self._top = None   # top node (_StackNode) or None when empty
        self._size = 0     # number of stored items

    def isEmpty(self):
        return self._top is None

    def __len__(self):
        return self._size

    def peek(self):
        """Return the top item without removing it; stack must not be empty."""
        assert not self.isEmpty()
        return self._top.item

    def pop(self):
        """Remove and return the top item; stack must not be empty."""
        assert not self.isEmpty()
        node = self._top
        # The original assigned ``self.top``, so the top pointer never moved.
        self._top = node.next
        self._size -= 1
        return node.item

    def push(self, item):
        """Put ``item`` on top of the stack."""
        self._top = _StackNode(item, self._top)
        self._size += 1

    # The method was originally spelled ``_push``; keep that name working.
    _push = push


class _StackNode:
    """Linked-list node used by the linked Stack: an item plus the node below."""

    def __init__(self, item, link):
        self.item = item
        self.next = link
8章:Queues
队列也是经常使用的数据结构,比如发送消息等,celery可以使用redis提供的list实现消息队列。
本章我们用list和linked list来实现队列和优先级队列。
class Queue:
    """Queue ADT (first in, first out) backed by a Python list.

    Simple, but the worst case of enqueue and dequeue is O(n): ``pop(0)``
    shifts every remaining element.

    Operations:
        Queue()         -- create an empty queue
        isEmpty()       -- True when the queue holds no items
        len(queue)      -- number of items
        enqueue(item)   -- add an item at the back
        dequeue()       -- remove and return the front item; must not be empty
    """

    def __init__(self):
        self._qList = list()

    def isEmpty(self):
        return len(self) == 0

    def __len__(self):
        return len(self._qList)

    def enqueue(self, item):
        self._qList.append(item)

    # The method was originally misspelled ``enquue``; keep that name working.
    enquue = enqueue

    def dequeue(self):
        assert not self.isEmpty()
        return self._qList.pop(0)
from array import Array # Array那一章实现的Array ADT
class Queue:
    """Bounded queue implemented as a circular array with front/back indices.

    Both enqueue and dequeue are O(1) because the indices wrap around the
    array instead of shifting elements. The price is a fixed capacity.

    Args:
        maxSize: the maximum number of items the queue can hold.
    """

    def __init__(self, maxSize):
        self._count = 0               # number of stored items
        self._front = 0               # index of the item to dequeue next
        self._back = maxSize - 1      # index of the most recently enqueued item
        self._qArray = Array(maxSize)

    def isEmpty(self):
        return self._count == 0

    def isFull(self):
        return self._count == len(self._qArray)

    def __len__(self):
        # ``_count`` is already an int; the original wrapped it in len().
        return self._count

    def enqueue(self, item):
        """Add ``item`` at the back; the queue must not be full."""
        assert not self.isFull()
        maxSize = len(self._qArray)
        self._back = (self._back + 1) % maxSize  # advance the back index with wrap-around
        self._qArray[self._back] = item
        self._count += 1

    def dequeue(self):
        """Remove and return the front item; the queue must not be empty."""
        # The original asserted ``not isFull()``, which let an empty queue
        # through and rejected a full one.
        assert not self.isEmpty()
        item = self._qArray[self._front]
        maxSize = len(self._qArray)
        self._front = (self._front + 1) % maxSize
        self._count -= 1
        return item
class _QueueNode:
    """Linked-list node used by the linked Queue."""

    def __init__(self, item):
        self.item = item
        # Every node needs a ``next`` link; without it, dequeuing the last
        # node raised AttributeError.
        self.next = None


class Queue:
    """Unbounded queue implemented as a linked list with head and tail links.

    This removes the fixed capacity of the circular-array queue while
    keeping enqueue and dequeue at O(1).
    """

    def __init__(self):
        self._qhead = None   # node to dequeue next
        self._qtail = None   # most recently enqueued node
        self._qsize = 0      # number of stored items

    def isEmpty(self):
        return self._qhead is None

    def __len__(self):
        # The counter was spelled ``_qsize``, ``_count`` and ``_qcount`` in
        # different methods; ``_qsize`` is used throughout now.
        return self._qsize

    def enqueue(self, item):
        """Add ``item`` at the back of the queue."""
        node = _QueueNode(item)
        if self.isEmpty():
            self._qhead = node
        else:
            self._qtail.next = node  # link the old tail to the new node
        self._qtail = node
        self._qsize += 1

    def dequeue(self):
        """Remove and return the front item; the queue must not be empty."""
        assert not self.isEmpty(), 'Can not dequeue from an empty queue'
        node = self._qhead
        if self._qhead is self._qtail:
            self._qtail = None  # removing the only node empties the queue
        self._qhead = self._qhead.next  # move the head forward
        self._qsize -= 1
        return node.item
class UnboundedPriorityQueue:
    """Priority queue with unrestricted priorities, backed by a Python list.

    Each item carries a priority p; a smaller number means a higher priority
    and is dequeued first. Items of equal priority leave in FIFO order.

    There are two kinds of priority queue:
      - bounded: priorities are limited to an interval [0, p)
      - unbounded: priorities are not limited (this class)

    Operations:
        isEmpty(), len(q)
        enqueue(item, priority)
        dequeue()  -- remove and return the highest-priority item
                      (``deque`` is kept as an alias)

    Two implementation strategies exist:
      1. Append on enqueue and search for the best entry on dequeue, O(n).
      2. Keep the list sorted on enqueue so that dequeue is O(1).
    This class uses strategy 1.
    """
    from collections import namedtuple
    _PriorityQEntry = namedtuple('_PriorityQEntry', 'item, priority')

    def __init__(self):
        self._qlist = list()

    def isEmpty(self):
        return len(self) == 0

    def __len__(self):
        return len(self._qlist)

    def enqueue(self, item, priority):
        entry = UnboundedPriorityQueue._PriorityQEntry(item, priority)
        self._qlist.append(entry)

    def dequeue(self):
        """Remove and return the item with the smallest priority number."""
        assert not self.isEmpty(), 'can not deque from an empty queue'
        # Find the INDEX of the best entry. The original tracked the priority
        # value and then used it as a list index. min() returns the first
        # minimum, which preserves FIFO order among equal priorities.
        best = min(range(len(self._qlist)),
                   key=lambda i: self._qlist[i].priority)
        return self._qlist.pop(best).item

    # Original method name, kept for existing callers.
    deque = dequeue
class BoundedPriorityQueue:
    """Bounded priority queue: an array holding one FIFO queue per priority.

    Priorities are limited to [0, numLevels - 1]; a smaller number means a
    higher priority and is dequeued first. An unbounded priority queue must
    search all entries on dequeue, which is O(n). Here each priority level
    has its own queue, so dequeue only scans for the first non-empty level,
    trading space for time.

        qlist
        [0] -> ["white"]
        [1]
        [2] -> ["black", "green"]
        [3] -> ["purple", "yellow"]

    Args:
        numLevels: number of distinct priority levels.
    """
    # NOTE(review): a name imported in the class body is not visible as a
    # bare name inside the methods, so ``Array`` and ``Queue`` used below are
    # resolved at module level. The line is kept as in the original.
    from array import Array

    def __init__(self, numLevels):
        self._qSize = 0
        self._qLevels = Array(numLevels)
        for i in range(numLevels):
            self._qLevels[i] = Queue()  # one FIFO queue per priority level

    def isEmpty(self):
        return len(self) == 0

    def __len__(self):
        # ``_qSize`` is already an int; the original wrapped it in len().
        return self._qSize

    def enqueue(self, item, priority):
        """Add ``item`` to the queue of the given priority level."""
        assert priority >= 0 and priority < len(self._qLevels), 'invalid priority'
        # The original misspelled both the attribute (``_qLevel``) and the
        # method (``enquue``), and never counted the new item.
        self._qLevels[priority].enqueue(item)
        self._qSize += 1

    def dequeue(self):
        """Remove and return the oldest item of the highest non-empty priority."""
        assert not self.isEmpty(), 'can not deque from an empty queue'
        i = 0
        p = len(self._qLevels)
        # Skip EMPTY levels to reach the first non-empty one; the original
        # loop condition was inverted and skipped the non-empty levels.
        while i < p and self._qLevels[i].isEmpty():
            i += 1
        self._qSize -= 1
        return self._qLevels[i].dequeue()

    # Original method name, kept for existing callers.
    deque = dequeue
9章:Advanced Linked Lists
之前曾经介绍过单链表,一个链表节点只有data和next字段,本章介绍高级的链表。
Doubly Linked List,双链表,每个节点多了个prev指向前一个节点。双链表可以用来编写文本编辑器的buffer。
class DListNode:
    """Node of a doubly linked list: payload plus links in both directions."""

    def __init__(self, data):
        # A fresh node is not linked to any neighbour yet.
        self.prev = self.next = None
        self.data = data
def revTraversa(tail):
    """Print the data of every node of a doubly linked list, from tail to head."""
    curNode = tail
    while curNode is not None:  # the original tested the misspelled ``cruNode``
        print(curNode.data)
        curNode = curNode.prev
def search_sorted_doubly_linked_list(head, tail, probe, target):
    """Search a sorted doubly linked list, starting from a probe node.

    The probing technique starts at ``probe`` instead of the head and walks
    towards the target in whichever direction it lies. The worst case is
    still O(n).

    Args:
        head: first node of the list, or None for an empty list.
        tail: last node of the list (not used by the search itself).
        probe: node to start from; None means start at ``head``.
        target: the data value to look for.

    Returns:
        True if a node holding ``target`` is found, otherwise False.
    """
    if head is None:  # empty list
        return False
    if probe is None:  # no probe yet: start from the first node
        probe = head
    if target < probe.data:
        # The target lies before the probe: walk backward.
        while probe is not None and target <= probe.data:
            if target == probe.data:  # the original read the typo ``probe.dta``
                return True
            probe = probe.prev
    else:
        # The target is at or after the probe: walk forward.
        while probe is not None and target >= probe.data:
            if target == probe.data:
                return True
            probe = probe.next
    return False
def insert_node_into_ordered_doubly_linekd_list(value, head=None, tail=None):
    """Insert ``value`` into a sorted doubly linked list and keep it sorted.

    Drawing the list helps: the order of the link assignments matters.

    Args:
        value: the data to insert.
        head: first node of the list, or None for an empty list.
        tail: last node of the list; located by walking from ``head`` when
            omitted.

    Returns:
        A ``(head, tail)`` tuple describing the list after insertion. The
        original assigned to undefined local names and could not run, so the
        list ends are now passed in and handed back explicitly.
    """
    newnode = DListNode(value)
    if head is None:  # empty list: the new node is both ends
        return newnode, newnode
    if tail is None:
        tail = head
        while tail.next is not None:
            tail = tail.next
    if value <= head.data:  # insert before head
        newnode.next = head
        head.prev = newnode
        head = newnode
    elif value >= tail.data:  # insert after tail
        newnode.prev = tail
        tail.next = newnode
        tail = newnode
    else:  # insert into the middle
        # Here head.data < value < tail.data, so the loop stops on a node
        # that has a predecessor. The original's strict comparisons sent a
        # value equal to head.data here, where ``node.prev`` is None.
        node = head
        while node.data < value:
            node = node.next
        newnode.next = node
        newnode.prev = node.prev
        node.prev.next = newnode
        node.prev = newnode
    return head, tail
循环链表
def travrseCircularList(listRef):
    """Print every node of a circular linked list exactly once.

    Args:
        listRef: a node of the circular list, or None for an empty list.
            Printing starts at ``listRef.next`` and ends with ``listRef``.
    """
    curNode = listRef
    done = listRef is None
    # The original looped on ``while not None``, which is always true: it
    # never terminated and crashed on an empty list.
    while not done:
        curNode = curNode.next
        print(curNode.data)
        done = curNode is listRef  # back at the starting point
def searchCircularList(listRef, target):
    """Return True if the sorted circular list contains ``target``.

    The scan starts at ``listRef.next`` and gives up once it has come back
    to ``listRef`` or has passed a value greater than ``target``.
    """
    if listRef is None:
        return False
    node = listRef
    while True:
        node = node.next
        if node.data == target:
            return True
        if node is listRef or node.data > target:
            return False
def add_newnode_into_ordered_circular_linked_list(listRef, value):
    """Insert ``value`` into a sorted circular linked list, keeping it sorted.

    Four cases: empty list, insert at the front, insert at the back, insert
    in the middle.

    Args:
        listRef: reference to the LAST node of the circular list (so that
            ``listRef.next`` is the first node), or None for an empty list.
        value: the data to insert.

    Returns:
        The reference to the last node after insertion. Rebinding the
        parameter is invisible to the caller, so use
        ``listRef = add_newnode_into_ordered_circular_linked_list(listRef, value)``.
    """
    newnode = ListNode(value)
    if listRef is None:  # empty list: the node links to itself
        newnode.next = newnode
        return newnode
    if value < listRef.next.data:  # insert in front
        newnode.next = listRef.next
        listRef.next = newnode
        return listRef
    if value > listRef.data:  # insert at the back: the new node becomes last
        newnode.next = listRef.next
        listRef.next = newnode
        return newnode
    # Insert in the middle: stop at the first node greater than value, or at
    # the last node. The original assigned ``preNode = curNode.next``, so the
    # cursor never advanced.
    preNode = listRef
    curNode = listRef.next
    while curNode is not listRef and not curNode.data > value:
        preNode = curNode
        curNode = curNode.next
    newnode.next = curNode
    preNode.next = newnode
    return listRef
10章:Recursion
Recursion is a process for solving problems by subdividing a larger problem into smaller cases of the problem itself and then solving the smaller, more trivial parts.
递归函数:调用自己的函数
# A recursive function is a function that calls itself. The simplest example
# counts down from n to 1.
def printRev(n):
    """Print n, n-1, ..., 1, one number per line."""
    if not n > 0:
        return  # base case: nothing left to print
    print(n)
    printRev(n - 1)
printRev(3) # prints 3, 2, 1 (counts down from the argument to 1)
# A small change -- printing after the recursive call -- gives ascending order.
def printInOrder(n):
    """Print 1, 2, ..., n, one number per line."""
    if not n > 0:
        return
    printInOrder(n - 1)
    # The smallest value prints first: the calls recurse down to n == 1
    # before any print runs, and each return to an outer call then prints
    # the next larger value.
    print(n)
printInOrder(3) # prints 1, 2, 3 in ascending order
Properties of Recursion: 使用stack解决的问题都能用递归解决
- A recursive solution must contain a base case; 递归出口,代表最小子问题(n == 0退出打印)
- A recursive solution must contain a recursive case; 可以分解的子问题
- A recursive solution must make progress toward the base case. 递减n使得n向递归出口靠近
Tail Recursion: occurs when a function includes a single recursive call as the last statement of the function. In this case, a stack is not needed to store values to be used upon the return of the recursive call and thus a solution can be implemented using an iterative loop instead.
# Recursive Binary Search
def recBinarySearch(target, theSeq, first, last):
    """Return True if ``target`` occurs in the sorted ``theSeq[first..last]``.

    Both ``first`` and ``last`` are inclusive indices. Writing unit tests for
    this function is a good way to check it.
    """
    if first > last:  # base case 1: empty range
        return False
    mid = (first + last) // 2
    candidate = theSeq[mid]
    if candidate == target:  # base case 2: found
        return True
    # Narrow the range to the half that can still contain the target.
    if candidate > target:
        last = mid - 1
    else:
        first = mid + 1
    return recBinarySearch(target, theSeq, first, last)
11章:Hash Tables
基于比较的搜索(线性搜索,有序数组的二分搜索)最好的时间复杂度只能达到O(logn),利用hash可以实现O(1)查找,python内置dict的实现方式就是hash,你会发现dict的key必须要是实现了hash和eq方法的。
Hashing: hashing is the process of mapping a search key to a limited range of array indices with the goal of providing direct access to the keys.
hash方法有个hash函数用来给key计算一个hash值,作为数组下标,放到该下标对应的槽中。当不同key根据hash函数计算得到的下标相同时,就出现了冲突。解决冲突有很多方式,比如让每个槽成为链表,每次冲突以后放到该槽链表的尾部,但是查询时间就会退化,不再是O(1)。还有一种探查方式,当key的槽冲突时候,就会根据一种计算方式去寻找下一个空的槽存放,探查方式有线性探查,二次方探查法等,cpython解释器使用的是二次方探查法。还有一个问题就是当python使用的槽数量大于预分配的2/3时候,会重新分配内存并拷贝以前的数据,所以有时候dict的add操作代价还是比较高的,牺牲空间但是可以始终保证O(1)的查询效率。如果有大量的数据,建议还是使用bloomfilter或者redis提供的HyperLogLog。
如果你感兴趣,可以看看这篇文章,介绍c解释器如何实现的python dict对象:Python dictionary implementation。我们使用Python来实现一个类似的hash结构。
import ctypes
class Array:  # the Array ADT from chapter 2, used here as the HashMap slot array
    """Fixed-size array of Python objects stored in a ctypes array.

    All slots are initialised to ``None``; subscripts are bounds-checked
    with assertions.
    """

    def __init__(self, size):
        assert size > 0, 'array size must be > 0'
        self._size = size
        array_type = ctypes.py_object * size
        self._elements = array_type()
        self.clear(None)

    def __len__(self):
        return self._size

    def __iter__(self):
        return _ArrayIterator(self._elements)

    def __getitem__(self, index):
        assert 0 <= index < len(self), 'out of range'
        return self._elements[index]

    def __setitem__(self, index, value):
        assert 0 <= index < len(self), 'out of range'
        self._elements[index] = value

    def clear(self, value):
        """Set every element to ``value``."""
        position = 0
        while position < len(self):
            self._elements[position] = value
            position += 1
class _ArrayIterator:
    """Iterator that returns the elements of an indexable store in order."""

    def __init__(self, items):
        self._items = items
        self._idx = 0  # position of the next element

    def __iter__(self):
        return self

    def __next__(self):
        if not self._idx < len(self._items):
            raise StopIteration
        position = self._idx
        self._idx = position + 1
        return self._items[position]
class HashMap:
    """Hash map ADT using open addressing with double hashing (like ``dict``).

    A slot is in one of three states:
      1. ``HashMap.UNUSED``: never used. A lookup can stop as soon as it
         reaches such a slot.
      2. ``HashMap.EMPTY``: used once, then removed (a tombstone). Keys
         placed after a collision may still sit further along the probe
         sequence, so a lookup must keep probing past it.
      3. A ``_MapEntry`` holding a live key/value pair.

    The table size is always a prime number. The probe step lies in
    [1, size - 2], so step and size share no factor and the probe sequence
    visits every slot. Each search is capped at ``size`` probes, which
    guarantees termination even when tombstones have used up every UNUSED
    slot; the original lookup looped forever in that situation.
    """

    class _MapEntry:  # the record stored in a slot
        def __init__(self, key, value):
            self.key = key
            self.value = value

    UNUSED = None                    # marker for a never-used slot, compared with ``is``
    EMPTY = _MapEntry(None, None)    # singleton marker for a removed entry

    def __init__(self):
        self._table = Array(7)  # start with 7 slots
        self._count = 0
        # Grow once live entries reach roughly 2/3 of the slots.
        self._maxCount = len(self._table) - len(self._table) // 3

    def __len__(self):
        return self._count

    def __contains__(self, key):
        return self._findSlot(key, False) is not None

    def add(self, key, value):
        """Insert or overwrite ``key``; return True if the key was new."""
        slot = self._findSlot(key, False)
        if slot is not None:  # existing key: overwrite its value
            self._table[slot].value = value
            return False
        slot = self._findSlot(key, True)
        self._table[slot] = HashMap._MapEntry(key, value)
        self._count += 1
        if self._count == self._maxCount:  # load factor reached: grow
            self._rehash()
        return True

    def valueOf(self, key):
        """Return the value stored under ``key``; the key must exist."""
        slot = self._findSlot(key, False)
        assert slot is not None, 'Invalid map key'
        return self._table[slot].value

    def remove(self, key):
        """Delete ``key`` and return its value; the slot becomes a tombstone."""
        slot = self._findSlot(key, forInsert=False)
        # The 1-tuple keeps the message formatting correct for tuple keys.
        assert slot is not None, 'Key error %s' % (key,)
        value = self._table[slot].value
        self._count -= 1
        self._table[slot] = HashMap.EMPTY
        return value

    def __iter__(self):
        return _HashMapIteraotr(self._table)

    def _slot_can_insert(self, slot):
        return (self._table[slot] is HashMap.EMPTY or
                self._table[slot] is HashMap.UNUSED)

    def _findSlot(self, key, forInsert=False):
        """Probe for ``key``.

        Args:
            key: the key to locate.
            forInsert (bool): True to find a free slot for a new key, False
                to find the slot that currently holds ``key``.

        Returns:
            The slot index, or None when a lookup does not find the key.

        Raises:
            RuntimeError: if an insertion probe finds no free slot. This is
                not expected while the load factor is maintained.
        """
        slot = self._hash1(key)
        step = self._hash2(key)
        size = len(self._table)
        for _ in range(size):  # a full cycle is at most ``size`` probes
            entry = self._table[slot]
            if forInsert:
                if self._slot_can_insert(slot):
                    return slot
            elif entry is HashMap.UNUSED:
                return None  # a never-used slot ends the probe sequence
            elif entry is not HashMap.EMPTY and entry.key == key:
                return slot
            slot = (slot + step) % size
        if forInsert:
            raise RuntimeError('hash table has no free slot')
        return None

    def _rehash(self):
        """Move every live entry into a larger table, dropping tombstones."""
        origTable = self._table
        # At least 2n+1 slots, rounded up to a prime so that probing keeps
        # covering the whole table.
        newSize = self._nextPrime(len(origTable) * 2 + 1)
        self._table = Array(newSize)
        self._count = 0
        self._maxCount = newSize - newSize // 3
        for i in range(len(origTable)):
            entry = origTable[i]
            if entry is not HashMap.UNUSED and entry is not HashMap.EMPTY:
                slot = self._findSlot(entry.key, True)
                self._table[slot] = entry
                self._count += 1

    @staticmethod
    def _nextPrime(n):
        """Return the smallest prime that is >= n."""
        candidate = max(n, 2)
        while any(candidate % d == 0
                  for d in range(2, int(candidate ** 0.5) + 1)):
            candidate += 1
        return candidate

    def _hash1(self, key):
        """Home slot of ``key``."""
        return abs(hash(key)) % len(self._table)

    def _hash2(self, key):
        """Probe step used after a collision; always in [1, size - 2]."""
        return 1 + abs(hash(key)) % (len(self._table) - 2)
class _HashMapIteraotr:
    """Iterator over the keys stored in a hash map's slot array.

    Args:
        array: the slot array; each slot is either None or an object with a
            ``key`` attribute. Entries whose key is None are skipped.
    """

    def __init__(self, array):
        self._array = array
        self._idx = 0

    def __iter__(self):
        return self

    def __next__(self):
        # Advance to the next occupied slot. The original returned None for
        # every unoccupied slot instead of skipping it, so iteration yielded
        # spurious None values.
        while self._idx < len(self._array):
            entry = self._array[self._idx]
            self._idx += 1
            if entry is not None and entry.key is not None:
                return entry.key
        raise StopIteration
def print_h(h):
    """Print every item yielded by ``h`` with its running index, then a blank gap."""
    position = 0
    for item in h:
        print(position, item)
        position += 1
    print('\n')
def test_HashMap():
    """A few simple unit checks for HashMap; coverage is far from complete."""
    h = HashMap()
    assert len(h) == 0

    # A single key: add, read back, remove.
    h.add('a', 'a')
    assert h.valueOf('a') == 'a'
    assert len(h) == 1
    a_v = h.remove('a')
    assert a_v == 'a'
    assert len(h) == 0

    # Two keys, removed one after the other.
    h.add('a', 'a')
    h.add('b', 'b')
    assert len(h) == 2
    assert h.valueOf('b') == 'b'
    b_v = h.remove('b')
    assert b_v == 'b'
    assert len(h) == 1
    h.remove('a')
    assert len(h) == 0

    # Enough keys to force the table to grow.
    n = 10
    keys = [str(i) for i in range(n)]
    for i, key in enumerate(keys):
        h.add(key, i)
    assert len(h) == n
    print_h(h)
    for key in keys:
        assert key in h
    for key in keys:
        h.remove(key)
    assert len(h) == 0
12章:Advanced Sorting
第5章介绍了基本的排序算法,本章介绍高级排序算法。
归并排序(mergesort): 分治法
def merge_sorted_list(listA, listB):
    """Merge two ascending lists into a new ascending list, tracing the step.

    Runs in time proportional to the combined length of both inputs. The
    inputs and the result are printed so the merge sort can be followed.
    """
    print('merge left right list', listA, listB, end='')
    merged = []
    i = j = 0
    size_a, size_b = len(listA), len(listB)
    # Take the smaller head element until one input is exhausted.
    while i < size_a and j < size_b:
        if listA[i] < listB[j]:
            merged.append(listA[i])
            i += 1
        else:
            merged.append(listB[j])
            j += 1
    # Copy whatever remains of either input.
    merged.extend(listA[k] for k in range(i, size_a))
    merged.extend(listB[k] for k in range(j, size_b))
    print(' ->', merged)
    return merged
def mergesort(theList):
    """Return the elements of ``theList`` in ascending order using merge sort.

    Divide and conquer, O(n log n): log n levels of recursion with n steps
    of merging per level.
      1. Split the list into smaller and smaller sublists.
      2. Merge the sorted sublists back into one sorted list.
    Each call prints its input so the whole process can be followed.
    """
    print(theList)
    if len(theList) <= 1:  # base case: nothing to sort
        return theList
    mid = len(theList) // 2
    # Sort both halves recursively, then merge the two sorted halves.
    left_half = mergesort(theList[:mid])
    right_half = mergesort(theList[mid:])
    return merge_sorted_list(left_half, right_half)
""" 这是我调用一次打出来的排序过程
[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
[10, 9, 8, 7, 6]
[10, 9]
[10]
[9]
merge left right list [10] [9] -> [9, 10]
[8, 7, 6]
[8]
[7, 6]
[7]
[6]
merge left right list [7] [6] -> [6, 7]
merge left right list [8] [6, 7] -> [6, 7, 8]
merge left right list [9, 10] [6, 7, 8] -> [6, 7, 8, 9, 10]
[5, 4, 3, 2, 1]
[5, 4]
[5]
[4]
merge left right list [5] [4] -> [4, 5]
[3, 2, 1]
[3]
[2, 1]
[2]
[1]
merge left right list [2] [1] -> [1, 2]
merge left right list [3] [1, 2] -> [1, 2, 3]
merge left right list [4, 5] [1, 2, 3] -> [1, 2, 3, 4, 5]
"""
快速排序
def quicksort(theSeq, first, last):
    """Sort ``theSeq[first..last]`` in place with quicksort.

    Also divide and conquer, but unlike merge sort the sequence is split
    around a chosen pivot rather than at the middle:
      1. Partition around the pivot: smaller elements go to its left,
         elements greater than or equal to it go to its right.
      2. Recurse on both sides until a side has fewer than two elements.
    Since the partition happens in place, no merge step is needed.
    """
    if not first < last:
        return
    pos = partitionSeq(theSeq, first, last)
    # Sort the two partitions on either side of the pivot.
    quicksort(theSeq, first, pos - 1)
    quicksort(theSeq, pos + 1, last)
def partitionSeq(theSeq, first, last):
    """Partition ``theSeq[first..last]`` around the pivot ``theSeq[first]``.

    Elements smaller than the pivot are moved to its left, the others to its
    right. The sequence is printed before and after the partition.

    Returns:
        The final index of the pivot.
    """
    pivot = theSeq[first]
    print('before partitionSeq', theSeq)
    lo, hi = first + 1, last
    while True:
        # Move right until an element that is not smaller than the pivot.
        while lo <= hi and theSeq[lo] < pivot:
            lo += 1
        # Move left until an element that is smaller than the pivot.
        while hi >= lo and theSeq[hi] >= pivot:
            hi -= 1
        if hi < lo:
            break
        theSeq[lo], theSeq[hi] = theSeq[hi], theSeq[lo]
    # Put the pivot between the two partitions.
    theSeq[first], theSeq[hi] = theSeq[hi], theSeq[first]
    print('after partitionSeq {}: {}\t'.format(theSeq, pivot))
    return hi
def test_partitionSeq():
    """Check the pivot position returned for three small inputs."""
    cases = (
        ([0, 1, 2, 3, 4], 0),
        ([4, 3, 2, 1, 0], 4),
        ([2, 3, 0, 1, 4], 2),
    )
    for l, expected in cases:
        assert partitionSeq(l, 0, len(l)-1) == expected


test_partitionSeq()
def test_quicksort():
    """Sort 100 random lists with quicksort and check that each ends up ordered."""
    from random import randint

    def _is_sorted(seq):
        return not any(seq[i] > seq[i+1] for i in range(len(seq)-1))

    for _ in range(100):
        _len = randint(1, 100)
        to_sort = [randint(0, 100) for _ in range(_len)]
        quicksort(to_sort, 0, len(to_sort)-1)  # sorts in place, modifying the list
        print(to_sort)
        assert _is_sorted(to_sort)


test_quicksort()
利用快排中的partitionSeq操作,我们还能实现另一个算法,nth_element,快速查找一个无序数组中第k小的元素(k从0开始计数,即排序后下标为k的元素)
def nth_element(seq, beg, end, k):
    """Return the element that would sit at index ``k`` if ``seq`` were sorted.

    Only ``seq[beg..end]`` is searched, and ``seq`` is partially reordered in
    place by the partition step.
    """
    if beg == end:
        return seq[beg]
    pivot_index = partitionSeq(seq, beg, end)
    if pivot_index == k:
        return seq[k]
    # Continue in the side of the pivot that contains index k.
    if pivot_index > k:
        end = pivot_index - 1
    else:
        beg = pivot_index + 1
    return nth_element(seq, beg, end, k)
def test_nth_element():
    """Shuffle 0..9 and check that nth_element finds every rank."""
    from random import shuffle
    n = 10
    l = list(range(n))
    shuffle(l)
    print(l)
    last = len(l)-1
    for rank in range(len(l)):
        assert nth_element(l, 0, last, rank) == rank


test_nth_element()
13章:Binary Tree
The binary Tree: 二叉树,每个节点最多只有两个子节点
class _BinTreeNode:
    """Node of a binary tree: a payload and up to two children."""

    def __init__(self, data):
        # A new node starts as a leaf.
        self.left = self.right = None
        self.data = data
# The three depth-first traversals
def preorderTrav(subtree):
    """Pre-order traversal: print the root, then the left and right subtrees."""
    if subtree is None:
        return
    print(subtree.data)
    for child in (subtree.left, subtree.right):
        preorderTrav(child)
def inorderTrav(subtree):
    """In-order traversal: print the left subtree, the root, then the right subtree."""
    if subtree is not None:
        # Recurse with inorderTrav itself. The original called preorderTrav
        # on the subtrees, so everything below the root came out pre-order.
        inorderTrav(subtree.left)
        print(subtree.data)
        inorderTrav(subtree.right)
def postorderTrav(subtree):
    """Post-order traversal: print the left and right subtrees, then the root."""
    if subtree is not None:
        # Recurse with postorderTrav itself. The original called preorderTrav
        # on the subtrees, so everything below the root came out pre-order.
        postorderTrav(subtree.left)
        postorderTrav(subtree.right)
        print(subtree.data)
# Breadth-first traversal: visit the tree level by level, using a queue
def breadthFirstTrav(bintree):
    """Print the data of every node, level by level from left to right.

    Args:
        bintree: the root node, or None for an empty tree (prints nothing).
    """
    # A deque is a plain FIFO with O(1) operations at both ends;
    # queue.Queue adds thread synchronisation that is not needed here.
    from collections import deque
    if bintree is None:  # the original crashed on an empty tree
        return
    pending = deque([bintree])
    while pending:
        node = pending.popleft()
        print(node.data)
        if node.left is not None:
            pending.append(node.left)
        if node.right is not None:
            pending.append(node.right)
class _ExpTreeNode:
    """Node of an expression tree; ``element`` is an operator or an operand."""

    __slots__ = ('element', 'left', 'right')

    def __init__(self, data):
        self.left = self.right = None
        self.element = data

    def __repr__(self):
        parts = (self.element, self.left, self.right)
        return '<_ExpTreeNode: {} {} {}>'.format(*parts)
from queue import Queue
class ExpressionTree:
    r"""Binary expression tree built from a fully parenthesised infix string.

    Operators are stored in the interior nodes and operands in the leaves,
    so ``((9+3)*(8-4))`` becomes::

              *
            /   \
           +     -
          / \   / \
         9   3 8   4

    Every token is a single character: ``(``, ``)``, a binary operator
    (``+ - * / %``), a decimal digit, or a one-letter variable name.
    Whitespace in the input is ignored.

    Interface:
        ExpressionTree(expStr): build the tree from the expression string.
        evaluate(varDict): compute the numeric value of the expression.
        str(tree): rebuild the fully parenthesised expression string.

    Usage:
        vars = {'a': 5, 'b': 12}
        expTree = ExpressionTree("(a/(b-3))")
        print('The result = ', expTree.evaluate(vars))
    """

    # Supported binary operators keyed by their token; built once for the
    # class rather than on every evaluation step.
    _OPERATORS = {
        '+': lambda left, right: left + right,
        '-': lambda left, right: left - right,
        '*': lambda left, right: left * right,
        '/': lambda left, right: left / right,
        '%': lambda left, right: left % right,
    }

    def __init__(self, expStr):
        """Parse ``expStr``; raises ValueError if it is structurally malformed."""
        self._expTree = None
        self._buildTree(expStr)

    def evaluate(self, varDict):
        """Return the value of the expression.

        ``varDict`` maps variable names to their numeric values.  An
        operand that is neither a digit nor a key of ``varDict`` fails the
        'invalid variable.' assertion; an operator outside the supported
        set raises KeyError.
        """
        return self._evalTree(self._expTree, varDict)

    def __str__(self):
        return self._buildString(self._expTree)

    def _buildString(self, treeNode):
        """Render a subtree, wrapping every operator node in parentheses."""
        if treeNode.left is None and treeNode.right is None:
            return str(treeNode.element)  # a leaf is a bare operand
        return '({}{}{})'.format(
            self._buildString(treeNode.left),
            treeNode.element,
            self._buildString(treeNode.right))

    def _evalTree(self, subtree, varDict):
        """Evaluate a subtree bottom-up and return its numeric value."""
        if subtree.left is None and subtree.right is None:
            token = subtree.element
            if '0' <= token <= '9':  # literal digit
                return int(token)
            # Anything else must be a variable supplied by the caller.
            assert token in varDict, 'invalid variable.'
            return varDict[token]
        lvalue = self._evalTree(subtree.left, varDict)
        rvalue = self._evalTree(subtree.right, varDict)
        return self._computeOp(lvalue, subtree.element, rvalue)

    def _computeOp(self, left, op, right):
        """Apply the binary operator ``op`` to the two operand values."""
        assert op
        return self._OPERATORS[op](left, right)

    def _buildTree(self, expStr):
        """Tokenise ``expStr`` character by character and build the tree."""
        tokens = (ch for ch in expStr if not ch.isspace())
        self._expTree = _ExpTreeNode(None)  # root placeholder, filled below
        self._recBuildTree(self._expTree, tokens)
        # A well-formed expression is consumed completely by the parse.
        if next(tokens, None) is not None:
            raise ValueError('unexpected trailing input in expression')

    @staticmethod
    def _nextToken(tokens):
        """Return the next token, or raise ValueError when input runs out."""
        token = next(tokens, None)
        if token is None:
            # Fail fast instead of waiting for a token that never arrives.
            raise ValueError('unexpected end of expression')
        return token

    def _recBuildTree(self, curNode, tokens):
        """Fill ``curNode`` from the token iterator via recursive descent."""
        token = self._nextToken(tokens)
        if token == '(':
            # "( <operand> <operator> <operand> )"
            curNode.left = _ExpTreeNode(None)
            self._recBuildTree(curNode.left, tokens)
            curNode.element = self._nextToken(tokens)  # the operator
            curNode.right = _ExpTreeNode(None)
            self._recBuildTree(curNode.right, tokens)
            if self._nextToken(tokens) != ')':
                raise ValueError("expected ')' to close sub-expression")
        elif token == ')':
            raise ValueError("unexpected ')' where an operand was expected")
        else:
            # Operand: kept as the raw character and interpreted when the
            # tree is evaluated.
            curNode.element = token
# Demo: build the tree for ((2*7)+8), print it back as a string, then
# evaluate it.
# NOTE(review): `vars` shadows the builtin of the same name; this expression
# contains only digits, so the mapping is passed along but never consulted.
vars = {'a': 5, 'b': 12}
expTree = ExpressionTree("((2*7)+8)")
print(expTree)  # rendered through ExpressionTree.__str__
print('The result = ', expTree.evaluate(vars))
Heap(堆):二叉树最直接的一个应用就是实现堆。堆就是一棵完全二叉树,最大堆的非叶子节点的值都比孩子大,最小堆的非叶子结点的值都比孩子小。 python内置了heapq模块帮助我们实现堆操作,比如用内置的heapq模块实现个堆排序:
# 使用python内置的heapq实现heap sort
def heapsort(iterable):
    """Return a new list holding the items of ``iterable`` in ascending order.

    The input is copied into a list, turned into a min-heap and drained by
    repeatedly popping the smallest item.  The argument itself is never
    modified, and any iterable (including a one-shot iterator) is accepted.
    """
    from heapq import heapify, heappop

    heap = list(iterable)
    heapify(heap)  # one linear-time pass instead of n separate pushes
    # range() is evaluated once, before the pops start shrinking the heap.
    return [heappop(heap) for _ in range(len(heap))]
但是一般实现堆的时候实际上并不是用树节点来实现的,而是使用数组实现,效率比较高。为什么可以用数组实现呢?因为完全二叉树的性质,
可以用下标之间的关系表示节点之间的关系,MaxHeap的docstring中已经说明了
class MaxHeap:
    """Fixed-capacity max-heap stored in a flat array.

    A heap is a complete binary tree (shape property) in which no child is
    larger than its parent (order property); both are restored after every
    insertion and extraction.  Numbering the nodes level by level from the
    root lets index arithmetic replace explicit node links:

        parent = (i - 1) // 2
        left   = 2 * i + 1
        right  = 2 * i + 2

    Only the first ``len(self)`` slots of the backing array belong to the
    heap; whatever lies beyond them is unused storage and is never compared
    against or displayed.
    """

    def __init__(self, maxSize):
        """Create an empty heap able to hold ``maxSize`` items."""
        self._elements = Array(maxSize)  # Array ADT implemented in chapter 2
        self._count = 0

    def __len__(self):
        return self._count

    def capacity(self):
        """Return the maximum number of items the heap can hold."""
        return len(self._elements)

    def add(self, value):
        """Insert ``value``; fails an assertion when the heap is full."""
        assert self._count < self.capacity(), 'can not add to full heap'
        self._elements[self._count] = value
        self._count += 1
        self._siftUp(self._count - 1)
        if __debug__:
            # Full O(n) self-check, skipped when assertions are disabled.
            self.assert_keep_heap()

    def extract(self):
        """Remove and return the largest item; asserts the heap is non-empty."""
        assert self._count > 0, 'can not extract from an empty heap'
        value = self._elements[0]  # the root holds the maximum
        self._count -= 1
        # Move the last node into the root and let it sink into place.
        self._elements[0] = self._elements[self._count]
        self._siftDown(0)
        if __debug__:
            self.assert_keep_heap()
        return value

    def _siftUp(self, ndx):
        """Bubble the item at ``ndx`` up while it is larger than its parent."""
        while ndx > 0:
            parent = (ndx - 1) // 2
            if not self._elements[ndx] > self._elements[parent]:
                break
            self._elements[ndx], self._elements[parent] = (
                self._elements[parent], self._elements[ndx])
            ndx = parent

    def _siftDown(self, ndx):
        """Sink the item at ``ndx`` until neither child is larger than it."""
        while True:
            left = 2 * ndx + 1
            right = left + 1
            largest = ndx
            # Each child is bounds-checked before it is read, so slots past
            # the live region (stale or out of range) never take part.
            if (left < self._count and
                    self._elements[left] > self._elements[largest]):
                largest = left
            if (right < self._count and
                    self._elements[right] > self._elements[largest]):
                largest = right
            if largest == ndx:
                return
            self._elements[ndx], self._elements[largest] = (
                self._elements[largest], self._elements[ndx])
            ndx = largest

    def __repr__(self):
        # Show only the live items, in array (level) order.
        return ' '.join(str(self._elements[i]) for i in range(self._count))

    def assert_keep_heap(self):
        """Assert that every parent is >= each child it actually has.

        Used to verify that add() and extract() preserve the max-heap order.
        """
        size = len(self)
        for parent in range(size // 2):  # exactly the nodes with a child
            for child in (2 * parent + 1, 2 * parent + 2):
                if child < size:
                    assert self._elements[parent] >= self._elements[child]
def test_MaxHeap():
    """Unit test for MaxHeap: fill it in ascending order, drain it in descending order."""
    size = 10
    heap = MaxHeap(size)
    for value in range(size):
        heap.add(value)
        heap.assert_keep_heap()
    # Values went in smallest-first, so they must come out largest-first.
    for expected in reversed(range(size)):
        assert heap.extract() == expected


test_MaxHeap()
def simpleHeapSort(theSeq):
    """Sort ``theSeq`` in place in ascending order using MaxHeap and return it.

    Every item is pushed onto a MaxHeap sized to the sequence; the maxima
    are then written back starting from the last position.  An empty
    sequence is returned untouched.
    """
    if not theSeq:
        return theSeq
    size = len(theSeq)
    heap = MaxHeap(size)
    for item in theSeq:
        heap.add(item)
    # Fill from the back: each extraction yields the largest remaining item.
    slot = size
    while slot > 0:
        slot -= 1
        theSeq[slot] = heap.extract()
    return theSeq
def test_simpleHeapSort():
    """Check simpleHeapSort on the empty list and on 1000 random lists."""
    from random import randint

    def _is_sorted(seq):
        # True when no element is greater than its right-hand neighbour.
        return not any(seq[k] > seq[k + 1] for k in range(len(seq) - 1))

    assert simpleHeapSort([]) == []
    for _round in range(1000):
        length = randint(1, 100)
        to_sort = [randint(0, 100) for _ in range(length)]
        simpleHeapSort(to_sort)  # sorts in place, so to_sort itself changes
        assert _is_sorted(to_sort)


test_simpleHeapSort()
14章:Search Trees
二叉查找树性质:对每个内部节点V,
- 所有key小于V.key的存储在V的左子树。
- 所有key大于V.key的存储在V的右子树
对BST进行中序遍历会得到升序的key序列
class _BSTMapNode:
    """Node of the BST-backed map: a key/value entry plus two child links."""

    __slots__ = ('key', 'value', 'left', 'right')

    def __init__(self, key, value):
        self.key, self.value = key, value
        # New nodes start without children.
        self.left = self.right = None

    def __repr__(self):
        # Children are rendered recursively, so this prints the whole subtree.
        template = '<{0.key}:{0.value}> left:{0.left}, right:{0.right}'
        return template.format(self)

    __str__ = __repr__
class BSTMap:
    """Map ADT backed by an unbalanced binary search tree.

    Each node stores a key and its payload.  For every node V, all keys
    smaller than V.key live in V's left subtree and all larger keys in its
    right subtree, so an in-order traversal visits the keys in ascending
    order.
    """

    def __init__(self):
        self._root = None
        self._size = 0
        self._rval = None  # value of the entry removed by the latest remove()

    def __len__(self):
        return self._size

    def __iter__(self):
        """Iterate over the keys (collected by an in-order traversal)."""
        return _BSTMapIterator(self._root, self._size)

    def __contains__(self, key):
        return self._bstSearch(self._root, key) is not None

    def valueOf(self, key):
        """Return the value stored under ``key``; asserts the key exists."""
        node = self._bstSearch(self._root, key)
        assert node is not None, 'Invalid map key.'
        return node.value

    def _bstSearch(self, subtree, target):
        """Return the node holding ``target`` within ``subtree``, or None."""
        while subtree is not None:
            if target < subtree.key:
                subtree = subtree.left
            elif target > subtree.key:
                subtree = subtree.right
            else:
                return subtree  # the node itself, not a copy
        return None

    def _bstMinumum(self, subtree):
        """Return the node with the smallest key in ``subtree`` (None if empty).

        The minimum is reached by following left links until there are none.
        """
        if subtree is None:
            return None
        while subtree.left is not None:
            subtree = subtree.left
        return subtree

    def add(self, key, value):
        """Insert ``key`` or replace its value; O(N) in the worst case.

        Returns True when a new entry was created and False when an
        existing key merely had its value replaced.
        """
        node = self._bstSearch(self._root, key)
        if node is not None:  # key already present: update in place
            node.value = value
            return False
        self._root = self._bstInsert(self._root, key, value)
        self._size += 1
        return True

    def _bstInsert(self, subtree, key, value):
        """Insert a new entry below ``subtree`` and return the subtree root.

        New nodes always end up as leaves.
        """
        if subtree is None:
            subtree = _BSTMapNode(key, value)
        elif key < subtree.key:
            subtree.left = self._bstInsert(subtree.left, key, value)
        elif key > subtree.key:
            subtree.right = self._bstInsert(subtree.right, key, value)
        # No branch for an equal key: add() has already ruled duplicates out.
        return subtree

    def remove(self, key):
        """Remove ``key`` and return its value; O(N) in the worst case.

        Asserts that the key exists.  The node being removed falls into one
        of three cases:
          1. leaf: the parent's link to it is cleared;
          2. one child: the parent is linked to that child instead;
          3. two children: the in-order successor S (the smallest key in the
             right subtree) is copied into the node, and S is then removed
             from the right subtree.
        """
        assert key in self, 'invalid map key'
        self._root = self._bstRemove(self._root, key)
        self._size -= 1
        return self._rval

    def _bstRemove(self, subtree, target):
        """Remove ``target`` from ``subtree`` and return the new subtree root.

        The value of the removed entry is left in ``self._rval``.
        """
        if subtree is None:
            return subtree
        if target < subtree.key:
            subtree.left = self._bstRemove(subtree.left, target)
            return subtree
        if target > subtree.key:
            subtree.right = self._bstRemove(subtree.right, target)
            return subtree

        # Found the node holding the target.
        self._rval = subtree.value
        if subtree.left is None:
            return subtree.right  # leaf (None) or right child only
        if subtree.right is None:
            return subtree.left  # left child only

        # Two children: take over the successor's entry, then delete it.
        removed_value = subtree.value
        successor = self._bstMinumum(subtree.right)
        subtree.key = successor.key
        subtree.value = successor.value
        subtree.right = self._bstRemove(subtree.right, successor.key)
        # The nested removal stored the successor's value in _rval; put back
        # the value of the entry the caller actually asked to remove.
        self._rval = removed_value
        return subtree

    def __repr__(self):
        return '->'.join([str(i) for i in self])

    def assert_keep_bst_property(self, subtree):
        """Assert that ``subtree`` is a valid BST with respect to its keys.

        Walks the subtree in order and requires the keys to be strictly
        increasing, which covers whole subtrees rather than only direct
        parent/child pairs.  Used to verify that add() and remove() keep
        the tree ordered.
        """
        unset = object()  # marks "no key visited yet"
        previous = unset
        stack = []
        node = subtree
        while stack or node is not None:
            while node is not None:  # dive to the leftmost unvisited node
                stack.append(node)
                node = node.left
            node = stack.pop()
            if previous is not unset:
                assert previous < node.key
            previous = node.key
            node = node.right
class _BSTMapIterator:
    """Iterator over the keys of a BST, snapshotted at construction time.

    The keys are copied into an array by an in-order traversal and then
    handed out one at a time.
    """

    def __init__(self, root, size):
        self._theKeys = Array(size)
        self._curItem = 0  # write cursor while the keys are collected
        self._bstTraversal(root)
        self._curItem = 0  # rewound to serve as the read cursor

    def __iter__(self):
        return self

    def __next__(self):
        position = self._curItem
        if position >= len(self._theKeys):
            raise StopIteration
        key = self._theKeys[position]
        self._curItem = position + 1
        return key

    def _bstTraversal(self, subtree):
        """Store the keys of ``subtree`` in order, starting at the cursor."""
        pending = []  # ancestors whose key and right subtree are still due
        node = subtree
        while pending or node is not None:
            while node is not None:
                pending.append(node)
                node = node.left
            node = pending.pop()
            self._theKeys[self._curItem] = node.key
            self._curItem += 1
            node = node.right
def test_BSTMap():
    """Insert a handful of keys into a BSTMap and check they are all stored.

    Each key doubles as its own value, because BSTMap.add() requires both
    a key and a value.
    """
    keys = [60, 25, 100, 35, 17, 80]
    bst = BSTMap()
    for key in keys:
        inserted = bst.add(key, key)
        assert inserted  # every key is new, so add() reports an insertion
    assert len(bst) == len(keys)
    for key in keys:
        assert key in bst
        assert bst.valueOf(key) == key
    # Iteration walks the tree in order, i.e. in ascending key order.
    assert list(bst) == sorted(keys)
def test_HashMap():
    """Map test originally written for the hash-based map, run against BSTMap.

    Covers add / valueOf / remove / len / membership on a few entries, then
    inserts ten entries and removes them one by one while printing the map
    before and after each removal.
    """
    table = BSTMap()  # previously a HashMap instance
    assert len(table) == 0

    # Single entry: add, look up, remove.
    table.add('a', 'a')
    assert table.valueOf('a') == 'a'
    assert len(table) == 1
    removed = table.remove('a')
    assert removed == 'a'
    assert len(table) == 0

    # Two entries, removed in reverse insertion order.
    table.add('a', 'a')
    table.add('b', 'b')
    assert len(table) == 2
    assert table.valueOf('b') == 'b'
    removed = table.remove('b')
    assert removed == 'b'
    assert len(table) == 1
    table.remove('a')
    assert len(table) == 0

    # Ten entries keyed by the digit strings '0'..'9'.
    count = 10
    for number in range(count):
        table.add(str(number), number)
    assert len(table) == count
    for number in range(count):
        assert str(number) in table
    for number in range(count):
        print(len(table))
        print('bef', table)
        removed = table.remove(str(number))
        assert removed == number
        print('aft', table)
        print(len(table))
    assert len(table) == 0


test_HashMap()