top k算法是什么大家都明白,就不废话了,这里直接上代码。本人用python实现了3种常用算法,命名如下:
heap_bigk1,用堆排序的思想对全部数据构建大根堆,然后依次取出堆顶,得到最大的k个数。
heap_bigk2 初始取前k个数据构建一个小根堆,然后扫描整个数组,发现有更大的数就更新堆。
quick_bigk 用快速排序思想实现的top k算法。
#用堆排序思想实现的两种算法
def maxheap_adjust(array, arrsize, pos):
    """Sift the value at index ``pos`` down inside a max-heap, in place.

    Only the first ``arrsize`` slots of ``array`` are treated as the heap.
    The value keeps moving to its larger child until it is greater than or
    equal to every child it has, or until it becomes a leaf.

    Args:
        array: Mutable sequence that stores the heap.
        arrsize: Number of leading elements that belong to the heap.
        pos: Index of the element to sift down; nothing happens when
            ``pos >= arrsize``.

    Returns:
        None. ``array`` is modified in place.
    """
    if pos >= arrsize:
        return

    sinking = array[pos]
    node = pos
    while True:
        left = 2 * node + 1
        right = left + 1
        if left >= arrsize:
            # Reached a leaf.
            return

        if right >= arrsize:
            # Only a left child exists.
            if sinking >= array[left]:
                return
            target = left
        else:
            left_val = array[left]
            right_val = array[right]
            if sinking >= left_val and sinking >= right_val:
                return
            # Move towards the larger child; equal children go right.
            target = left if left_val > right_val else right

        array[node], array[target] = array[target], array[node]
        node = target
def minheap_adjust(array, arrsize, pos):
    """Sift the value at index ``pos`` down inside a min-heap, in place.

    Only the first ``arrsize`` slots of ``array`` are treated as the heap.
    The value keeps moving to its smaller child until it is less than or
    equal to every child it has, or until it becomes a leaf.

    Args:
        array: Mutable sequence that stores the heap.
        arrsize: Number of leading elements that belong to the heap.
        pos: Index of the element to sift down; nothing happens when
            ``pos >= arrsize``.

    Returns:
        None. ``array`` is modified in place.
    """
    if pos >= arrsize:
        return

    sinking = array[pos]
    node = pos
    while True:
        left = 2 * node + 1
        right = left + 1
        if left >= arrsize:
            # Reached a leaf.
            return

        if right >= arrsize:
            # Only a left child exists.
            if sinking <= array[left]:
                return
            target = left
        else:
            left_val = array[left]
            right_val = array[right]
            if sinking <= left_val and sinking <= right_val:
                return
            # Move towards the smaller child; equal children go right.
            target = left if left_val < right_val else right

        array[node], array[target] = array[target], array[node]
        node = target
def makeheap(array, arrsize, ismaxheap):
    """Heapify the first ``arrsize`` elements of ``array`` in place.

    Every parent node is sifted down, starting from the last parent and
    working back to the root.

    Args:
        array: Mutable sequence to rearrange.
        arrsize: Number of leading elements that form the heap.
        ismaxheap: Truthy to build a max-heap (uses ``maxheap_adjust``),
            falsy to build a min-heap (uses ``minheap_adjust``).

    Returns:
        None. ``array`` is modified in place.
    """
    last_parent = int(arrsize / 2) - 1
    sift_down = maxheap_adjust if ismaxheap else minheap_adjust
    for node in reversed(range(last_parent + 1)):
        sift_down(array, arrsize, node)
def heap_bigk1(array, arrsize, k):
    """Return the ``k`` largest of the first ``arrsize`` elements.

    A max-heap is built over all ``arrsize`` elements, then the root is
    popped ``k`` times; each popped maximum is parked at the shrinking tail
    of the heap region. ``array`` is reordered in place.

    Args:
        array: Mutable sequence holding the data.
        arrsize: Number of leading elements to consider.
        k: How many of the largest elements to return. Values outside
            ``0..arrsize`` are clamped into that range.

    Returns:
        A new list with the ``k`` largest elements in ascending order
        (the slice ``array[arrsize - k:arrsize]`` after extraction).
    """
    # Clamp k: with k > arrsize the extraction loop used to run its index
    # below zero, swapping through negative (wrap-around) positions and
    # returning a meaningless slice.
    k = max(0, min(k, arrsize))

    makeheap(array, arrsize, True)
    for heap_size in range(arrsize, arrsize - k, -1):
        last = heap_size - 1
        # Move the current maximum behind the heap, then restore the heap
        # property on the remaining `last` elements.
        array[0], array[last] = array[last], array[0]
        maxheap_adjust(array, last, 0)
    return array[arrsize - k:arrsize]
def heap_bigk2(array, arrsize, k):
    """Return the ``k`` largest of the first ``arrsize`` elements.

    The first ``k`` elements are turned into a min-heap whose root is the
    smallest candidate kept so far. Every remaining element that beats the
    root replaces it, and the heap is repaired. ``array`` is reordered in
    place.

    Args:
        array: Mutable sequence holding the data.
        arrsize: Number of leading elements to consider.
        k: How many of the largest elements to return. Values outside
            ``0..arrsize`` are clamped into that range.

    Returns:
        A new list with the ``k`` largest elements in min-heap order
        (not sorted); an empty list when the clamped ``k`` is 0.
    """
    # Clamp k: with k > arrsize the heap construction indexed past the end
    # of the data, and a negative k produced a wrong slice.
    k = max(0, min(k, arrsize))
    if k == 0:
        return []

    makeheap(array, k, False)
    for i in range(k, arrsize):
        if array[i] > array[0]:
            # Swap rather than overwrite so the evicted root stays in the
            # list and the caller's data remains a permutation of its input.
            array[0], array[i] = array[i], array[0]
            minheap_adjust(array, k, 0)
    return array[0:k]
#基于快速排序的算法实现
def partition(array, start, end, incorder):
    """Partition ``array[start:end + 1]`` in place around a random pivot.

    A pivot is chosen uniformly from the inclusive index range
    ``start..end`` and moved to ``end``. Elements that must precede the
    pivot are then gathered at the front of the range, and the pivot is
    dropped in right after them.

    Args:
        array: Mutable sequence to partition.
        start: First index of the range (inclusive).
        end: Last index of the range (inclusive).
        incorder: Truthy for ascending order (elements strictly smaller
            than the pivot go first); falsy for descending order
            (elements strictly larger than the pivot go first).

    Returns:
        The final index of the pivot.
    """
    chosen = random.randint(start, end)
    array[chosen], array[end] = array[end], array[chosen]
    pivot = array[end]

    boundary = start
    for idx in range(start, end):
        item = array[idx]
        goes_first = item < pivot if incorder else item > pivot
        if goes_first:
            array[boundary], array[idx] = array[idx], array[boundary]
            boundary += 1

    # Place the pivot between the two groups.
    array[boundary], array[end] = array[end], array[boundary]
    return boundary
# Legacy module-level flag. It is kept only so external code that reads or
# resets it keeps working; quick_bigk no longer consults or modifies it.
find = False


def quick_bigk(array, start, end, k):
    """Move the ``k`` largest elements of ``array`` to the front, in place.

    This is a quickselect: the active range ``start..end`` is repeatedly
    partitioned in descending order until the pivot lands at index ``k`` or
    ``k - 1``, or the range shrinks to at most one element. For a top-level
    call with ``start == 0``, ``end == len(array) - 1`` and
    ``1 <= k <= len(array)``, the ``k`` largest elements then occupy
    ``array[0:k]`` in unspecified order.

    The earlier version signalled completion through the global ``find``
    flag, which was never reset: after the first successful call every
    later call returned immediately without doing any work. This version
    keeps no state between calls, and it loops instead of recursing so an
    unlucky run of pivots cannot exhaust the recursion limit.

    Args:
        array: Mutable sequence to reorder.
        start: First index of the range to search (inclusive).
        end: Last index of the range to search (inclusive).
        k: Number of largest elements wanted, counted from index 0.

    Returns:
        None. ``array`` is modified in place.
    """
    while start < end:
        newpivot = partition(array, start, end, False)
        if newpivot == k or newpivot == k - 1:
            # Everything left of the pivot is larger than it, so the
            # first k slots already hold the k largest values.
            return
        if newpivot < k - 1:
            # Too few large elements so far: continue on the right side.
            start = newpivot + 1
        else:
            # Too many: continue on the left side.
            end = newpivot - 1
运行结果:
array_size=100000 k=10
heap bigk1 time:0.120652s
heap bigk2 time:0.018444s
quick bigk time:0.068579s
array_size=100000 k=50
heap bigk1 time:0.111971s
heap bigk2 time:0.019395s
quick bigk time:0.000005s
array_size=100000 k=100
heap bigk1 time:0.112395s
heap bigk2 time:0.021576s
quick bigk time:0.000006s
array_size=1000000 k=10
heap bigk1 time:1.140459s
heap bigk2 time:0.176139s
quick bigk time:0.000016s
array_size=1000000 k=50
heap bigk1 time:1.152394s
heap bigk2 time:0.177135s
quick bigk time:0.000017s
array_size=1000000 k=100
heap bigk1 time:1.156271s
heap bigk2 time:0.177470s
quick bigk time:0.000017s
array_size=10000000 k=10
heap bigk1 time:11.493444s
heap bigk2 time:1.732685s
quick bigk time:0.000010s
array_size=10000000 k=50
heap bigk1 time:11.498372s
heap bigk2 time:1.726918s
quick bigk time:0.000010s
array_size=10000000 k=100
heap bigk1 time:11.577869s
heap bigk2 time:1.741433s
quick bigk time:0.000006s
简单分析:
heap_bigk1需要构建整个数据的堆,它的时间复杂度为 N + klogN
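heap_bigk2的时间复杂度为 O(k + (N - k)logk),一般情况下N很大、k很小,可以简化为 O(Nlogk)。从理论分析来看,heap_bigk2应该比heap_bigk1要慢,但实际运行却更快,本人还未找到确切原因,可能是heap_bigk1实现得不够好。一个合理的解释是:O(Nlogk)只是最坏情况的上界,对随机数据而言,绝大多数元素都不大于堆顶,只需一次比较即可跳过,真正触发堆调整的次数期望约为 k·ln(N/k),因此实际开销接近 N 次比较;而heap_bigk1必须先对全部N个元素建堆,常数因子明显更大。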
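quick_bigk的平均时间复杂度为 O(N)(期望比较次数约为2N)。上面的运行结果中它看起来比平均情况快得多,但需要注意:除第一组外,quick bigk的耗时都只有约0.00001s,这很可能并非算法的真实耗时——测量所用的实现依赖全局标志find,该标志在第一次找到结果后被置为True,若之后没有重置,后续调用会直接返回而不做任何工作。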
结论:
在内存足够的情况下,quick_bigk优于heap_bigk2,heap_bigk2优于heap_bigk1。
在内存受限的情况下,heap_bigk2不失为一种好方法,其他两种方法则要采用分组多趟的方法进行。