ClickHouse集群缩容,为保证数据不丢失,计划将需要缩容的节点上的数据,迁移到其他节点上,保证迁移到每个机器上的数据量尽量均衡。数据的迁移已partition为单位,已知每个partition的数据量。
将一个包含m个整数的数组分成n个数组,每个数组的和尽量接近
这个问题是典型的动态规划的问题,理论上是无法找到最优解的,但是本次只是为了解决实际生产中的问题,而不是要AC,所以我们只需要找到一个相对合理的算法,使得partition的分配相对均衡就好了。
输入:int数组,分组数divisionNum
我们举一个栗子:
数组为:500, 18, 28, 2, 27, 35, 22, 10, 6, 5, 3, 2, 1;分为4组
排序为:500, 35, 28, 27, 22, 18, 10, 6, 5, 3, 2, 2, 1
计算平均值 avg = 164.75
遍历数组:
结果:
arr 0 is : 500, sum = 500
arr 1 is : 35 18, sum = 53
arr 2 is : 28 22 3, sum = 53
arr 3 is : 27 10 6 5 2 2 1, sum = 53
import math
# 将数组分成n个数组,每个数组的和尽量接近
def GetAvgArr(number_list, arr_num):
avg_arrays = []
if len(number_list) == 0 or len(number_list) < arr_num:
return avg_arrays
# 1. 计算平均值
sum_value = 0
mean_value = 0
number_list_float = []
for num in number_list:
number_list_float.append(float(num))
sum_value += float(num)
mean_value = sum_value / float(arr_num)
# 2. 倒序排序
number_list_float.sort(reverse=True)
for cnt in range(0, arr_num):
# print("mean = ", mean_value)
arr = []
if cnt == arr_num-1:
# 最后一组,返回数组剩余所有数
avg_arrays.append(transFloatToIntList(number_list_float))
break
# 如果最大的数 max >= mean,这个数单独一个组
if len(number_list_float) > 0 and number_list_float[0] >= mean_value:
arr = [number_list_float[0]]
avg_arrays.append(transFloatToIntList(arr))
sum_value = sum_value - number_list_float[0]
# 重新计算剩下partition的平均值
mean_value = sum_value / float(arr_num-len(avg_arrays))
else:
# 否则寻找一组数据
arr, _ = getList(number_list_float, mean_value, math.pow(mean_value, 2))
avg_arrays.append(transFloatToIntList(arr))
# 将已经形成一组的数据从原数组中移除,准备寻找下一组数据
number_list_float = removeFromFloatList(number_list_float, arr)
return avg_arrays
# 将[]float转为[]int
def transFloatToIntList(float_list):
res = []
for item in float_list:
res.append(int(item))
return res
# 将在 remove_nums 中出现过的数字从 originalList 中移除
def removeFromFloatList(original_list, remove_nums):
res = []
start = 0
for remove in remove_nums:
for i in range(start, len(original_list)):
if original_list[i] == remove:
res.extend(original_list[start:i])
start = i + 1
break
res.extend(original_list[start:])
return res
def getList(arr, delta, distance):
res = []
if len(arr) == 0:
return res, -1
for i in range(0, len(arr)-1):
if delta == arr[i]:
res.append(arr[i])
return res, 0
elif delta < arr[i]:
continue
elif delta > arr[i]:
if i == 0:
res.append(arr[i])
delta = delta - arr[i]
distance = math.pow(delta, 2)
tmp, d = getList(arr[i+1:], delta, distance)
res.extend(tmp)
return res, d
else:
dis1 = math.pow(arr[i-1]-delta, 2)
dis2 = math.pow(delta-arr[i], 2)
if dis1 > dis2:
res.append(arr[i])
delta = delta - arr[i]
tmp, d = getList(arr[i+1:], delta, dis2)
res.extend(tmp)
return res, d
else:
tmp, d = getList(arr[i:], delta, dis2)
if dis1 > d:
res.extend(tmp)
return res, d
res.append(arr[i-1])
return res, dis1
dis = math.pow(delta-arr[len(arr)-1], 2)
if dis < distance:
return arr[len(arr)-1:], dis
return [], -1
测试
def main():
partition_list = [500, 18, 28, 2, 27, 35, 22, 10, 6, 5, 3, 2, 1]
print("test partitionList is : {}".format(partition_list))
print("result is :")
arrays = GetAvgArr(partition_list, 4)
for i, arr in enumerate(arrays):
print("arr {} is : {}, ".format(i, arr))
sum_value = 0
for value in arr:
sum_value += value
print("sum = {}".format(sum_value))