每个RDD由多个分区组成,实际开发中建议对每个分区的数据进行操作,map函数使用mapPartitions代替、foreach函数使用foreachPartition代替。
def f(iterator):
    """Print every element produced by ``iterator``, one per line.

    Intended as the callback for ``RDD.foreachPartition``, which hands the
    function an iterator over the elements of a single partition.

    Args:
        iterator: Iterable over the elements to print.
    """
    for x in iterator:
        print(x)
# Run f once per partition; each call receives an iterator over that
# partition's elements. `sc` is assumed to be an existing SparkContext
# (e.g. the one provided by the pyspark shell) -- it is not defined here.
sc.parallelize([1,2,3,4,5]).foreachPartition(f)
# Build an RDD of four numbers, explicitly split into 2 partitions
# (the second argument of parallelize is the partition count).
rdd = sc.parallelize([1,2,3,4],2)
def f(iterator):
    """Yield the sum of all elements in ``iterator`` as a single value.

    Written as a generator because ``RDD.mapPartitions`` expects the callback
    to return an iterable of output elements for each partition.

    Args:
        iterator: Iterable of numbers (one partition's elements).

    Yields:
        The total of the elements; ``0`` when the iterator is empty.
    """
    yield sum(iterator)
# f yields one sum per partition, so the collected list has one entry per
# partition; with [1,2,3,4] split into 2 partitions this should print [3, 7].
print(rdd.mapPartitions(f).collect())
对RDD中分区数目进行调整(增加分区或减少分区)
>>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
>>> sorted(rdd.glom().collect())
[[1], [2, 3], [4, 5], [6, 7]]
>>> len(rdd.repartition(2).glom().collect())
2
>>> len(rdd.repartition(10).glom().collect())
10
>>> rdd.glom().collect()
[[1], [2, 3], [4, 5], [6, 7]]
>>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
[[1], [2, 3], [4, 5]]
>>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
[[1, 2, 3, 4, 5]]
>>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(4).glom().collect()
[[1], [2, 3], [4, 5]]
>>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(4,True).glom().collect()
[[4, 5], [2, 3], [], [1]]
>>> pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))
>>> pairs.getNumPartitions()
2
>>> pairs.partitionBy(3).glom().collect()
[[(3, 3)], [(1, 1), (4, 4), (4, 4), (1, 1)], [(2, 2), (2, 2)]]
>>> len(pairs.partitionBy(3).glom().collect())
3
from functools import reduce
def add(x, y):
    """Return ``x + y``; used as the binary function for ``reduce``."""
    return x + y


# Plain-Python reduce: fold the list left to right into a single value,
# first with the named function, then with an equivalent lambda.
sum1 = reduce(add, [1, 2, 3, 4, 5])
sum2 = reduce(lambda x, y: x + y, [1, 2, 3, 4, 5])
print(sum1)
print(sum2)
>>> from operator import add
>>> sc.parallelize([1,2,3,4,5]).reduce(add)
15
>>> sc.parallelize((2 for _ in range(10))).map(lambda x:1).cache().reduce(add)
10
>>> from operator import add
>>> sc.parallelize([1,2,3,4,5]).fold(0,add)
15
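>>> # fold applies the zero value once inside every partition and once more when
>>> # merging the partition results; with the 2 default partitions of this
>>> # session (see getNumPartitions() above): 15 + 10 * (2 + 1) = 45
>>> sc.parallelize([1,2,3,4,5]).fold(10,add)
45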
>>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
>>> sorted(rdd.groupByKey().mapValues(len).collect())
[('a', 2), ('b', 1)]
>>> sorted(rdd.groupByKey().mapValues(list).collect())
[('a', [1, 1]), ('b', [1])]
>>> sorted(rdd.reduceByKey(add).collect())
[('a', 2), ('b', 1)]
>>> sorted(rdd.foldByKey(0, add).collect())
[('a', 2), ('b', 1)]
>>> rdd.aggregateByKey(0, add, add).collect()
[('b', 1), ('a', 2)]
>>> sorted(rdd.aggregateByKey(0, add, add).collect())
[('a', 2), ('b', 1)]
>>> x = sc.parallelize([(1001, "zhangsan"), (1002, "lisi"), (1003, "wangwu"), (1004, "zhangliu")])
>>> y = sc.parallelize([(1001, "sales"), (1002, "tech")])
>>> x.join(y).collect()
[(1001, ('zhangsan', 'sales')), (1002, ('lisi', 'tech'))]
>>> x.leftOuterJoin(y).collect()
[(1004, ('zhangliu', None)), (1001, ('zhangsan', 'sales')), (1002, ('lisi', 'tech')), (1003, ('wangwu', None))]
>>> x.rightOuterJoin(y).collect()
[(1001, ('zhangsan', 'sales')), (1002, ('lisi', 'tech'))]