在ipython notebook下运行pyspark
from pyspark import SparkConf, SparkContext
appName = 'testSpark'
def main(sc):
    """Placeholder entry point; receives the SparkContext and does nothing yet."""
    pass
if __name__ == '__main__':
    # Configure Spark: application name plus the 'local[2]' master URL.
    conf = SparkConf().setAppName(appName).setMaster('local[2]')
    # getOrCreate() returns the context that is already running, if any, so
    # re-running this cell no longer needs a manual sc.stop() first.
    # NOTE: when an existing context is reused, `conf` is not re-applied.
    sc = SparkContext.getOrCreate(conf=conf)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(sc.version)
    main(sc)
2.0.2
在浏览器中输入 http://<driver-ip>:4040 进入 Spark 的任务 UI 界面,查看各任务的信息
pyspark-rdd
参数preservesPartitioning表示是否保留父RDD的partitioner分区信息
map
map(f, preservesPartitioning=False)
Return a new RDD by applying a function to each element of this RDD.
# map: build one (value, value cubed) pair per element.
x = sc.parallelize([1, 2, 3, 4])
# The lambda parameter is named `n` so it no longer shadows the RDD `x`.
y = x.map(lambda n: (n, n ** 3))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(y.collect())
[(1, 1), (2, 8), (3, 27), (4, 64)]
flatMap
flatMap(f, preservesPartitioning=False)
Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.
# flatMap: every element produces a 3-tuple, and the tuples are flattened
# into a single RDD of values.
z = x.flatMap(lambda n: (n, 100 * n, n ** 2))
print(z.collect())
[1, 100, 1, 2, 200, 4, 3, 300, 9, 4, 400, 16]
glom
glom()
Return an RDD created by coalescing all elements within each partition into a list.
# Two partitions requested, so glom() yields one list per partition.
rdd = sc.parallelize([1, 2, 3, 4], 2)
print(sorted(rdd.glom().collect()))
[[1, 2], [3, 4]]
mapPartitions
mapPartitions(f, preservesPartitioning=False)
Return a new RDD by applying a function to each partition of this RDD.
x = sc.parallelize([1, 2, 3, 4], 2)


def f(iter):
    """Yield the sum of the elements of a single partition."""
    yield sum(iter)


y = x.mapPartitions(f)
# glom() gathers the elements of each partition into one list, which makes
# the partition layout visible in the collected result.
print('x原来分区信息:{0}'.format(x.glom().collect()))
print('x经过f计算后的结果:{}'.format(y.glom().collect()))
x原来分区信息:[[1, 2], [3, 4]]
x经过f计算后的结果:[[3], [7]]
mapPartitionsWithIndex
mapPartitionsWithIndex(f, preservesPartitioning=False)
Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition.
x = sc.parallelize([1, 2, 3, 4], 2)


def f(splitIndex, iterator):
    """Yield a (partition index, sum of that partition's elements) pair."""
    yield (splitIndex, sum(iterator))


y = x.mapPartitionsWithIndex(f)
# glom() shows the result grouped by partition.
print('x原来分区信息:{0}'.format(x.glom().collect()))
print('x经过f计算后的结果:{}'.format(y.glom().collect()))
x原来分区信息:[[1, 2], [3, 4]]
x经过f计算后的结果:[[(0, 3)], [(1, 7)]]
getNumPartitions
getNumPartitions()
Returns the number of partitions in RDD
# The second argument to parallelize() requests two partitions.
rdd = sc.parallelize([1, 2, 3, 4], 2)
print('分区有{}个'.format(rdd.getNumPartitions()))
分区有2个
filter
filter(f)
Return a new RDD containing only the elements that satisfy a predicate.
rdd = sc.parallelize([1, 2, 3, 4, 5])
# Keep only the even elements.
res = rdd.filter(lambda n: n % 2 == 0).collect()
print('符合条件的数据是:{}'.format(res))
符合条件的数据是:[2, 4]
distinct
distinct(numPartitions=None)
Return a new RDD containing the distinct elements in this RDD.
# The collected result is sorted before printing so the output is stable.
res = sorted(sc.parallelize([1, 1, 1, 2, 3, 2, 3]).distinct().collect())
print('去重后的结果:{}'.format(res))
去重后的结果:[1, 2, 3]
sample
sample(withReplacement, fraction, seed=None)
Return a sampled subset of this RDD.
Parameters:
withReplacement – can elements be sampled multiple times (replaced when sampled out)
fraction – expected size of the sample as a fraction of this RDD’s size. Without replacement: probability that each element is chosen; fraction must be in [0, 1]. With replacement: expected number of times each element is chosen; fraction must be >= 0.
seed – seed for the random number generator
rdd = sc.parallelize(range(7), 2)
# Five independent samples, without replacement, with fraction 0.5.
# No seed is passed, so the samples differ from run to run.
samList = [rdd.sample(False, 0.5) for _ in range(5)]
print('rdd.collect()的值是{}'.format(rdd.collect()))
# sample() returns an RDD, so each sample is collected before printing.
for index, d in enumerate(samList):
    print('sample: {0} y = {1}'.format(index, d.collect()))
takeSample
takeSample(withReplacement, num, seed=None)
Return a fixed-size sampled subset of this RDD.
Note that this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver’s memory.
rdd = sc.parallelize(range(15), 2)
# Five fixed-size samples of 4 elements each, without replacement.
samList = [rdd.takeSample(False, 4) for _ in range(5)]
# NOTE: the label says rdd.collect(), but the value printed is
# rdd.glom().collect(), i.e. one list per partition.
print('rdd.collect()的值是{}'.format(rdd.glom().collect()))
# The samples are printed directly, with no collect() call.
for index, d in enumerate(samList):
    print('sample: {0} y = {1}'.format(index, d))
rdd.collect()的值是[[0, 1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12, 13, 14]]
sample: 0 y = [8, 9, 7, 2]
sample: 1 y = [12, 1, 10, 4]
sample: 2 y = [8, 12, 2, 6]
sample: 3 y = [9, 8, 12, 14]
sample: 4 y = [10, 4, 8, 2]
union
union(other)
Return the union of this RDD and another one.
rdd = sc.parallelize([1, 1, 2, 3])
rdd1 = sc.parallelize([5, 3, 4, 6])
# Print every element of the union of the two RDDs.
print(rdd.union(rdd1).collect())
[1, 1, 2, 3, 5, 3, 4, 6]
intersection
intersection(other)
Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.
Note that this method performs a shuffle internally.
rdd = sc.parallelize([1, 1, 2, 3])
rdd1 = sc.parallelize([5, 3, 4, 6])
# Only the elements present in both RDDs are printed.
print(rdd.intersection(rdd1).collect())
[3]
sortByKey
sortByKey(ascending=True, numPartitions=None, keyfunc=lambda x: x)
Sorts this RDD, which is assumed to consist of (key, value) pairs.
tmp = [('a', 1), ('f', 2), ('d', 3), ('c', 4), ('b', 5)]
rdd = sc.parallelize(tmp, 2)
# Show how the unsorted pairs are laid out across the two partitions.
print(rdd.glom().collect())
# The same ascending sort, written to 1 and then to 3 output partitions.
# Keyword arguments replace the opaque positional `True, 1` / `True, 3`.
sort1 = rdd.sortByKey(ascending=True, numPartitions=1).glom().collect()
sort2 = rdd.sortByKey(ascending=True, numPartitions=3).glom().collect()
print(sort1)
print(sort2)
[[('a', 1), ('f', 2)], [('d', 3), ('c', 4), ('b', 5)]]
[[('a', 1), ('b', 5), ('c', 4), ('d', 3), ('f', 2)]]
[[('a', 1), ('b', 5)], [('c', 4), ('d', 3)], [('f', 2)]]