PySpark RDD Operations

from pyspark import SparkContext, SparkConf


conf = SparkConf().setAppName("rdd_trans")
sc = SparkContext(conf=conf)
print(sc.version)

# Create an RDD
rdd = sc.parallelize([2, 3, 4, 5, 6, 6, 6, 6, 6])
rdd1 = rdd.map(lambda x: x + 1)
print(rdd1.collect())
# [3, 4, 5, 6, 7, 7, 7, 7, 7]

# Transformations
# filter
rdd2 = rdd.filter(lambda x: x % 2 == 0)
print(rdd2.collect())
# [2, 4, 6, 6, 6, 6, 6]

# flatMap
rdd3 = rdd.flatMap(lambda x: (x, x % 2 == 0))
print(rdd3.collect())
# [2, True, 3, False, 4, True, 5, False, 6, True, 6, True, 6, True, 6, True, 6, True]
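# For comparison (illustrative sketch, rdd3_map is just a new name): map with the
# same lambda keeps each (element, is_even) pair nested instead of flattening it.
rdd3_map = rdd.map(lambda x: (x, x % 2 == 0))
print(rdd3_map.collect())
# [(2, True), (3, False), (4, True), (5, False), (6, True), (6, True), (6, True), (6, True), (6, True)]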

# distinct
rdd4 = rdd.distinct().collect()
print(rdd4)
# [4, 5, 2, 6, 3]

# sample
# False = sample without replacement, 0.6 = sampling fraction, 666 = random seed
rdd5 = rdd.sample(False, 0.6, 666).collect()
print(rdd5)
# a random subset of the original elements; the exact contents depend on the seed
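# Sketch of the with-replacement variant: passing True as the first argument lets
# the same element be drawn more than once (rdd5_rep is an illustrative name).
rdd5_rep = rdd.sample(True, 0.6, 666).collect()
print(rdd5_rep)
# a random multiset drawn from rdd; again, the exact contents depend on the seed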

# leftOuterJoin / rightOuterJoin (note: rdd1 and rdd2 are re-bound to key-value RDDs here)
rdd1 = sc.parallelize([("a", 1), ("b", 10), ("c", 3)])
rdd2 = sc.parallelize([("a", 2), ("c", 4), ("b", 5), ("d", 4)])


rdd3 = rdd1.leftOuterJoin(rdd2)
print(rdd3.collect())
# [('a', (1, 2)), ('b', (10, 5)), ('c', (3, 4))]


rdd4 = rdd2.leftOuterJoin(rdd1).collect()
print(rdd4)
# [('a', (2, 1)), ('c', (4, 3)), ('b', (5, 10)), ('d', (4, None))]


rdd5 = rdd1.rightOuterJoin(rdd2)
print(rdd5.collect())
# [('a', (1, 2)), ('b', (10, 5)), ('c', (3, 4)), ('d', (None, 4))]
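# For contrast, a plain (inner) join keeps only keys present in both RDDs, so 'd'
# is dropped and no None values appear (illustrative sketch; rdd6 is a new name).
rdd6 = rdd1.join(rdd2)
print(rdd6.collect())
# [('a', (1, 2)), ('b', (10, 5)), ('c', (3, 4))]  (order may vary)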



# Actions
# 2. Actions: transformations are lazy, so nothing is computed until an action is called
print("rdd1.count")
print(rdd1.count())
print("rdd1.take2")
print(rdd1.take(2))
print("rdd1.first")
print(rdd1.first())
print("rdd1.top")
print(rdd1.top(3))
print("takeSample 随机取两个数")
print(rdd1.takeSample(False, 2, 666))
rdd1.saveAsTextFile("/path/1.txt")
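# saveAsTextFile writes a directory of part files at the given path; a sketch of
# reading it back with textFile (reusing the placeholder path from above):
rdd_back = sc.textFile("/path/1.txt")
print(rdd_back.collect())
# each saved element comes back as one line of text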


print("x+y is")
print(rdd1.reduce(lambda x, y: x + y))
# ('a', 1, 'b', 10, 'c', 3)
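# The flattened tuple above appears because + on tuples means concatenation.
# On a numeric RDD the same lambda simply sums the elements (sketch with a
# throwaway RDD named nums, not part of the original example):
nums = sc.parallelize([2, 3, 4, 5, 6])
print(nums.reduce(lambda x, y: x + y))
# 20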


print("foreach")


print(rdd1.foreach(print))
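# The print inside foreach runs on the executors, so in cluster mode its output
# goes to the executor logs, not the driver console. A minimal sketch for
# printing the elements on the driver instead:
for item in rdd1.collect():
    print(item)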
