from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("rdd_trans").setMaster("local[*]")  # local master so the example runs without spark-submit
sc = SparkContext(conf=conf)
print(sc.version)
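# Create a base RDD from a local Python list; the transformations below are lazy until an action runs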
rdd = sc.parallelize([2, 3, 4, 5, 6, 6, 6, 6, 6])
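# map: apply a function to every element (here, add 1)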
rdd1 = rdd.map(lambda x: x + 1)
print(rdd1.collect())
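# filter: keep only the elements for which the predicate is True (even numbers)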
rdd2 = rdd.filter(lambda x: x % 2 == 0)
print(rdd2.collect())
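# flatMap: the returned tuple is flattened, so the result is each element followed by its parity flag (True if even)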
rdd3 = rdd.flatMap(lambda x: (x, x % 2 == 0))
print(rdd3.collect())
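# distinct: drop duplicate elements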
rdd4 = rdd.distinct().collect()
print(rdd4)
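# sample: draw roughly 60% of the elements without replacement, using seed 666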
rdd5 = rdd.sample(False, 0.6, 666).collect()
print(rdd5)
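# Pair RDDs (key-value tuples) for the join examples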
rdd1 = sc.parallelize([("a", 1), ("b", 10), ("c", 3)])
rdd2 = sc.parallelize([("a", 2), ("c", 4), ("b", 5), ("d", 4)])
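# leftOuterJoin: keep every key from the left RDD; keys missing on the right are paired with None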
rdd3 = rdd1.leftOuterJoin(rdd2)
print(rdd3.collect())
rdd4 = rdd2.leftOuterJoin(rdd1).collect()
print(rdd4)
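# rightOuterJoin: keep every key from the right RDD; keys missing on the left are paired with None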
rdd5 = rdd1.rightOuterJoin(rdd2)
print(rdd5.collect())
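# Actions: count, take, first, top, takeSample, reduce, and foreach trigger computation and return results to the driver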
print("rdd1.count")
print(rdd1.count())
print("rdd1.take2")
print(rdd1.take(2))
print("rdd1.first")
print(rdd1.first())
print("rdd1.top")
print(rdd1.top(3))
print("takeSample 随机取两个数")
print(rdd1.takeSample(False, 2, 666))
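# saveAsTextFile writes a directory of part files at the given path (a placeholder here); it fails if the path already exists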
rdd1.saveAsTextFile("/path/1.txt")
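# reduce with + on this pair RDD concatenates the key-value tuples rather than summing numbers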
print("x+y is")
print(rdd1.reduce(lambda x, y: x + y))
print("foreach")
rdd1.foreach(print)  # foreach runs on the executors and returns None, so there is nothing useful to print on the driver
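# Release cluster resources when finished
sc.stop()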