@RDD transformations
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext, RDD
# Create the SparkConf and SparkContext
conf = SparkConf().setAppName('myApp').setMaster('local')
sc = SparkContext(conf=conf)
# Create the RDD
lines = sc.parallelize(['hello','spark','hello','!'])
print(lines.collect())
flatMap flattens the split results into a single flat sequence of elements
# rdd.flatMap
rdd_flatMap = lines.flatMap(lambda line: line.split(' '))
print(rdd_flatMap.collect())
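Note that each element of lines above is already a single token, so splitting yields one-element lists and flatMap looks just like map here. A minimal sketch with made-up multi-word strings makes the flattening visible:
# Illustrative data (not from the original): compare map vs flatMap on multi-word strings
sentences = sc.parallelize(['hello spark', 'hello world'])
print(sentences.map(lambda line: line.split(' ')).collect())      # [['hello', 'spark'], ['hello', 'world']]
print(sentences.flatMap(lambda line: line.split(' ')).collect())  # ['hello', 'spark', 'hello', 'world']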
# rdd.map
rdd_map = lines.map(lambda word: (word, 1))
rdd_map.foreach(print)
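The (word, 1) pairs above are the usual first step of a word count; as a hedged follow-up sketch, reduceByKey (another lazy transformation) sums the counts per word:
# Sum the 1s per key to get word counts
rdd_counts = rdd_map.reduceByKey(lambda a, b: a + b)
print(rdd_counts.collect())  # e.g. [('hello', 2), ('spark', 1), ('!', 1)]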
# rdd.filter
rdd_filter = lines.filter(lambda word: 'hello' in word)
rdd_filter.foreach(print)
rdd1 = sc.parallelize(['coffee', 'coffee', 'pandas', 'tea'])
rdd1.foreach(print)
rdd2 = sc.parallelize(['monkey', 'kitty', 'coffee'])
# distinct
rdd_distinct = rdd1.distinct()
rdd_distinct.foreach(print)
# union: combine both RDDs (set union, duplicates are kept)
rdd_union = rdd1.union(rdd2)
rdd_union.foreach(print)
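Unlike SQL's UNION, the RDD union keeps duplicates; if deduplicated output is wanted, chain distinct() onto the result:
# Deduplicate the union result (only if duplicates are unwanted)
print(rdd_union.distinct().collect())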
# intersection: elements present in both RDDs (set intersection)
rdd_inter = rdd1.intersection(rdd2)  # type: RDD
print(rdd_inter.collect())
# subtract: set difference, the elements of rdd1 that do not appear in rdd2
rdd_sub = rdd1.subtract(rdd2)  # type: RDD
print(rdd_sub.collect())
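All of the calls above (flatMap, map, filter, distinct, union, intersection, subtract) are lazy transformations; the collect() and foreach() actions are what trigger the computation. A minimal cleanup at the end of the script, assuming nothing else still needs the context:
# Release the SparkContext resources once all jobs have finished
sc.stop()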