Python Spark Basics -- RDD Transformations

RDD transformations

Basic operations on RDDs

RDD flatMap, map, filter

Preparing the dataset

import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
# Create the conf and the SparkContext
conf = SparkConf().setAppName('myApp').setMaster('local')
sc = SparkContext(conf=conf)
# Build the source RDD
lines = sc.parallelize(['hello','spark','hello','!'])
print(lines.collect())

Output of the lines dataset: ['hello', 'spark', 'hello', '!']

flatMap

flatMap applies a function to each element and flattens the results into a single, one-level RDD.

# rdd.flatMap
rdd_flatMap = lines.flatMap(lambda line: line.split(' '))
print(rdd_flatMap.collect())

Output of rdd_flatMap: ['hello', 'spark', 'hello', '!']
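The difference from map is easiest to see on multi-word strings: map produces exactly one output element per input, while flatMap flattens the resulting lists. A minimal sketch reusing the sc from above (the phrases data is made up for illustration):

# map keeps one output per input element (here, a list per string)
phrases = sc.parallelize(['hello spark', 'hello world'])
print(phrases.map(lambda s: s.split(' ')).collect())
# [['hello', 'spark'], ['hello', 'world']]
# flatMap flattens those lists into a single level
print(phrases.flatMap(lambda s: s.split(' ')).collect())
# ['hello', 'spark', 'hello', 'world']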

map

# rdd.map
rdd_map = lines.map(lambda word: (word, 1))
rdd_map.foreach(print)
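Note that foreach(print) runs on the executors; in local mode the pairs show up in the same console, but on a real cluster they would not reach the driver. A safer way to inspect a small result, as a sketch, is collect():

# Gather the (word, 1) pairs back to the driver for inspection
print(rdd_map.collect())
# [('hello', 1), ('spark', 1), ('hello', 1), ('!', 1)]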

filter

# rdd.filter
rdd_filter = lines.filter(lambda word: 'hello' in word)
rdd_filter.foreach(print)
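Since all of these are lazy transformations, they compose naturally into a pipeline. A sketch chaining the three operations above on the same lines RDD:

# split -> keep words containing 'hello' -> pair each word with 1
chained = (lines.flatMap(lambda line: line.split(' '))
                .filter(lambda word: 'hello' in word)
                .map(lambda word: (word, 1)))
print(chained.collect())
# [('hello', 1), ('hello', 1)]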

RDD distinct, union, intersection, subtract

rdd1 = sc.parallelize(['coffe','coffe','panadas','tea'])
rdd1.foreach(print)
rdd2 = sc.parallelize(['monkey','kitty','coffe'])
# distinct
rdd_distinct = rdd1.distinct()
rdd_distinct.foreach(print)
# union: all elements from both RDDs (duplicates are kept)
rdd_union = rdd1.union(rdd2)
rdd_union.foreach(print)
# intersection: elements common to both RDDs
rdd_inter = rdd1.intersection(rdd2)  # type: RDD
print(rdd_inter.collect())
# subtract: elements of rdd1 that are not in rdd2
rdd_sub = rdd1.subtract(rdd2)  # type: RDD
print(rdd_sub.collect())
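Two details worth noting: subtract depends on operand order, and union does not deduplicate. A short sketch (element order in the output may vary):

# Swapping the operands changes the result of subtract
print(rdd2.subtract(rdd1).collect())  # 'monkey' and 'kitty'
# union keeps duplicates; chain distinct() to drop them
print(rdd_union.distinct().collect())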
