from pyspark import SparkConf, SparkContext

url = 'local'
conf = SparkConf().setAppName("miniProject").setMaster(url).set("spark.sql.execution.arrow.enabled", "true")
sc = SparkContext.getOrCreate(conf)
# The simplest way to create an RDD is SparkContext's parallelize method
# Basic key-value (k-v) RDD transformations
rdd = sc.parallelize([(3, 4), (3, 6), (5, 6), (1, 2)])
rdd.collect()
[(3, 4), (3, 6), (5, 6), (1, 2)]
# Get the keys
rdd.keys().collect()
[3, 3, 5, 1]
# Get the values
rdd.values().collect()
[4, 6, 6, 2]
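# A minimal sketch (not in the original session): keys() and values() return
# ordinary RDDs, so further transformations chain naturally.
rdd.keys().distinct().collect()
# expected (order may vary): [1, 3, 5]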
# Use filter to select pairs by key
rdd.filter(lambda keyvalue: keyvalue[0] > 3).collect()
[(5, 6)]
# Use filter to select pairs by value
rdd.filter(lambda keyvalue:keyvalue[1]>3).collect()
[(3, 4), (3, 6), (5, 6)]
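# A minimal sketch (not in the original session): key and value conditions can
# be combined in a single predicate, since each element is just a (k, v) tuple.
rdd.filter(lambda kv: kv[0] >= 3 and kv[1] > 4).collect()
# expected: [(3, 6), (5, 6)]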
# Use mapValues to transform only the values, keeping the keys
rdd.mapValues(lambda x:x+100).collect()
[(3, 104), (3, 106), (5, 106), (1, 102)]
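# A minimal sketch (not in the original session): flatMapValues expands each
# value into zero or more values while keeping the key.
rdd.flatMapValues(lambda x: [x, x * 10]).collect()
# expected: [(3, 4), (3, 40), (3, 6), (3, 60), (5, 6), (5, 60), (1, 2), (1, 20)]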
# sortByKey sorts by key in ascending order
rdd.sortByKey(ascending=True).collect()
[(1, 2), (3, 4), (3, 6), (5, 6)]
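# A minimal sketch (not in the original session): pass ascending=False for
# descending order, or use sortBy to sort on the value instead of the key.
rdd.sortByKey(ascending=False).collect()
# expected: [(5, 6), (3, 4), (3, 6), (1, 2)]
rdd.sortBy(lambda kv: kv[1]).collect()
# expected: [(1, 2), (3, 4), (3, 6), (5, 6)]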
# reduceByKey merges the values of pairs with the same key (here by summing)
rdd.reduceByKey(lambda x,y:x+y).collect()
[(5, 6), (1, 2), (3, 10)]
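# A minimal sketch (not in the original session): any associative, commutative
# function works, e.g. keeping the maximum value per key.
rdd.reduceByKey(lambda x, y: max(x, y)).collect()
# expected (order may vary): [(1, 2), (3, 6), (5, 6)]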
rdd.collect()
[(3, 4), (3, 6), (5, 6), (1, 2)]
rdd.takeOrdered(1)
[(1, 2)]
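# A minimal sketch (not in the original session): takeOrdered also accepts a
# key function, e.g. to take the pairs with the largest values.
rdd.takeOrdered(2, key=lambda kv: -kv[1])
# expected (ties may vary): [(3, 6), (5, 6)]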
# Key-value transformations across multiple RDDs
rdd1=sc.parallelize([(3,4),(3,6),(5,6),(1,2)])
rdd2=sc.parallelize([(3,100),(1,200)])
# Inner join on key
rdd1.join(rdd2).collect()
[(1, (2, 200)), (3, (4, 100)), (3, (6, 100))]
# leftOuterJoin keeps every key from rdd1; unmatched pairs get None on the right
rdd1.leftOuterJoin(rdd2).collect()
[(1, (2, 200)), (3, (4, 100)), (3, (6, 100)), (5, (6, None))]
# rightOuterJoin keeps every key from rdd2
rdd1.rightOuterJoin(rdd2).collect()
[(1, (2, 200)), (3, (4, 100)), (3, (6, 100))]
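# A minimal sketch (not in the original session): fullOuterJoin keeps keys from
# both sides, padding missing matches with None.
rdd1.fullOuterJoin(rdd2).collect()
# expected (order may vary): [(1, (2, 200)), (3, (4, 100)), (3, (6, 100)), (5, (6, None))]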
# subtractByKey removes pairs whose key also appears in the other RDD
rdd1.subtractByKey(rdd2).collect()
[(5, 6)]
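# A minimal sketch (not in the original session): cogroup collects the values
# from both RDDs per key as iterables, which list() can materialize.
sorted(rdd1.cogroup(rdd2).mapValues(lambda v: (list(v[0]), list(v[1]))).collect())
# expected: [(1, ([2], [200])), (3, ([4, 6], [100])), (5, ([6], []))]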
# Key-value actions
rdd.first()
(3, 4)
rdd.take(2)
[(3, 4), (3, 6)]
rdd.collect()
[(3, 4), (3, 6), (5, 6), (1, 2)]
rdd.first()[0]
3
# Count the occurrences of each key
rdd.countByKey()
defaultdict(int, {3: 2, 5: 1, 1: 1})
type(rdd.countByKey())
collections.defaultdict
rdd.countByKey().get(3)
2
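# A minimal sketch (not in the original session): countByKey collects all
# counts to the driver; for large key sets, the distributed equivalent is a
# reduceByKey over 1s.
rdd.mapValues(lambda _: 1).reduceByKey(lambda x, y: x + y).collect()
# expected (order may vary): [(1, 1), (3, 2), (5, 1)]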
# collectAsMap builds a Python dict from the pairs; when a key appears more
# than once, only one value is kept (here key 3 ends up with 6, not 4)
kv = rdd.collectAsMap()
kv
{3: 6, 5: 6, 1: 2}
kv.get(10)  # returns None for a missing key, so no output is shown
kv.get(3)
6
rdd.collect()
[(3, 4), (3, 6), (5, 6), (1, 2)]
kv[3]
6
# lookup returns all values for the given key
rdd.lookup(3)
[4, 6]
rdd.lookup(1)
[2]
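# A minimal sketch (not in the original session): lookup is an action; the
# equivalent transformation-based form filters on the key and collects the
# values.
rdd.filter(lambda kv: kv[0] == 3).values().collect()
# expected: [4, 6]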