- 1. RDD operators
-
- 1.1 File <=> RDD object
- 1.2 map, foreach, mapPartitions, foreachPartition
- 1.3 flatMap: map, then flatten the nesting
- 1.4 reduceByKey, reduce, fold: grouped aggregation
- 1.5 mapValues: map over the values of key-value pairs
- 1.6 groupBy, groupByKey
- 1.7 filter, distinct: filtering
- 1.8 union: merging
- 1.9 join, leftOuterJoin, rightOuterJoin: joins
- 1.10 intersection: set intersection
- 1.11 sortBy, sortByKey: sorting
- 1.12 countByKey: count occurrences of each key
- 1.13 first, take, top, count: fetching elements
- 1.14 takeOrdered: sort and take the first n
- 1.15 takeSample: random sampling
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName('test')\
.setMaster('local[*]')
sc = SparkContext(conf=conf)
1. RDD operators
1.1 File <=> RDD object
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)
print(rdd.glom().collect(), rdd.getNumPartitions())  # glom() groups elements by partition: [[1, 2], [3, 4], [5, 6]] 3
rdd = sc.textFile("./data.csv")
print(rdd.collect())
rdd = sc.parallelize([1, 2, 3], 3)
rdd.saveAsTextFile('./output')
'''
Creates an ./output directory
containing one file per partition
'''
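'''
As a quick check (a minimal sketch, assuming the ./output directory
written above): textFile can read a saved directory back, loading every
part file inside it. Note the lines come back as strings.
'''
rdd = sc.textFile('./output')
print(rdd.collect())  # e.g. ['1', '2', '3']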
1.2 map, foreach, mapPartitions, foreachPartition
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)
rdd2 = rdd.map(lambda x: (x, 1))
print(rdd2.map(lambda x: x[0] + x[1]).collect())
rdd = sc.parallelize([1, 2, 3])
rdd.foreach(lambda x: print(x))
rdd.foreach(lambda x: -x)  # foreach has no return value; the lambda's results are discarded
rdd.collect()  # and the source RDD is unchanged: [1, 2, 3]
'''
map hands elements to the function one at a time, so there are many
I/O round trips; mapPartitions hands over a whole partition's elements
in a single call.
'''
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)
def func(partition):
    res = list()
    for it in partition:
        res.append(it * 10)
    return res
rdd.mapPartitions(func).collect()
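'''
The heading also names foreachPartition, which works like foreach but
hands the function one whole partition's iterator per call. A minimal
sketch: in local mode the prints show up in the driver console; on a
cluster they go to the executors' stdout.
'''
def show(partition):
    print(list(partition))
rdd.foreachPartition(show)  # e.g. prints [1, 2], [3, 4], [5, 6]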
1.3 flatMap: map, then flatten the nesting
rdd = sc.textFile("./data.csv")
print(rdd.collect())
rdd.flatMap(lambda x: x.split(' ')).collect()
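'''
A self-contained version (with hypothetical input, since the contents
of data.csv are not shown here): each line maps to a list of words,
and flatMap flattens those lists into a single RDD of words.
'''
rdd = sc.parallelize(['hello spark', 'hello world'])
print(rdd.flatMap(lambda x: x.split(' ')).collect())
# ['hello', 'spark', 'hello', 'world']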
1.4 reduceByKey, reduce, fold: grouped aggregation
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
print(rdd.reduceByKey(lambda a, b: a + b).collect())
rdd = sc.parallelize(range(1, 3))
print(rdd.reduce(lambda a, b: a + b))
print(sc.parallelize([('a', 1), ('a', 1)]).reduce(lambda a, b: a + b))  # tuples concatenate: ('a', 1, 'a', 1)
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)
print(rdd.fold(10, lambda a, b: a + b))
'''
Partitions: [[1, 2], [3, 4], [5, 6]]
10 + 1 + 2 = 13
10 + 3 + 4 = 17
10 + 5 + 6 = 21
10 + 13 + 17 + 21 = 61
=> 61
The zero value 10 is applied once per partition and once more when the
partial results are merged, hence four extra 10s in total.
'''
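'''
The classic combination of these operators (an illustrative sketch,
not from the original notes): flatMap splits lines into words, map
pairs each word with 1, and reduceByKey sums the counts per word.
'''
lines = sc.parallelize(['a b a', 'b c'])
counts = lines.flatMap(lambda s: s.split(' '))\
              .map(lambda w: (w, 1))\
              .reduceByKey(lambda a, b: a + b)
print(counts.collect())  # e.g. [('a', 2), ('b', 2), ('c', 1)] (order may vary)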
1.5 mapValues: map over the values of key-value pairs
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
rdd.mapValues(lambda x: x * 10).collect()
1.6 groupBy, groupByKey
- Differences between groupBy, groupByKey, and reduceByKey (see the sketch at the end of this section)
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3), ('b', 4)])
rdd2 = rdd.groupBy(lambda x: x[0])
print(rdd2.collect())
'''
Each group's value comes back as an iterable, which needs a further
conversion before it can be inspected:
[('a', <pyspark.resultiterable.ResultIterable object at 0x...>),
 ('b', <pyspark.resultiterable.ResultIterable object at 0x...>)]
'''
rdd3 = rdd2.map(lambda x: (x[0], list(x[1])))
print(rdd3.collect())
'''
[('a', [('a', 1), ('a', 2)]),
('b', [('b', 3), ('b', 4)])]
'''
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3), ('b', 4)])
rdd2 = rdd.groupByKey()
rdd2.map(lambda x: (x[0], list(x[1]))).collect()
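'''
On the difference noted above (a brief sketch): groupBy keeps the whole
element in each group, groupByKey groups only the values under each
key, and reduceByKey aggregates while grouping -- it combines values
map-side before the shuffle, so for aggregations it moves less data
than groupByKey followed by a reduction.
'''
print(rdd.groupByKey().mapValues(sum).collect())      # e.g. [('a', 3), ('b', 7)]
print(rdd.reduceByKey(lambda a, b: a + b).collect())  # same result, smaller shuffle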
1.7 filter, distinct: filtering
rdd = sc.parallelize([1, 2, 3, 4, 5])
rdd.filter(lambda x: x > 3).collect()
rdd = sc.parallelize([1, 1, 1, 1, 2, 3, 'a', 'a'])
rdd.distinct().collect()
1.8 union: merging
rdd_a = sc.parallelize([1, 1, 2, 3])
rdd_b = sc.parallelize([2, 3, ('a', 1), ('b', 2)])
rdd_a.union(rdd_b).collect()
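'''
union keeps duplicates and allows mixed element types; chaining
distinct deduplicates the merged result.
'''
rdd_a.union(rdd_b).distinct().collect()  # e.g. [1, 2, 3, ('a', 1), ('b', 2)] (order may vary)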
1.9 join, leftOuterJoin, rightOuterJoin: joins
rdd_a = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
rdd_b = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
print(rdd_a.join(rdd_b).collect())
'''
Inner join: keeps only keys present on both sides
[('b', (3, 2)),
 ('a', (1, 1)),
 ('a', (2, 1))]
'''
print(rdd_a.leftOuterJoin(rdd_b).collect())
'''
Left outer join: the intersection plus every key from the left side
(here every left key also exists on the right, so the result matches
the inner join)
[('b', (3, 2)),
 ('a', (1, 1)),
 ('a', (2, 1))]
'''
print(rdd_a.rightOuterJoin(rdd_b).collect())
'''
Right outer join: the intersection plus every key from the right side;
keys missing on the left get None
[('b', (3, 2)),
 ('c', (None, 3)),
 ('a', (1, 1)),
 ('a', (2, 1))]
'''
1.10 intersection: set intersection
rdd_a = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
rdd_b = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
rdd_a.intersection(rdd_b).collect()
1.11 sortBy, sortByKey: sorting
rdd = sc.parallelize([[1, 2, 3],
[7, 8, 9],
[4, 5, 6]])
rdd.sortBy(lambda x: x[1], ascending=True, numPartitions=3).collect()
'''
[[1, 2, 3],
[4, 5, 6],
[7, 8, 9]]
'''
'''
ascending: True for ascending, False for descending
numPartitions: set to 1 for a globally ordered result; otherwise order
is only guaranteed within each partition
keyfunc: transforms the key before comparison (sortByKey)
'''
rdd = sc.parallelize([('a', 1), ('c', 2), ('B', 3)])
print(rdd.sortByKey(ascending=True, numPartitions=1).collect())
'''
[('B', 3), ('a', 1), ('c', 2)]
'''
print(rdd.sortByKey(ascending=True, numPartitions=1, keyfunc=lambda k: str(k).lower()).collect())
'''
[('a', 1), ('B', 3), ('c', 2)]
'''
1.12 countByKey: count occurrences of each key
rdd = sc.parallelize([('a', 1, 2), ('a'), ('b', 1)])  # note: ('a') is just the string 'a'
print(rdd.countByKey())  # counts x[0] of each element: {'a': 2, 'b': 1} (returned as a defaultdict on the driver)
1.13 first, take, top, count: fetching elements
rdd = sc.parallelize([('a', 1, 2), ('a'), ('b', 1)])
print(rdd.first())  # ('a', 1, 2)
print(rdd.take(2))  # [('a', 1, 2), 'a']
print(rdd.count())  # 3
rdd = sc.parallelize([2, 4, 1, 6])
print(rdd.top(2))  # [6, 4] -- top takes the n largest, descending
1.14 takeOrdered: sort and take the first n
'''
param1: n
param2: a key function applied to each element before comparison; the
        elements themselves are not changed
With no key function the default is ascending order (the n smallest).
func = lambda x: -x flips the order (the n largest, same as top).
'''
rdd = sc.parallelize([2, 4, 1, 6])
rdd.takeOrdered(2)  # [1, 2]
rdd.takeOrdered(2, lambda x: -x)  # [6, 4]
1.15 takeSample: random sampling
'''
param1: True for sampling with replacement, False for sampling without
param2: number of samples to draw
param3: random seed
'''
rdd = sc.parallelize([1])
rdd.takeSample(True, 2)  # with replacement the same element can repeat: [1, 1]
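'''
Without replacement the sample size is capped at the number of
elements in the RDD, so asking for 2 here returns only one.
'''
rdd.takeSample(False, 2)  # [1]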