Spark Practice
pyspark => Spark 2.4, local
Code skeleton
from pyspark import SparkConf, SparkContext

# Create a SparkConf: this holds the Spark configuration parameters
conf = SparkConf().setMaster("local[2]").setAppName("{}")
# Create the SparkContext
sc = SparkContext(conf=conf)
# Business logic goes here
pass
# Remember to stop the context -- it is a good habit
sc.stop()
The map operator
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[2]').setAppName('map')
    sc = SparkContext(conf=conf)
    rdd1 = sc.parallelize([1, 2, 3, 4, 5]).map(lambda x: x + 1)
    print(rdd1.collect())
    rdd2 = sc.parallelize(["dog", "tiger", "cat", "tiger", "tiger", "cat"]) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda x, y: x + y)
    print(rdd2.collect())
    sc.stop()
The flatMap operator
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[2]').setAppName('flatmap')
    sc = SparkContext(conf=conf)
    print(sc.parallelize(["hello world", "hello python", "hello spark"])
          .flatMap(lambda x: x.split(' '))
          .map(lambda x: (x, 1))
          .reduceByKey(lambda x, y: x + y)
          .collect())
    sc.stop()
The filter operator
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[2]').setAppName('filter')
    sc = SparkContext(conf=conf)
    rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9]).filter(lambda x: x > 4)
    print(rdd.collect())
    sc.stop()
The groupByKey operator
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[2]').setAppName('groupby')
    sc = SparkContext(conf=conf)
    print(sc.parallelize(["hello world", "hello python", "hello spark"])
          .flatMap(lambda x: x.split(' '))
          .map(lambda x: (x, 1))
          .groupByKey()
          .map(lambda x: {x[0]: list(x[1])})
          .collect())
    sc.stop()
The join operator
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[2]').setAppName('join')
    sc = SparkContext(conf=conf)
    a = sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('c', 4)])
    b = sc.parallelize([('a', 2), ('b', 3), ('e', 5)])
    print("a join b", a.join(b).collect())
    print("a fullOuterJoin b", a.fullOuterJoin(b).collect())
    print("a leftOuterJoin b", a.leftOuterJoin(b).collect())
    print("a rightOuterJoin b", a.rightOuterJoin(b).collect())
    sc.stop()

# a join b [('b', (2, 3)), ('a', (1, 2))]
# a fullOuterJoin b [('b', (2, 3)), ('c', (3, None)), ('c', (4, None)), ('a', (1, 2)), ('e', (None, 5))]
# a leftOuterJoin b [('b', (2, 3)), ('c', (3, None)), ('c', (4, None)), ('a', (1, 2))]
# a rightOuterJoin b [('b', (2, 3)), ('a', (1, 2)), ('e', (None, 5))]
The sortByKey operator
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[2]').setAppName('sortByKey')
    sc = SparkContext(conf=conf)
    print(sc.parallelize(["hello world", "hello python", "hello spark"])
          .flatMap(lambda x: x.split(' '))
          .map(lambda x: (x, 1))
          .reduceByKey(lambda x, y: x + y)
          # swap to (count, word) so sortByKey sorts by count, descending
          .map(lambda x: (x[1], x[0]))
          .sortByKey(False)
          # swap back to (word, count)
          .map(lambda x: (x[1], x[0]))
          .collect())
    sc.stop()
The union & distinct operators
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[2]').setAppName('union_distinct')
    sc = SparkContext(conf=conf)
    print(sc.parallelize([1, 1, 2, 3])
          .union(sc.parallelize([2, 3, 3, 4]))
          .collect())
    print(sc.parallelize([1, 1, 2, 3])
          .distinct()
          .collect())
    sc.stop()
Hands-on exercises:
1: Write a simple WordCount
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[4]').setAppName('wordCount')
    sc = SparkContext(conf=conf)
    data = sc.textFile('d:/wd/')
    output = data \
        .flatMap(lambda line: line.split(' ')) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .sortBy(lambda line: line[1], False) \
        .collect()
    # .saveAsTextFile()
    for word, count in output:
        print("{}\t{}".format(word, count))
    sc.stop()
2: Find the movies whose average rating is greater than 4
The data below is fake data I made up myself...
Data (each row appears to be: movie id, user id, rating, date)
1 1488844 3 2005-09-06
1 822109 5 2005-05-13
1 885013 4 2005-10-19
1 30878 4 2005-12-26
1 823519 3 2004-05-03
1 1422244 3 2005-04-06
1 222104 5 2005-05-13
1 225013 4 2005-10-14
1 30272 4 2005-12-26
1 223514 3 2004-05-03
5 5422244 5 2005-04-06
5 222504 5 2005-05-55
5 225055 4 2005-50-54
5 50272 4 2005-52-26
5 225554 5 2004-05-05
2 2422244 5 2005-04-06
2 222204 5 2005-05-25
2 225025 4 2005-20-24
2 50272 4 2005-22-26
2 225524 5 2004-05-05
Code
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf().setMaster('local[4]').setAppName('topN')
    sc = SparkContext(conf=conf)
    # movies with an average rating > 4
    data = sc.textFile('d:/wd/file.txt')
    print(data
          .map(lambda line: line.split(' '))
          .map(lambda line: (line[0], int(line[2])))
          .combineByKey((lambda x: (x, 1)),
                        (lambda x, y: (x[0] + y, x[1] + 1)),
                        (lambda x, y: (x[0] + y[0], x[1] + y[1])))
          .map(lambda line: (line[0], line[1][0] / line[1][1]))
          .filter(lambda line: line[1] > 4)
          .collectAsMap())
    sc.stop()
If you need the top N instead, you can use the take() operator.
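For instance, a minimal sketch, assuming the (movie_id, avg_rating) pairs produced by the pipeline above are kept in an RDD (avg_rdd is a hypothetical name I use here for illustration):

# avg_rdd is assumed to hold the (movie_id, avg_rating) pairs built above
top3 = avg_rdd.sortBy(lambda kv: kv[1], False).take(3)  # 3 highest averages
print(top3)
# takeOrdered is an alternative: avg_rdd.takeOrdered(3, key=lambda kv: -kv[1])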
The combineByKey operator:
This operator takes three arguments, all of them functions: createCombiner, mergeValue, and mergeCombiners.
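A minimal sketch of how the three functions fit together, using the same sum/count averaging as in the exercise above (the function names and sample data are my own, for illustration only):

# createCombiner: turn the first rating seen for a key into a (sum, count) pair
def create_combiner(rating):
    return (rating, 1)

# mergeValue: fold another rating from the same partition into (sum, count)
def merge_value(acc, rating):
    return (acc[0] + rating, acc[1] + 1)

# mergeCombiners: merge the (sum, count) pairs produced on different partitions
def merge_combiners(acc1, acc2):
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])

avg_rdd = (sc.parallelize([('m1', 3), ('m1', 5), ('m2', 4)])
             .combineByKey(create_combiner, merge_value, merge_combiners)
             .mapValues(lambda pair: pair[0] / pair[1]))
print(avg_rdd.collect())  # e.g. [('m1', 4.0), ('m2', 4.0)]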