from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('df_action').getOrCreate()
pairs = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")]
df = spark.createDataFrame(pairs, ['number', 'name'])
triplets = [("alex", "Ames", 20),
("alex", "Sunnyvale", 30),
("alex", "Cupertino", 40),
("mary", "Ames", 35),
("mary", "Stanford", 45),
("mary", "Campbell", 55),
("jeff", "Ames", 60),
("jeff", "Sunnyvale", 70),
("jane", "Austin", 80)]
df1 = spark.createDataFrame(triplets, ["name", "city", "age"])
df.describe().show()
"""
+-------+-----------------+----+
|summary| number|name|
+-------+-----------------+----+
| count| 9| 9|
| mean|5.222222222222222|null|
| stddev|3.073181485764296|null|
| min| 1| z1|
| max| 10| z9|
+-------+-----------------+----+
"""
df1_drop = df1.drop('city')
df1_drop.show()
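Only the city column is removed; the remaining name and age values are unchanged. Expected output:
"""
+----+---+
|name|age|
+----+---+
|alex| 20|
|alex| 30|
|alex| 40|
|mary| 35|
|mary| 45|
|mary| 55|
|jeff| 60|
|jeff| 70|
|jane| 80|
+----+---+
"""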
The join method is similar to pandas' merge.
df1.join(df, df1.name == df.name, 'cross')
# join 方式
# inner, cross, outer, full, full_outer, left,
# left_outer, right, right_outer, left_semi,
# and left_anti
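As a quick illustration of two of these modes, the sketch below uses a small hypothetical cities DataFrame: left_semi keeps only the df1 rows whose city has a match, while left_anti keeps only the rows without a match.
cities = spark.createDataFrame([("Ames",), ("Sunnyvale",)], ["city"])

# left_semi: matching rows of df1, with df1's columns only
df1.join(cities, on="city", how="left_semi").show()
# left_anti: rows of df1 whose city does not appear in cities
df1.join(cities, on="city", how="left_anti").show()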
The same can be done the SQL way.
df.createOrReplaceTempView('df_name')
spark.sql('select name, count(1) from df_name group by name').show()
df_withcol = df1.withColumn('age2', df1.age + 2)
df_withcol.show()
# the same result can also be obtained with SQL
df1.createOrReplaceTempView('df1_name')
spark.sql('select *, age + 2 as age2 from df1_name').show()
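For derived columns that need more than simple arithmetic, pyspark.sql.functions provides helpers such as when/otherwise. A minimal sketch (the age_group column name is made up for illustration):
from pyspark.sql import functions as F

df1.withColumn('age_group',
               F.when(F.col('age') < 40, 'young').otherwise('senior')).show()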
def tokenize(record):
    # keep only tokens longer than 2 characters
    tokens = record.split(' ')
    mylist = []
    for word in tokens:
        if len(word) > 2:
            mylist.append(word)
    return mylist
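A quick check of the helper on one of the strings used below:
print(tokenize('fox jumped of fence'))   # ['fox', 'jumped', 'fence'] -- 'of' is too short and is dropped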
b = [('p', 50), ('x', 60), ('y', 70), ('z', 80) ]
a = [('a', 2), ('b', 3), ('c', 4)]
rdd_a = spark.sparkContext.parallelize(a)
rdd_b = spark.sparkContext.parallelize(b)
Cartesian product
cart = rdd_a.cartesian(rdd_b)
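cartesian pairs every element of rdd_a with every element of rdd_b, so cart has 3 x 4 = 12 tuples (the exact order depends on partitioning):
print(cart.count())     # 12
print(cart.collect())
# e.g. [(('a', 2), ('p', 50)), (('a', 2), ('x', 60)), ..., (('c', 4), ('z', 80))]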
rdd_f = rdd_b.filter(lambda x: x[1] > 55)
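filter keeps only the pairs whose value exceeds 55:
print(rdd_f.collect())   # [('x', 60), ('y', 70), ('z', 80)]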
list_of_strings = ['of', 'a fox jumped',
'fox jumped of fence', 'a foxy fox jumped high']
rdd_flat = spark.sparkContext.parallelize(list_of_strings)
rdd_flated = rdd_flat.flatMap(lambda rec: tokenize(rec))
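flatMap applies tokenize to every string and flattens the per-record lists into a single RDD of words:
print(rdd_flated.collect())
# ['fox', 'jumped', 'fox', 'jumped', 'fence', 'foxy', 'fox', 'jumped', 'high']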
source_pairs = [(1, "u"), (1, "v"), (2, "a"), (3, "b"), (4, "z1")]
source = spark.sparkContext.parallelize(source_pairs)
other_pairs = [(1, "x"), (1, "y"), (2, "c"), (2, "d"), (3, "m"), (8, "z2")]
other = spark.sparkContext.parallelize(other_pairs)
joined = source.join(other)
# [(1, ('u', 'x')), (1, ('u', 'y')), (1, ('v', 'x')), (1, ('v', 'y')), (2, ('a', 'c')), (2, ('a', 'd')), (3, ('b', 'm'))]
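The inner join above drops key 4 (present only in source) and key 8 (present only in other). A leftOuterJoin keeps unmatched keys from the left side, padding the right side with None:
left_joined = source.leftOuterJoin(other)
print(left_joined.collect())
# key 4 survives as (4, ('z1', None)); key 8 still disappears because it only exists in `other`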
numbers = ["10,20,3,4",
"3,5,6,30,7,8",
"4,5,6,7,8",
"3,9,10,11,12",
"6,7,13",
"5,6,7,12",
"5,6,7,8,9,10",
"11,12,13,14,15,16,17"]
# len(numbers) == 8, but we ask for 10 partitions, so some partitions are empty
rdd_em = spark.sparkContext.parallelize(numbers, 10)
def min_max_count(iterator):
    # compute (min, max, count) over all numbers seen in one partition
    try:
        n = 0
        for ite_i in iterator:
            n += 1
            numbers = ite_i.split(",")
            # convert strings to integers
            numbers = list(map(int, numbers))
            print(numbers)
            if n == 1:
                local_min = min(numbers)
                local_max = max(numbers)
                local_count = len(numbers)
            else:  # the partition holds more than one record: merge into the running values
                if local_min > min(numbers):
                    local_min = min(numbers)
                if local_max < max(numbers):
                    local_max = max(numbers)
                local_count += len(numbers)
        return [(local_min, local_max, local_count)]
    except:  # empty partition: local_min/local_max/local_count were never assigned
        # return a sentinel WHERE min > max so it can be filtered out later
        return [(1, -1, 0)]

# mapPartitions runs min_max_count once per partition
min_max_count_rdd = rdd_em.mapPartitions(min_max_count)
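The per-partition triples can then be reduced to one global (min, max, count). A minimal sketch that first filters out the (1, -1, 0) sentinel from empty partitions (its min is greater than its max):
valid = min_max_count_rdd.filter(lambda t: t[0] <= t[1])
final = valid.reduce(lambda a, b: (min(a[0], b[0]), max(a[1], b[1]), a[2] + b[2]))
print(final)   # (3, 30, 40) for the eight input strings above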
pairs = [(10, "z1"), (1, "z2"), (2, "z3"), (9, "z4"), (3, "z5"), (4, "z6"), (5, "z7"), (6, "z8"), (7, "z9")]
rdd_st = spark.sparkContext.parallelize(pairs)
print('sorted by value, descending:', rdd_st.sortBy(lambda x: x[1], ascending=False).collect())
print('sorted by key, descending:', rdd_st.sortByKey(ascending=False).collect())
top3 = rdd_st.takeOrdered(3, key=lambda x: -x[0])  # returns a list; the negated key yields the 3 largest keys
print("top3 = ", top3)
Reference: https://github.com/mahmoudparsian/pyspark-algorithms