PySpark SQL and RDD practice

1. Spark SQL

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import math

sc = SparkContext()
ss = SparkSession(sc)
triple_rdd = sc.parallelize([['小明', '书', 12], ['小明','电影',4], ['小刚','书', 3]])
df = ss.createDataFrame(triple_rdd, ['user', 'item', 'w'])
# build a DataFrame from an RDD
df.show()
"""
+----+----+---+
|user|item|  w|
+----+----+---+
|小明|  书| 12|
|小明|电影|  4|
|小刚|  书|  3|
+----+----+---+
"""

# UDF that turns each user's total weight into wu = 1 / sqrt(sum(w))
func = F.udf(lambda x: 1/math.sqrt(x), FloatType())
df_q = df.groupBy('user').agg(F.sum('w'), F.collect_list('item'))\
         .withColumn('wu', func('sum(w)'))
df_q.show()
"""
+----+------+------------------+----------+
|user|sum(w)|collect_list(item)|        wu|
+----+------+------------------+----------+
|小明|    16|        [书, 电影]|      0.25|
|小刚|     3|              [书]|0.57735026|
+----+------+------------------+----------+
"""

df_ = df.join(df_q.select('user', 'wu'), on='user')
df_.show()
"""
+----+----+---+----------+
|user|item|  w|        wu|
+----+----+---+----------+
|小明|  书| 12|      0.25|
|小明|电影|  4|      0.25|
|小刚|  书|  3|0.57735026|
+----+----+---+----------+
"""
rdd = sc.parallelize([[3, 12], [5,4], [7, 3]])
df = ss.createDataFrame(rdd, ['col1', 'col2'])
# overwrite col2 with the element-wise sum col1 + col2
df_ = df.withColumn('col2', df.col1+df.col2)
df_.show()

"""
+----+----+
|col1|col2|
+----+----+
|   3|  15|
|   5|   9|
|   7|  10|
+----+----+
"""

2. Spark RDD

# assign integer indices to users and items
from pyspark import SparkContext

# reuses the SparkContext `sc` created in section 1
triple_rdd = sc.parallelize([['小明', '书', 12], ['小明','电影',4], ['小刚','书', 3]])
u_idxmap = triple_rdd.map(lambda x: x[0]).distinct().zipWithIndex()
i_idxmap = triple_rdd.map(lambda x: x[1]).distinct().zipWithIndex()

n_u = u_idxmap.count()  # number of distinct users
n_i = i_idxmap.count()  # number of distinct items

# shift item indices so users occupy 0..n_u-1 and items occupy n_u..n_u+n_i-1
i_idxmap = i_idxmap.mapValues(lambda x: x + n_u)

# replace names with indices via two joins:
#   (user, [item, w]) join u_idxmap -> (item, (w, u_idx))
#   (item, (w, u_idx)) join i_idxmap -> [u_idx, i_idx, w]
triple_rdd = triple_rdd.map(lambda x: (x[0], x[1:])).join(u_idxmap)\
                  .values().map(lambda x: (x[0][0], (x[0][1], x[1])))\
                  .join(i_idxmap).values().map(lambda x: [x[0][1], x[1], x[0][0]])
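To check the result, collect the small RDD. The concrete numbers depend on how zipWithIndex assigns indices across partitions, so the output in the comment is only one possible assignment:

# Each triple is now [user_index, item_index, weight]; item indices start at n_u.
print(triple_rdd.collect())
# e.g. [[0, 2, 12], [0, 3, 4], [1, 2, 3]]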
