PySpark | Basic Data Processing Operations

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql import functions as F, Window

Initialization and Environment Configuration

# Cluster configuration (the commented-out settings are typical tuning knobs)
config = SparkConf()
# config.set('spark.dynamicAllocation.maxExecutors', '8')
# config.set('spark.driver.memory', '4G')
# config.set('spark.executor.memory', '8G')
# config.set('spark.executor.cores', '8')
# config.set('spark.yarn.executor.memoryOverhead', '4G')
# config.set('spark.sql.shuffle.partitions', '500')
# config.set('spark.default.parallelism', '500')
# config.set('spark.port.maxRetries', '1000')
# config.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
config.set('spark.master', 'local[4]')

spark = SparkSession.builder.config(conf=config).getOrCreate()
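
To confirm which settings actually took effect, they can be read back from the live session (a minimal sketch; both accessors are standard PySpark APIs):

# Inspect the effective configuration of the running session
print(spark.sparkContext.getConf().get('spark.master'))    # e.g. local[4]
print(spark.conf.get('spark.sql.shuffle.partitions'))      # 200 unless overridden above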

Creating DataFrames

Creating from a list of dicts

df = spark.createDataFrame([
    {'user_id': 'A203', 'country': 'India', 'browser': 'Chrome', 'OS': 'WIN', 'age': 33},
    {'user_id': 'A201', 'country': 'China', 'browser': 'Safari', 'OS': 'MacOs', 'age': 35},
    {'user_id': 'A205', 'country': 'UK', 'browser': 'Mozilla', 'OS': 'Linux', 'age': 25} 
])
/usr/lib/spark/python/pyspark/sql/session.py:346: UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead
  warnings.warn("inferring schema from dict is deprecated,"
df.show()
+-----+---+-------+-------+-------+
|   OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
|  WIN| 33| Chrome|  India|   A203|
|MacOs| 35| Safari|  China|   A201|
|Linux| 25|Mozilla|     UK|   A205|
+-----+---+-------+-------+-------+
df.printSchema()
root
 |-- OS: string (nullable = true)
 |-- age: long (nullable = true)
 |-- browser: string (nullable = true)
 |-- country: string (nullable = true)
 |-- user_id: string (nullable = true)
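
As the warning suggests, the same DataFrame can be built from pyspark.sql.Row objects, which avoids the deprecated dict-based schema inference (a minimal sketch):

from pyspark.sql import Row

df = spark.createDataFrame([
    Row(user_id='A203', country='India', browser='Chrome', OS='WIN', age=33),
    Row(user_id='A201', country='China', browser='Safari', OS='MacOs', age=35),
    Row(user_id='A205', country='UK', browser='Mozilla', OS='Linux', age=25),
])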

Creating with an explicit schema

http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.types

schema = StructType() \
    .add('user_id', StringType(), True) \
    .add('country', StringType(), True) \
    .add('browser', StringType(), True) \
    .add('OS', StringType(), True) \
    .add('age', IntegerType(), True)


df = spark.createDataFrame([
    ('A203', 'India', 'Chrome', 'WIN', 33),
    ('A201', 'China', 'Safari', 'MacOS', 35),    
    ('A205', 'UK', 'Mozilla', 'Linux', 25),        
], schema=schema)

df.show()
+-------+-------+-------+-----+---+
|user_id|country|browser|   OS|age|
+-------+-------+-------+-----+---+
|   A203|  India| Chrome|  WIN| 33|
|   A201|  China| Safari|MacOS| 35|
|   A205|     UK|Mozilla|Linux| 25|
+-------+-------+-------+-----+---+
df.printSchema()
root
 |-- user_id: string (nullable = true)
 |-- country: string (nullable = true)
 |-- browser: string (nullable = true)
 |-- OS: string (nullable = true)
 |-- age: integer (nullable = true)
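
An equivalent, more compact declaration uses a DDL-format schema string, which createDataFrame also accepts (a minimal sketch producing the same schema):

df = spark.createDataFrame([
    ('A203', 'India', 'Chrome', 'WIN', 33),
    ('A201', 'China', 'Safari', 'MacOS', 35),
    ('A205', 'UK', 'Mozilla', 'Linux', 25),
], schema='user_id string, country string, browser string, OS string, age int')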

Loading data in other ways

Online datasets

  • employees_df: https://query.data.world/s/mbwaztyugiidkaw4z32ptikrwrjqyl
  • salaries_df: https://query.data.world/s/wraycqnu6fopv56tcr2lwlyaujlhtz
# Load the downloaded CSVs into Spark via pandas
salaries_df = spark.createDataFrame(pd.read_csv('salaries_emp_no_less_20000.csv')).cache()
employees_df = spark.createDataFrame(pd.read_csv('employees_emp_no_less_20000.csv')).cache()

salaries_df.show(5)
employees_df.show(5)
+------+------+----------+----------+
|emp_no|salary| from_date|   to_date|
+------+------+----------+----------+
| 10001| 60117|1986-06-26|1987-06-26|
| 10001| 62102|1987-06-26|1988-06-25|
| 10001| 66074|1988-06-25|1989-06-25|
| 10001| 66596|1989-06-25|1990-06-25|
| 10001| 66961|1990-06-25|1991-06-25|
+------+------+----------+----------+
only showing top 5 rows

+------+----------+----------+---------+------+----------+
|emp_no|birth_date|first_name|last_name|gender| hire_date|
+------+----------+----------+---------+------+----------+
| 10001|1953-09-02|    Georgi|  Facello|     M|1986-06-26|
| 10002|1964-06-02|   Bezalel|   Simmel|     F|1985-11-21|
| 10003|1959-12-03|     Parto|  Bamford|     M|1986-08-28|
| 10004|1954-05-01| Chirstian|  Koblick|     M|1986-12-01|
| 10005|1955-01-21|   Kyoichi| Maliniak|     M|1989-09-12|
+------+----------+----------+---------+------+----------+
only showing top 5 rows
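
Going through pandas works for small files; larger CSVs can be read directly with Spark's own reader instead (a minimal sketch, assuming the same local file names):

salaries_df = spark.read.csv('salaries_emp_no_less_20000.csv', header=True, inferSchema=True).cache()
employees_df = spark.read.csv('employees_emp_no_less_20000.csv', header=True, inferSchema=True).cache()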

Handling null values

df_na = spark.createDataFrame([
    {'user_id': 'A203', 'country': None, 'browser': 'Chrome', 'OS': 'WIN', 'age': 33},
    {'user_id': 'A201', 'country': 'China', 'browser': None, 'OS': 'MacOs', 'age': 35},
    {'user_id': 'A205', 'country': 'UK', 'browser': 'Mozilla', 'OS': 'Linux', 'age': 25} 
])
df_na.show()
+-----+---+-------+-------+-------+
|   OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
|  WIN| 33| Chrome|   null|   A203|
|MacOs| 35|   null|  China|   A201|
|Linux| 25|Mozilla|     UK|   A205|
+-----+---+-------+-------+-------+
# Summary statistics of the data
df_na.summary().show()
+-------+-----+-----------------+-------+-------+-------+
|summary|   OS|              age|browser|country|user_id|
+-------+-----+-----------------+-------+-------+-------+
|  count|    3|                3|      2|      2|      3|
|   mean| null|             31.0|   null|   null|   null|
| stddev| null|5.291502622129181|   null|   null|   null|
|    min|Linux|               25| Chrome|  China|   A201|
|    25%| null|               25|   null|   null|   null|
|    50%| null|               33|   null|   null|   null|
|    75%| null|               35|   null|   null|   null|
|    max|  WIN|               35|Mozilla|     UK|   A205|
+-------+-----+-----------------+-------+-------+-------+
# Fill nulls with the empty string '' (affects string columns only)
df_na.fillna('').show()
+-----+---+-------+-------+-------+
|   OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
|  WIN| 33| Chrome|       |   A203|
|MacOs| 35|       |  China|   A201|
|Linux| 25|Mozilla|     UK|   A205|
+-----+---+-------+-------+-------+
# Fill nulls with per-column values
df_na.fillna({'browser': 'unknown', 'country': 'unknown'}).show()
+-----+---+-------+-------+-------+
|   OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
|  WIN| 33| Chrome|unknown|   A203|
|MacOs| 35|unknown|  China|   A201|
|Linux| 25|Mozilla|     UK|   A205|
+-----+---+-------+-------+-------+
# Drop rows containing any null
df_na.na.drop().show()
+-----+---+-------+-------+-------+
|   OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
|Linux| 25|Mozilla|     UK|   A205|
+-----+---+-------+-------+-------+
# Drop rows where a specific column is null
df_na.na.drop(subset='browser').show()
+-----+---+-------+-------+-------+
|   OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
|  WIN| 33| Chrome|   null|   A203|
|Linux| 25|Mozilla|     UK|   A205|
+-----+---+-------+-------+-------+
# Drop a column
df_na.drop('OS').show()
+---+-------+-------+-------+
|age|browser|country|user_id|
+---+-------+-------+-------+
| 33| Chrome|   null|   A203|
| 35|   null|  China|   A201|
| 25|Mozilla|     UK|   A205|
+---+-------+-------+-------+
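
Rows can also be selected by null status explicitly with isNull()/isNotNull() (a minimal sketch):

# Keep only rows where country is known
df_na.filter(F.col('country').isNotNull()).show()
# Inspect the rows that would have been dropped above
df_na.filter(F.col('country').isNull()).show()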

SQL-like operations

  • Select
  • Filter
  • Where
  • Aggregations
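
A minimal sketch of these operations in isolation on employees_df, before the two full examples below (the emp_no threshold is just an illustrative value):

employees_df.select(
    'emp_no', 'first_name', 'gender'
).filter(
    F.col('gender') == 'F'
).where(
    F.col('emp_no') < 10010
).groupBy(
    'gender'
).agg(
    F.count('emp_no').alias('cnt')
).show()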

Approach 1: Finding each employee's maximum salary

# Using groupBy aggregation
employees_df.join(
    salaries_df
    ,salaries_df['emp_no']==employees_df['emp_no']
).withColumn(
    'gender_custom'
    ,F.when(employees_df['gender']=='M', 'male'
    ).otherwise('female')
).groupBy(
    employees_df['emp_no']
    ,employees_df['birth_date']    
    ,employees_df['first_name']    
    ,employees_df['last_name']
    ,'gender_custom'
).agg(
    F.max('salary').alias('max_salary')
    ,F.count('salary').alias('salary_change_times')
).select(
    employees_df['emp_no']
    ,employees_df['birth_date']
    ,employees_df['first_name']
    ,employees_df['last_name']
    ,'gender_custom'   
    ,'max_salary'
    ,'salary_change_times'     
).where(
    (employees_df['emp_no']>=10001) & (employees_df['emp_no']<=10005)
).orderBy(
    'emp_no'
).show()
+------+----------+----------+---------+-------------+----------+-------------------+
|emp_no|birth_date|first_name|last_name|gender_custom|max_salary|salary_change_times|
+------+----------+----------+---------+-------------+----------+-------------------+
| 10001|1953-09-02|    Georgi|  Facello|         male|     88958|                 17|
| 10002|1964-06-02|   Bezalel|   Simmel|       female|     72527|                  6|
| 10003|1959-12-03|     Parto|  Bamford|         male|     43699|                  7|
| 10004|1954-05-01| Chirstian|  Koblick|         male|     74057|                 16|
| 10005|1955-01-21|   Kyoichi| Maliniak|         male|     94692|                 13|
+------+----------+----------+---------+-------------+----------+-------------------+
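
The same result can be expressed in plain SQL by registering the DataFrames as temporary views (a minimal sketch of the equivalent query):

employees_df.createOrReplaceTempView('employees')
salaries_df.createOrReplaceTempView('salaries')
spark.sql("""
    SELECT e.emp_no, e.birth_date, e.first_name, e.last_name,
           CASE WHEN e.gender = 'M' THEN 'male' ELSE 'female' END AS gender_custom,
           MAX(s.salary) AS max_salary,
           COUNT(s.salary) AS salary_change_times
    FROM employees e
    JOIN salaries s ON e.emp_no = s.emp_no
    WHERE e.emp_no BETWEEN 10001 AND 10005
    GROUP BY e.emp_no, e.birth_date, e.first_name, e.last_name, e.gender
    ORDER BY e.emp_no
""").show()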

Approach 2: Finding each employee's maximum salary

# Using a window function (top-N)
employees_df.join(
    salaries_df
    ,salaries_df['emp_no']==employees_df['emp_no']
).withColumn(
    'gender_custom'
    ,F.when(employees_df['gender']=='M', 'male'
    ).otherwise('female')
).withColumn(
    'index'
    ,F.row_number().over(Window.partitionBy(employees_df['emp_no']).orderBy(salaries_df['salary'].desc()))
).filter(
    F.col('index')==1
).select(
    employees_df['emp_no']
    ,employees_df['birth_date']
    ,employees_df['first_name']
    ,employees_df['last_name']
    ,F.col('gender_custom')
    ,F.col('salary').alias('max_salary')
).where(
    (employees_df['emp_no']>=10001) & (employees_df['emp_no']<=10005)
).orderBy(
    'emp_no'
).show()
+------+----------+----------+---------+-------------+----------+
|emp_no|birth_date|first_name|last_name|gender_custom|max_salary|
+------+----------+----------+---------+-------------+----------+
| 10001|1953-09-02|    Georgi|  Facello|         male|     88958|
| 10002|1964-06-02|   Bezalel|   Simmel|       female|     72527|
| 10003|1959-12-03|     Parto|  Bamford|         male|     43699|
| 10004|1954-05-01| Chirstian|  Koblick|         male|     74057|
| 10005|1955-01-21|   Kyoichi| Maliniak|         male|     94692|
+------+----------+----------+---------+-------------+----------+
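
A variant that skips the explicit rank-and-filter computes the window maximum directly and then deduplicates (a minimal sketch over salaries_df alone; note that row_number() above guarantees exactly one row per employee even on salary ties, whereas dropDuplicates keeps an arbitrary one):

w = Window.partitionBy('emp_no')
salaries_df.withColumn(
    'max_salary', F.max('salary').over(w)
).select(
    'emp_no', 'max_salary'
).dropDuplicates(['emp_no']).where(
    (F.col('emp_no') >= 10001) & (F.col('emp_no') <= 10005)
).orderBy('emp_no').show()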
