import pandas as pd
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql import functions as F, Window
# Configure the cluster
config = SparkConf()
# config.set('spark.dynamicAllocation.maxExecutors', '8')
# config.set('spark.driver.memory', '4G')
# config.set('spark.executor.memory', '8G')
# config.set('spark.executor.cores', '8')
# config.set('spark.yarn.executor.memoryOverhead', '4G')
# config.set('spark.sql.shuffle.partitions', '500')
# config.set('spark.default.parallelism', '500')
# config.set('spark.port.maxRetries', '1000')
# config.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
config.set('spark.master', 'local[4]')  # run locally with 4 worker threads
spark = SparkSession.builder.config(conf=config).getOrCreate()
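# Sanity check (a minimal sketch): confirm the session picked up our settings
print(spark.version)                                     # Spark version string
print(spark.sparkContext.getConf().get('spark.master'))  # 'local[4]'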
df = spark.createDataFrame([
{'user_id': 'A203', 'country': 'India', 'browser': 'Chrome', 'OS': 'WIN', 'age': 33},
{'user_id': 'A201', 'country': 'China', 'browser': 'Safari', 'OS': 'MacOs', 'age': 35},
{'user_id': 'A205', 'country': 'UK', 'browser': 'Mozilla', 'OS': 'Linux', 'age': 25}
])
/usr/lib/spark/python/pyspark/sql/session.py:346: UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead
warnings.warn("inferring schema from dict is deprecated,"
df.show()
+-----+---+-------+-------+-------+
| OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
| WIN| 33| Chrome| India| A203|
|MacOs| 35| Safari| China| A201|
|Linux| 25|Mozilla| UK| A205|
+-----+---+-------+-------+-------+
df.printSchema()
root
|-- OS: string (nullable = true)
|-- age: long (nullable = true)
|-- browser: string (nullable = true)
|-- country: string (nullable = true)
|-- user_id: string (nullable = true)
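# The UserWarning above suggests pyspark.sql.Row; a sketch of the same
# DataFrame built from Row objects, which avoids the deprecation warning:
from pyspark.sql import Row
df_row = spark.createDataFrame([
    Row(user_id='A203', country='India', browser='Chrome', OS='WIN', age=33),
    Row(user_id='A201', country='China', browser='Safari', OS='MacOs', age=35),
    Row(user_id='A205', country='UK', browser='Mozilla', OS='Linux', age=25),
])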
# Type reference: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.types
schema = (StructType()
    .add('user_id', StringType(), True)
    .add('country', StringType(), True)
    .add('browser', StringType(), True)
    .add('OS', StringType(), True)
    .add('age', IntegerType(), True))
df = spark.createDataFrame([
('A203', 'India', 'Chrome', 'WIN', 33),
('A201', 'China', 'Safari', 'MacOS', 35),
('A205', 'UK', 'Mozilla', 'Linux', 25),
], schema=schema)
df.show()
+-------+-------+-------+-----+---+
|user_id|country|browser| OS|age|
+-------+-------+-------+-----+---+
| A203| India| Chrome| WIN| 33|
| A201| China| Safari|MacOS| 35|
| A205| UK|Mozilla|Linux| 25|
+-------+-------+-------+-----+---+
df.printSchema()
root
|-- user_id: string (nullable = true)
|-- country: string (nullable = true)
|-- browser: string (nullable = true)
|-- OS: string (nullable = true)
|-- age: integer (nullable = true)
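# createDataFrame also accepts a DDL-formatted schema string, often more
# compact than building a StructType by hand (a sketch of the same schema):
df_ddl = spark.createDataFrame([
    ('A203', 'India', 'Chrome', 'WIN', 33),
    ('A201', 'China', 'Safari', 'MacOS', 35),
    ('A205', 'UK', 'Mozilla', 'Linux', 25),
], schema='user_id string, country string, browser string, OS string, age int')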
Online dataset
# Load the CSV files into Spark via pandas
salaries_df = spark.createDataFrame(pd.read_csv('salaries_emp_no_less_20000.csv')).cache()
employees_df = spark.createDataFrame(pd.read_csv('employees_emp_no_less_20000.csv')).cache()
salaries_df.show(5)
employees_df.show(5)
+------+------+----------+----------+
|emp_no|salary| from_date| to_date|
+------+------+----------+----------+
| 10001| 60117|1986-06-26|1987-06-26|
| 10001| 62102|1987-06-26|1988-06-25|
| 10001| 66074|1988-06-25|1989-06-25|
| 10001| 66596|1989-06-25|1990-06-25|
| 10001| 66961|1990-06-25|1991-06-25|
+------+------+----------+----------+
only showing top 5 rows
+------+----------+----------+---------+------+----------+
|emp_no|birth_date|first_name|last_name|gender| hire_date|
+------+----------+----------+---------+------+----------+
| 10001|1953-09-02| Georgi| Facello| M|1986-06-26|
| 10002|1964-06-02| Bezalel| Simmel| F|1985-11-21|
| 10003|1959-12-03| Parto| Bamford| M|1986-08-28|
| 10004|1954-05-01| Chirstian| Koblick| M|1986-12-01|
| 10005|1955-01-21| Kyoichi| Maliniak| M|1989-09-12|
+------+----------+----------+---------+------+----------+
only showing top 5 rows
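# Going through pandas materializes the whole file on the driver first; Spark's
# native CSV reader can load it directly (a sketch assuming the same file paths;
# inferSchema costs an extra pass over the data):
salaries_df2 = (spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('salaries_emp_no_less_20000.csv'))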
df_na = spark.createDataFrame([
{'user_id': 'A203', 'country': None, 'browser': 'Chrome', 'OS': 'WIN', 'age': 33},
{'user_id': 'A201', 'country': 'China', 'browser': None, 'OS': 'MacOs', 'age': 35},
{'user_id': 'A205', 'country': 'UK', 'browser': 'Mozilla', 'OS': 'Linux', 'age': 25}
])
df_na.show()
+-----+---+-------+-------+-------+
| OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
| WIN| 33| Chrome| null| A203|
|MacOs| 35| null| China| A201|
|Linux| 25|Mozilla| UK| A205|
+-----+---+-------+-------+-------+
# Summary statistics for each column
df_na.summary().show()
+-------+-----+-----------------+-------+-------+-------+
|summary| OS| age|browser|country|user_id|
+-------+-----+-----------------+-------+-------+-------+
| count| 3| 3| 2| 2| 3|
| mean| null| 31.0| null| null| null|
| stddev| null|5.291502622129181| null| null| null|
| min|Linux| 25| Chrome| China| A201|
| 25%| null| 25| null| null| null|
| 50%| null| 33| null| null| null|
| 75%| null| 35| null| null| null|
| max| WIN| 35|Mozilla| UK| A205|
+-------+-----+-----------------+-------+-------+-------+
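# summary() also takes specific statistics if you only need a few (a sketch):
df_na.summary('count', 'mean', 'max').show()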
# Fill nulls with the empty string ''
df_na.fillna('').show()
+-----+---+-------+-------+-------+
| OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
| WIN| 33| Chrome| | A203|
|MacOs| 35| | China| A201|
|Linux| 25|Mozilla| UK| A205|
+-----+---+-------+-------+-------+
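# Note: fillna matches the fill value's type, so '' only touches string columns;
# a numeric fill such as 0 would only touch numeric columns (a sketch):
df_na.fillna('').fillna(0).show()  # age has no nulls here, so fillna(0) is a no-op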
# Fill nulls per column
df_na.fillna({'browser': 'unknown', 'country': 'unknown'}).show()
+-----+---+-------+-------+-------+
| OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
| WIN| 33| Chrome|unknown| A203|
|MacOs| 35|unknown| China| A201|
|Linux| 25|Mozilla| UK| A205|
+-----+---+-------+-------+-------+
# Drop rows containing any null
df_na.na.drop().show()
+-----+---+-------+-------+-------+
| OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
|Linux| 25|Mozilla| UK| A205|
+-----+---+-------+-------+-------+
# Drop rows where a specific column is null
df_na.na.drop(subset='browser').show()
+-----+---+-------+-------+-------+
| OS|age|browser|country|user_id|
+-----+---+-------+-------+-------+
| WIN| 33| Chrome| null| A203|
|Linux| 25|Mozilla| UK| A205|
+-----+---+-------+-------+-------+
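# na.drop also takes how and thresh parameters (a sketch; thresh keeps rows
# with at least that many non-null values):
df_na.na.drop(how='all').show()  # drop only rows where every column is null
df_na.na.drop(thresh=4).show()   # keep rows with at least 4 non-null values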
# Drop a column
df_na.drop('OS').show()
+---+-------+-------+-------+
|age|browser|country|user_id|
+---+-------+-------+-------+
| 33| Chrome| null| A203|
| 35| null| China| A201|
| 25|Mozilla| UK| A205|
+---+-------+-------+-------+
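# drop accepts several column names at once (a sketch):
df_na.drop('OS', 'browser').show()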
# Approach 1: aggregate with groupBy
employees_df.join(
salaries_df
,salaries_df['emp_no']==employees_df['emp_no']
).withColumn(
'gender_custom'
,F.when(employees_df['gender']=='M', 'male'
).otherwise('female')
).groupBy(
employees_df['emp_no']
,employees_df['birth_date']
,employees_df['first_name']
,employees_df['last_name']
,'gender_custom'
).agg(
F.max('salary').alias('max_salary')
,F.count('salary').alias('salary_change_times')
).select(
employees_df['emp_no']
,employees_df['birth_date']
,employees_df['first_name']
,employees_df['last_name']
,'gender_custom'
,'max_salary'
,'salary_change_times'
).where(
(employees_df['emp_no']>=10001) & (employees_df['emp_no']<=10005)
).orderBy(
'emp_no'
).show()
+------+----------+----------+---------+-------------+----------+-------------------+
|emp_no|birth_date|first_name|last_name|gender_custom|max_salary|salary_change_times|
+------+----------+----------+---------+-------------+----------+-------------------+
| 10001|1953-09-02| Georgi| Facello| male| 88958| 17|
| 10002|1964-06-02| Bezalel| Simmel| female| 72527| 6|
| 10003|1959-12-03| Parto| Bamford| male| 43699| 7|
| 10004|1954-05-01| Chirstian| Koblick| male| 74057| 16|
| 10005|1955-01-21| Kyoichi| Maliniak| male| 94692| 13|
+------+----------+----------+---------+-------------+----------+-------------------+
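# Joining on Column objects leaves two emp_no columns in the result, which is
# why the select above has to qualify them. Passing the key by name dedupes it
# (a sketch of the same aggregation, minus a few grouping columns):
(employees_df.join(salaries_df, on='emp_no')  # single emp_no column in output
    .groupBy('emp_no', 'first_name', 'last_name')
    .agg(F.max('salary').alias('max_salary'))
    .where((F.col('emp_no') >= 10001) & (F.col('emp_no') <= 10005))
    .orderBy('emp_no')
    .show())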
# Approach 2: window function (top-N)
employees_df.join(
salaries_df
,salaries_df['emp_no']==employees_df['emp_no']
).withColumn(
'gender_custom'
,F.when(employees_df['gender']=='M', 'male'
).otherwise('female')
).withColumn(
'index'
,F.row_number().over(Window.partitionBy(employees_df['emp_no']).orderBy(salaries_df['salary'].desc()))
).filter(
F.col('index')==1
).select(
employees_df['emp_no']
,employees_df['birth_date']
,employees_df['first_name']
,employees_df['last_name']
,F.col('gender_custom')
,F.col('salary').alias('max_salary')
).where(
(employees_df['emp_no']>=10001) & (employees_df['emp_no']<=10005)
).orderBy(
'emp_no'
).show()
+------+----------+----------+---------+-------------+----------+
|emp_no|birth_date|first_name|last_name|gender_custom|max_salary|
+------+----------+----------+---------+-------------+----------+
| 10001|1953-09-02| Georgi| Facello| male| 88958|
| 10002|1964-06-02| Bezalel| Simmel| female| 72527|
| 10003|1959-12-03| Parto| Bamford| male| 43699|
| 10004|1954-05-01| Chirstian| Koblick| male| 74057|
| 10005|1955-01-21| Kyoichi| Maliniak| male| 94692|
+------+----------+----------+---------+-------------+----------+
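# row_number picks exactly one row per employee even when the top salary is
# tied; F.rank (or F.dense_rank) would keep all tied rows instead (a sketch):
w = Window.partitionBy(salaries_df['emp_no']).orderBy(salaries_df['salary'].desc())
salaries_df.withColumn('rnk', F.rank().over(w)).filter(F.col('rnk') == 1).show(5)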