# Create or get a SparkSession
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Python Spark SQL example').getOrCreate()
csv_path = '/home/ghost/workdata/patients.csv'
df_patient = spark.read.format('csv')\
.option('header', 'true')\
.option('inferSchema', 'true')\
.load(csv_path)
# Print the DataFrame schema
df_patient.printSchema()
root
|-- patient_id: integer (nullable = true)
|-- assigned_sex: string (nullable = true)
|-- given_name: string (nullable = true)
|-- surname: string (nullable = true)
|-- address: string (nullable = true)
|-- city: string (nullable = true)
|-- state: string (nullable = true)
|-- zip_code: integer (nullable = true)
|-- country: string (nullable = true)
|-- contact: string (nullable = true)
|-- birthdate: string (nullable = true)
|-- weight: double (nullable = true)
|-- height: integer (nullable = true)
|-- bmi: double (nullable = true)
json_path = '/home/ghost/workdata/sparkify_log_small.json'
user_log = spark.read.format('json')\
.load(json_path)
user_log.printSchema()
root
|-- artist: string (nullable = true)
|-- auth: string (nullable = true)
|-- firstName: string (nullable = true)
|-- gender: string (nullable = true)
|-- itemInSession: long (nullable = true)
|-- lastName: string (nullable = true)
|-- length: double (nullable = true)
|-- level: string (nullable = true)
|-- location: string (nullable = true)
|-- method: string (nullable = true)
|-- page: string (nullable = true)
|-- registration: long (nullable = true)
|-- sessionId: long (nullable = true)
|-- song: string (nullable = true)
|-- status: long (nullable = true)
|-- ts: long (nullable = true)
|-- userAgent: string (nullable = true)
|-- userId: string (nullable = true)
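Both reads above rely on schema inference (CSV needs the inferSchema option; JSON infers by default), which costs an extra pass over the data. When the columns are known ahead of time, you can supply an explicit schema instead. A minimal sketch against the JSON log, keeping just a few of the fields listed above:
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

# Only the listed fields are loaded; types are applied without an inference pass
log_schema = StructType([
    StructField('artist', StringType(), True),
    StructField('sessionId', LongType(), True),
    StructField('length', DoubleType(), True),
    StructField('ts', LongType(), True),
])
user_log_typed = spark.read.format('json')\
.schema(log_schema)\
.load(json_path)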
Reading multiple files works the same way; the path argument simply supports wildcards.
all_csv_path = '/home/ghost/workdata/patients/*.csv'
df_patient = spark.read.format('csv')\
.option('header', 'true')\
.option('inferSchema', 'true')\
.load(all_csv_path)
df_patient.count()
403
all_json_path = '/home/ghost/workdata/song_data/*/*/*/*.json'
df_song = spark.read.format('json').load(all_json_path)
print(f"total records:{df_song.count()}")
total records:71
As these examples show, wildcards can be used not only in file names but also in directory names, which makes them very powerful.
To combine wildcards with explicitly named files, just pass a list as the path argument.
multi_csv_path = [
'/home/ghost/workdata/patients/*.csv',
'/home/ghost/workdata/patients-2.csv'
]
df_patient = spark.read.format('csv')\
.option('header', 'true')\
.option('inferSchema', 'true')\
.load(multi_csv_path)
df_patient.count()
503
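When loading many files at once, it can be useful to record which file each row came from. A small sketch using the built-in input_file_name function (the src column name is arbitrary):
from pyspark.sql.functions import input_file_name

# Tag each row with the path of the file it was read from
df_patient.withColumn('src', input_file_name())\
.select('patient_id', 'src')\
.show(3, truncate=False)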
# Show the first 5 rows of the patient_id column
df_patient.select('patient_id').limit(5).show()
+----------+
|patient_id|
+----------+
| 201|
| 202|
| 203|
| 204|
| 205|
+----------+
# Collect the first row with all columns
df_patient.limit(1).collect()
[Row(patient_id=201, assigned_sex='male', given_name='Kifle', surname='Mustafa', address='4535 Prospect Street', city='Pennsauken', state='NJ', zip_code=8110, country='United States', contact='[email protected]', birthdate='10/9/1965', weight=186.1, height=69, bmi=27.5)]
# Take the first 5 rows of selected columns (a column list and varargs are equivalent)
df_patient.select(['patient_id','bmi']).take(5)
df_patient.select('patient_id','bmi').take(5)
[Row(patient_id=201, bmi=27.5),
Row(patient_id=202, bmi=22.8),
Row(patient_id=203, bmi=26.1),
Row(patient_id=204, bmi=37.6),
Row(patient_id=205, bmi=32.9)]
# Create a temp view for SQL access
df_patient.createOrReplaceTempView('dfTable_patient')
# Query the DataFrame view with SQL
spark.sql('Select * From dfTable_patient Limit 5').limit(2).select('patient_id').show()
+----------+
|patient_id|
+----------+
| 201|
| 202|
+----------+
As this example shows, the SQL interface and the DataFrame interface can be chained together seamlessly.
# Spark's SQL interface supports most standard SQL syntax and functions
spark.sql('''Select assigned_sex,count(*) As CNT
From dfTable_patient
Group By assigned_sex
''').show()
+------------+---+
|assigned_sex|CNT|
+------------+---+
| female|250|
| male|253|
+------------+---+
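The same aggregation through the DataFrame API produces an identical result:
from pyspark.sql import functions as F

# DataFrame-API equivalent of the SQL GROUP BY above
df_patient.groupBy('assigned_sex').agg(F.count('*').alias('CNT')).show()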
Writing files is just as convenient. The code pattern is: df_xxx.write.format(xxx).mode(xxx).option(xxx).save(xxx)
The saved "file" is actually a directory containing multiple part files, which lets multiple nodes write in parallel.
Writes also support partitioning the output by a column, as shown in the parquet example below.
# Write CSV
df_song.write.format('csv')\
.mode('overwrite')\
.option('sep', '|')\
.option('header', True)\
.save('/home/ghost/workdata/out-song.csv')
# Although out-song.csv is actually a directory, it can still be read back as usual
df_test = spark.read.format('csv')\
.option('header', 'true')\
.option('inferSchema', 'true')\
.option('sep', '|')\
.load('/home/ghost/workdata/out-song.csv')
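Each partition writes its own part file, so if a single output file is needed, one option is to collapse to one partition before writing. A sketch (the out-song-single.csv path is illustrative, and routing the whole write through one task only makes sense for small data):
# One partition -> one part file inside the output directory
df_song.coalesce(1).write.format('csv')\
.mode('overwrite')\
.option('sep', '|')\
.option('header', True)\
.save('/home/ghost/workdata/out-song-single.csv')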
# Write JSON, using repartition to save into 3 part files
df_patient.repartition(3).write.format('json')\
.mode('overwrite')\
.save('/home/ghost/workdata/out-patient.json')
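JSON and CSV writes also accept a compression option. A sketch, assuming gzip output is acceptable (the path is illustrative):
# Write gzip-compressed JSON part files
df_patient.write.format('json')\
.mode('overwrite')\
.option('compression', 'gzip')\
.save('/home/ghost/workdata/out-patient-gz.json')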
# Write parquet
df_patient.write\
.partitionBy('assigned_sex')\
.parquet('/home/ghost/workdata/out-patient.parquet', mode='overwrite')
# List the parquet output directory
ls -l "/home/ghost/workdata/out-patient.parquet"
total 8
drwxrwxr-x 2 ghost ghost 4096 Aug 10 21:44 'assigned_sex=female'
drwxrwxr-x 2 ghost ghost 4096 Aug 10 21:44 'assigned_sex=male'
# Read parquet
df_parquet = spark.read.format('parquet')\
.load('/home/ghost/workdata/out-patient.parquet')
# Read parquet, only one partition
df_parquet_male = spark.read.format('parquet')\
.load('/home/ghost/workdata/out-patient.parquet/assigned_sex=male')
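One caveat: loading a partition directory directly, as above, drops the partition column, because assigned_sex exists only in the directory name. To keep the column, either point the basePath option at the table root or filter the full dataset and let Spark prune partitions. A sketch:
# basePath tells Spark where partition discovery starts, so assigned_sex is kept
df_male = spark.read.format('parquet')\
.option('basePath', '/home/ghost/workdata/out-patient.parquet')\
.load('/home/ghost/workdata/out-patient.parquet/assigned_sex=male')

# Alternatively, filter the full table; the partition filter is pushed down
df_male2 = df_parquet.filter("assigned_sex = 'male'")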