PySpark File Read/Write Examples (CSV/JSON/Parquet, Single or Multiple Files)

# Create or get a SparkSession
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Python Spark SQL example').getOrCreate()
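
If you run locally rather than on a cluster, the master and common settings can also be set explicitly on the builder. A minimal sketch (the shuffle-partitions value here is purely illustrative):

# Optional: explicit local configuration
spark = SparkSession.builder\
    .appName('Python Spark SQL example')\
    .master('local[*]')\
    .config('spark.sql.shuffle.partitions', '8')\
    .getOrCreate()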

Reading a single file

CSV

csv_path = '/home/ghost/workdata/patients.csv'
df_patient = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load(csv_path)
# Print the DataFrame schema
df_patient.printSchema()
root
 |-- patient_id: integer (nullable = true)
 |-- assigned_sex: string (nullable = true)
 |-- given_name: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- weight: double (nullable = true)
 |-- height: integer (nullable = true)
 |-- bmi: double (nullable = true)
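
inferSchema needs an extra pass over the data and, as seen above, leaves birthdate as a plain string. For production jobs it is often preferable to declare the schema explicitly; a minimal sketch matching the columns above:

from pyspark.sql.types import (StructType, StructField,
                               IntegerType, StringType, DoubleType)

# Explicit schema mirroring the inferred one above; birthdate kept as string
patient_schema = StructType([
    StructField('patient_id', IntegerType(), True),
    StructField('assigned_sex', StringType(), True),
    StructField('given_name', StringType(), True),
    StructField('surname', StringType(), True),
    StructField('address', StringType(), True),
    StructField('city', StringType(), True),
    StructField('state', StringType(), True),
    StructField('zip_code', IntegerType(), True),
    StructField('country', StringType(), True),
    StructField('contact', StringType(), True),
    StructField('birthdate', StringType(), True),
    StructField('weight', DoubleType(), True),
    StructField('height', IntegerType(), True),
    StructField('bmi', DoubleType(), True),
])
df_patient = spark.read.format('csv')\
    .option('header', 'true')\
    .schema(patient_schema)\
    .load(csv_path)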

JSON

json_path = '/home/ghost/workdata/sparkify_log_small.json'
user_log = spark.read.format('json')\
    .load(json_path)
user_log.printSchema()
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
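
Note that Spark's JSON reader expects JSON Lines by default (one object per line, as in this log file). If a file instead holds a single JSON array or pretty-printed objects, the multiLine option is needed; a sketch:

# Only needed for array-style or pretty-printed JSON, not JSON Lines
user_log = spark.read.format('json')\
    .option('multiLine', 'true')\
    .load(json_path)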

Reading multiple files

All files in a folder - single-level directory

The approach is the same; the path argument simply supports wildcards.

all_csv_path = '/home/ghost/workdata/patients/*.csv'
df_patient = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load(all_csv_path)
df_patient.count()
403

All files in a folder - nested directories

all_json_path = '/home/ghost/workdata/song_data/*/*/*/*.json'
df_song = spark.read.format('json').load(all_json_path)
print(f"total records:{df_song.count()}")
total records:71

As these examples show, wildcards work not only in file names but also in directory names, which makes them very powerful.
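
Alternatively, since Spark 3.0 the recursiveFileLookup option scans all nested directories without spelling out the wildcard depth (note that it disables partition discovery); a sketch:

# Recursively pick up every JSON file under song_data (Spark 3.0+)
df_song = spark.read.format('json')\
    .option('recursiveFileLookup', 'true')\
    .load('/home/ghost/workdata/song_data')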

Specifying multiple files

Wildcards and explicit file names can be combined; just pass a list as the path argument.

multi_csv_path = [
    '/home/ghost/workdata/patients/*.csv',
    '/home/ghost/workdata/patients-2.csv'
]
df_patient = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load(multi_csv_path)
df_patient.count()
503
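
When merging sources like this, the built-in input_file_name function can show which file each row came from, as sketched below:

from pyspark.sql.functions import input_file_name

# Tag each row with its source file for a quick sanity check
df_patient.withColumn('source_file', input_file_name())\
    .select('patient_id', 'source_file')\
    .show(5, truncate=False)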

Selection and display

DataFrame selection and display

# Show the first 5 rows of the patient_id column
df_patient.select('patient_id').limit(5).show()
+----------+
|patient_id|
+----------+
|       201|
|       202|
|       203|
|       204|
|       205|
+----------+
# Show the first row with all columns
df_patient.limit(1).collect()
[Row(patient_id=201, assigned_sex='male', given_name='Kifle', surname='Mustafa', address='4535 Prospect Street', city='Pennsauken', state='NJ', zip_code=8110, country='United States', contact='[email protected]', birthdate='10/9/1965', weight=186.1, height=69, bmi=27.5)]
# Show the first 5 rows of selected columns (the two forms below are equivalent)
df_patient.select(['patient_id','bmi']).take(5)
df_patient.select('patient_id','bmi').take(5)
[Row(patient_id=201, bmi=27.5),
 Row(patient_id=202, bmi=22.8),
 Row(patient_id=203, bmi=26.1),
 Row(patient_id=204, bmi=37.6),
 Row(patient_id=205, bmi=32.9)]

SQL interface

# Create a temporary view for SQL access
df_patient.createOrReplaceTempView('dfTable_patient')
# Query the DataFrame view with SQL
spark.sql('Select * From dfTable_patient Limit 5').limit(2).select('patient_id').show()
+----------+
|patient_id|
+----------+
|       201|
|       202|
+----------+

As this example shows, the SQL interface and the DataFrame interface can be chained together seamlessly.

# Spark's SQL interface supports most standard SQL syntax and functions
spark.sql('''Select assigned_sex,count(*) As CNT
             From dfTable_patient
             Group By assigned_sex
          ''').show()
+------------+---+
|assigned_sex|CNT|
+------------+---+
|      female|250|
|        male|253|
+------------+---+
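
For comparison, the same aggregation expressed through the DataFrame API:

from pyspark.sql.functions import count

# Equivalent of the SQL GROUP BY above
df_patient.groupBy('assigned_sex')\
    .agg(count('*').alias('CNT'))\
    .show()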

Writing files

Writing files is just as convenient; the code pattern is df_xxx.write.format(xxx).mode(xxx).option(xxx).save(xxx).
The saved "file" name is actually a directory containing multiple part files, which lets multiple nodes write in parallel.
Writes can also be partitioned by a column; see the Parquet example below.

# Write CSV
df_song.write.format('csv')\
    .mode('overwrite')\
    .option('sep','|')\
    .option('header',True)\
    .save('/home/ghost/workdata/out-song.csv')
# Although out-song.csv is actually a directory, it can still be read back as normal
df_test = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .option('sep', '|')\
    .load('/home/ghost/workdata/out-song.csv')
# Write JSON, using repartition to split the output into 3 part files
df_patient.repartition(3).write.format('json')\
    .mode('overwrite')\
    .save('/home/ghost/workdata/out-patient.json')
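
If a single output file is required (sensible only for small results, since it funnels all data through one task), coalesce(1) before writing is a common pattern; a sketch with an illustrative output path:

# Force a single part file; out-patient-single.json is an illustrative path
df_patient.coalesce(1).write.format('json')\
    .mode('overwrite')\
    .save('/home/ghost/workdata/out-patient-single.json')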

Reading and writing Parquet

# Write Parquet, partitioned by the assigned_sex column
df_patient.write\
    .partitionBy('assigned_sex')\
    .parquet('/home/ghost/workdata/out-patient.parquet',mode = 'overwrite')
# Inspect the Parquet output directory (shell command)
ls -l "/home/ghost/workdata/out-patient.parquet"
total 8
drwxrwxr-x 2 ghost ghost 4096 Aug 10 21:44 'assigned_sex=female'
drwxrwxr-x 2 ghost ghost 4096 Aug 10 21:44 'assigned_sex=male'
# Read Parquet
df_parquet = spark.read.format('parquet')\
    .load('/home/ghost/workdata/out-patient.parquet')

# Read Parquet, a single partition only
df_parquet_male = spark.read.format('parquet')\
    .load('/home/ghost/workdata/out-patient.parquet/assigned_sex=male')
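
Loading a single partition directory directly, as above, drops the assigned_sex column from the result. Reading the root path and filtering keeps the column while Spark still prunes to the matching partition; a sketch:

from pyspark.sql.functions import col

# Partition pruning: only the assigned_sex=male directory is scanned
df_parquet_male = spark.read.parquet('/home/ghost/workdata/out-patient.parquet')\
    .filter(col('assigned_sex') == 'male')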
