SPARK-SQL 读取外部数据源 csv文件的读写

准备people.json文件（注意：文件名需与代码中读取的 people.json 一致）

{"name":"Michael", "age":29}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

csv文件读写操作示例

import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.*;

/**
 * Demonstrates reading and writing CSV files with Spark SQL:
 * JSON -> CSV conversion, building a DataFrame from an in-memory
 * Dataset&lt;String&gt; of CSV lines, and applying a custom schema plus
 * read/write options (header, dateFormat, comment, compression).
 */
public class test5 {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .config("spark.driver.host", "localhost")
                .appName("CSVFileTest")
                .master("local")
                .getOrCreate();

        spark.sparkContext().setLogLevel("ERROR");

        // Read the sample JSON file into a DataFrame (Dataset<Row>).
        Dataset<Row> jsonDF = spark.read().json(Utils.BASE_PATH + "/people.json");
        jsonDF.show(false);
//                +---+-------+
//                | age | name |
//                +---+-------+
//                | 29 | Michael |
//                |30 | Andy |
//                |19 | Justin |
//                +---+-------+


        // Convert the JSON-sourced data to CSV files on disk.
        jsonDF.write().mode(SaveMode.Overwrite).csv(Utils.BASE_PATH + "/csv");

        // Read it back; without a schema/header, columns are named _c0, _c1, ...
        Dataset<Row> csvDF = spark.read().csv(Utils.BASE_PATH + "/csv");
        csvDF.show(false);
//                +---+-------+
//                |_c0|_c1    |
//                +---+-------+
//                |29 |Michael|
//                |30 |Andy   |
//                |19 |Justin |
//                +---+-------+

        // Create a Dataset<String> of raw CSV lines from an in-memory list.
        List<String> csvStr = Arrays.asList("23,jeffy,26/08/2015 18:00", "34,katy,27/10/2014 18:30");
        Dataset<String> csvDS = spark.createDataset(csvStr, Encoders.STRING());
        csvDS.show(false);
//                +-------------------------+
//                |value                    |
//                +-------------------------+
//                |23,jeffy,26/08/2015 18:00|
//                |34,katy,27/10/2014 18:30 |
//                +-------------------------+

        // Parse the Dataset<String> as CSV.
        Dataset<Row> csvDFFromDS = spark.read().csv(csvDS);
        csvDFFromDS.show(false);
//                +---+-----+----------------+
//                |_c0|_c1  |_c2             |
//                +---+-----+----------------+
//                |23 |jeffy|26/08/2015 18:00|
//                |34 |katy |27/10/2014 18:30|
//                +---+-----+----------------+

        // Build an explicit schema: age:int, name:string, date:date (all nullable).
        List<StructField> fields = new ArrayList<>();
        StructField age = DataTypes.createStructField("age", DataTypes.IntegerType, true);
        StructField name = DataTypes.createStructField("name", DataTypes.StringType, true);
        StructField date = DataTypes.createStructField("date", DataTypes.DateType, true);
        fields.add(age);
        fields.add(name);
        fields.add(date);
        StructType customSchema = DataTypes.createStructType(fields);

        // Read options. DataFrameReader.options takes Map<String,String>,
        // so every value must be a String (the original raw Map allowed a
        // Boolean to slip in for "inferSchema").
        Map<String, String> readOpts = new HashMap<>();
        // The data has no header row.
        readOpts.put("header", "false");
        // inferSchema makes Spark scan the data to guess types — slow;
        // prefer an explicit schema (which, when supplied, takes precedence).
        readOpts.put("inferSchema", "true");
        // Lines starting with '~' are treated as comments and skipped.
        readOpts.put("comment", "~");
        // dateFormat must match how dates appear in the source data,
        // otherwise parsing fails (yields null).
        readOpts.put("dateFormat", "dd/MM/yyyy HH:mm");

        // NOTE: supply schema and options together — with a schema but the
        // wrong/missing dateFormat, the date column parses to null.

        Dataset<Row> data = spark.read().schema(customSchema).options(readOpts).csv(csvDS);
        data.show(false);
//                +---+-----+----------+
//                |age|name |date      |
//                +---+-----+----------+
//                |23 |jeffy|2015-08-26|
//                |34 |katy |2014-10-27|
//                +---+-----+----------+

        // Write options: gzip-compressed output, dates stored as yyyy/MM/dd.
        Map<String, String> writeOpts = new HashMap<>();
        writeOpts.put("comment", "~");
        writeOpts.put("compression", "gzip");
        writeOpts.put("dateFormat", "yyyy/MM/dd");
        data.write().mode(SaveMode.Overwrite).options(writeOpts).csv(Utils.BASE_PATH + "/csv_options");

        spark.read().csv(Utils.BASE_PATH + "/csv_options").show(false);
//                +---+-----+----------+
//                |_c0|_c1  |_c2       |
//                +---+-----+----------+
//                |23 |jeffy|2015/08/26|
//                |34 |katy |2014/10/27|
//                +---+-----+----------+
        spark.stop();
    }
}


 

你可能感兴趣的:(#,spark,spark)