SPARK-SQL - DataFrame创建方式汇总

创建DataFrame的方式

  1. 从JavaRDD与类类型中创建
  2. 从List与类类型中创建
  3. 从JavaRDD与schema中创建
  4. 从List与schema中创建
  5. 从外部数据源中创建 如spark.read().json等
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import pojo.Dog;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Demonstrates the five ways to create a DataFrame in Spark SQL's Java API:
 *   1. from a JavaRDD of JavaBeans + the bean class
 *   2. from a List of JavaBeans + the bean class
 *   3. from a JavaRDD&lt;Row&gt; + an explicit StructType schema
 *   4. from a List&lt;Row&gt; + an explicit StructType schema
 *   5. from an external data source, e.g. spark.read().json(...)
 *
 * NOTE(review): relies on project-local {@code pojo.Dog} (a JavaBean with a
 * (name, color) constructor — confirm it has getters, which bean-based
 * createDataFrame requires) and {@code Utils.BASE_PATH} for input files.
 */
public class test_21 {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                // pin the driver host so the example runs on machines with
                // unresolvable hostnames
                .config("spark.driver.host", "localhost")
                .appName("DataFrameCreation")
                .master("local")
                .getOrCreate();

        // keep the console output readable in this demo
        spark.sparkContext().setLogLevel("ERROR");

        JavaSparkContext javaSparkContext = new JavaSparkContext(spark.sparkContext());

        Dog dog1 = new Dog("jitty", "red");
        Dog dog2 = new Dog("mytty", "yellow");
        List<Dog> dogs = Arrays.asList(dog1, dog2);

        JavaRDD<Dog> dogJavaRDD = javaSparkContext.parallelize(dogs);
        // 1: from a JavaRDD of beans; the schema is inferred from Dog's getters
        Dataset<Row> dogDf = spark.createDataFrame(dogJavaRDD, Dog.class);
        dogDf.printSchema();
        dogDf.show();

        // 2: from a List of beans, no RDD needed
        Dataset<Row> dogListDf = spark.createDataFrame(dogs, Dog.class);
        dogListDf.printSchema();
        dogListDf.show();

        // 3: from a JavaRDD<Row> plus a programmatically built schema
        JavaRDD<String> peopleRDD = spark.sparkContext()
                .textFile(Utils.BASE_PATH + "/people.txt", 1)
                .toJavaRDD();

        // Parse each CSV line into a Row. The typed Function<String, Row> is
        // required: with the raw Function, call(String) would not override
        // call(Object) and the code would not compile.
        JavaRDD<Row> rowRDD = peopleRDD.map(
                (Function<String, Row>) record -> {
                    // assumes each line is "name,age" — TODO confirm against people.txt
                    String[] attributes = record.split(",");
                    return RowFactory.create(attributes[0], attributes[1].trim());
                });

        String schemaString = "name age";
        // Generate the schema based on the space-separated field names;
        // both columns are modelled as nullable strings.
        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(" ")) {
            fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true));
        }
        StructType schema = DataTypes.createStructType(fields);
        // Apply the schema to the RDD
        Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);
        peopleDataFrame.printSchema();
        peopleDataFrame.show();

        // 4: from a List<Row> plus the same schema (collect() pulls the rows
        // to the driver — fine for this tiny demo dataset)
        List<Row> listRows = rowRDD.collect();
        Dataset<Row> peopleListRowDataFrame = spark.createDataFrame(listRows, schema);
        peopleListRowDataFrame.show();

        // 5: from an external data source (JSON); schema is inferred by the reader
        Dataset<Row> personDataset =
                spark.read().json(Utils.BASE_PATH + "/IoT_device_info.json");
        personDataset.show();

        spark.stop();
    }
}

 

你可能感兴趣的:(#,spark)