1. Using groupByKey, mapGroups, and flatMapGroups together

groupByKey groups a Dataset by a key extracted from each record (here the employee's name). mapGroups then maps each group to exactly one output record, while flatMapGroups may emit zero or more records per group. The example below demonstrates all three on a Dataset<Employee> read from JSON.
package com.DataSet;

import bean.Dept;
import bean.Employee;
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;

import java.util.ArrayList;
import java.util.List;

public class DataSetConvert {

    private static SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("handle data")
            .getOrCreate();

    public static void main(String[] args) {
        spark.conf().set("spark.sql.crossJoin.enabled", "true");
        spark.sparkContext().setLogLevel("WARN");

        // Read the JSON file and map each row onto the Employee bean.
        Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
        String path = "spark-hello/src/main/resources/employees.json";
        Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);

        // flatMapGroups: each group may produce zero or more output rows.
        Dataset<Employee> out = flatMapGroups(groupByKey(ds));
        out.show();

        // mapGroups: each group is reduced to exactly one output row.
        Dataset<Dept> out2 = mapGroups(groupByKey(ds));
        out2.show();
    }

    // Group the Dataset by the employee's name.
    // The explicit MapFunction cast disambiguates the Java lambda from the Scala overload.
    public static KeyValueGroupedDataset<String, Employee> groupByKey(Dataset<Employee> ds) {
        return ds.groupByKey((MapFunction<Employee, String>) Employee::getName, Encoders.STRING());
    }

    // Collect all employees of one group into a single Dept record (one row per key).
    public static Dataset<Dept> mapGroups(KeyValueGroupedDataset<String, Employee> kvgDS) {
        return kvgDS.mapGroups((MapGroupsFunction<String, Employee, Dept>) (key, eList) -> {
            Dept dept = new Dept();
            eList.forEachRemaining(dept::addEmployee);
            return dept;
        }, Encoders.bean(Dept.class));
    }

    // Re-emit every employee of a group unchanged (zero or more rows per key).
    public static Dataset<Employee> flatMapGroups(KeyValueGroupedDataset<String, Employee> kvgDS) {
        return kvgDS.flatMapGroups((FlatMapGroupsFunction<String, Employee, Employee>) (key, eList) -> {
            List<Employee> employees = new ArrayList<>();
            eList.forEachRemaining(employees::add);
            return employees.iterator();
        }, Encoders.bean(Employee.class));
    }
}
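The bean.Employee and bean.Dept classes are not shown above. Below is a minimal sketch of what they might look like, assuming employees.json contains name and salary fields and that Dept only needs to accumulate the employees of a group; the field names and types are assumptions, not the original classes. Encoders.bean requires public no-arg-constructible beans with getters and setters for every field.

// bean/Employee.java (assumed shape)
package bean;

import java.io.Serializable;

public class Employee implements Serializable {
    private String name;
    private Long salary;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public Long getSalary() { return salary; }
    public void setSalary(Long salary) { this.salary = salary; }
}

// bean/Dept.java (assumed shape)
package bean;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

public class Dept implements Serializable {
    // Accumulates the employees belonging to one group.
    private List<Employee> employees = new ArrayList<>();

    public void addEmployee(Employee e) { employees.add(e); }
    public List<Employee> getEmployees() { return employees; }
    public void setEmployees(List<Employee> employees) { this.employees = employees; }
}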