The files SparkSQL reads and saves generally come in three kinds: JSON files, CSV files, and columnar files. Options can be added to handle different storage and compression formats.
1) Code implementation
package com.atguigu.sparksql;
import com.atguigu.sparksql.Bean.User;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
public class Test06_CSV {
    public static void main(String[] args) {
        //1. Create the configuration object
        SparkConf conf = new SparkConf().setAppName("sparksql").setMaster("local[*]");
        //2. Get the SparkSession
        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        //3. Business logic
        DataFrameReader reader = spark.read();
        // Add options and read the CSV file
        Dataset<Row> userDS = reader
                .option("header", "true") // defaults to false: do not treat the first row as column names
                .option("sep", ",")       // column separator, defaults to ","
                // no compression option needed: the codec is detected automatically
                .csv("input/user.csv");
        userDS.show();
        // Convert to a Dataset of User
        // Calling as() directly fails because every column read from CSV is a string
        // Dataset<User> userDS1 = userDS.as(Encoders.bean(User.class));
        userDS.printSchema();
        Dataset<User> userDS1 = userDS.map(new MapFunction<Row, User>() {
            @Override
            public User call(Row value) throws Exception {
                return new User(Long.valueOf(value.getString(0)), value.getString(1));
            }
        }, Encoders.bean(User.class));
        userDS1.show();
        // Write out as a CSV file
        DataFrameWriter<User> writer = userDS1.write();
        writer.option("sep", ";")
                .option("header", "true")
                // .option("compression","gzip") // compression codec
                // Save modes:
                //   Append        append to existing output
                //   Ignore        skip this write if output already exists
                //   Overwrite     overwrite existing output
                //   ErrorIfExists throw an error if output already exists
                .mode(SaveMode.Append)
                .csv("output");
        //4. Close the SparkSession
        spark.close();
    }
}
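Both the CSV example above and the JSON example below rely on a simple User bean from the com.atguigu.sparksql.Bean package. The bean itself is not shown in this section; a minimal sketch (field names and order are assumptions inferred from the constructor call and the JSON data) could look like this:
package com.atguigu.sparksql.Bean;
import java.io.Serializable;
// Hypothetical User bean: a plain serializable POJO with a no-arg constructor,
// getters and setters, as required by Encoders.bean()
public class User implements Serializable {
    private Long age;
    private String name;
    public User() { }
    public User(Long age, String name) {
        this.age = age;
        this.name = name;
    }
    public Long getAge() { return age; }
    public void setAge(Long age) { this.age = age; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
}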
package com.atguigu.sparksql;
import com.atguigu.sparksql.Bean.User;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
public class Test07_JSON {
    public static void main(String[] args) {
        //1. Create the configuration object
        SparkConf conf = new SparkConf().setAppName("sparksql").setMaster("local[*]");
        //2. Get the SparkSession
        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        //3. Business logic
        Dataset<Row> json = spark.read().json("input/user.json");
        // JSON data carries data types, so as() works directly here
        Dataset<User> userDS = json.as(Encoders.bean(User.class));
        userDS.show();
        // Data read from other formats can also be written out as JSON
        DataFrameWriter<User> writer = userDS.write();
        writer.json("output1");
        //4. Close the SparkSession
        spark.close();
    }
}
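Spark's JSON reader expects one JSON object per line (JSON Lines). As an illustration, input/user.json could contain records like the following (the exact contents of the data file are an assumption):
{"age":18,"name":"zhangsan"}
{"age":20,"name":"lisi"}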
Columnar files carry their own column structure: the schema is stored with the data, so no separator options are needed.
package com.atguigu.sparksql;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class Test08_Parquet {
    public static void main(String[] args) {
        //1. Create the configuration object
        SparkConf conf = new SparkConf().setAppName("sparksql").setMaster("local[*]");
        //2. Get the SparkSession
        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        //3. Business logic
        Dataset<Row> json = spark.read().json("input/user.json");
        // Writes use snappy compression by default
        // json.write().parquet("output");
        // Reading parquet needs no extra options: column names are stored in the file
        Dataset<Row> parquet = spark.read().parquet("input/user.parquet");
        parquet.printSchema();
        //4. Close the SparkSession
        spark.close();
    }
}
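If input/user.parquet does not exist yet, it can be generated once from the JSON data, for example by pointing the commented-out write at that path (the path itself is an assumption, chosen to match the read above):
        // run once to produce the parquet input, then switch back to reading it
        json.write().parquet("input/user.parquet");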
1) Import the dependency
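The pom.xml snippet is not reproduced here; what the code below needs is the MySQL JDBC driver. A typical entry (the version is an assumption, chosen to match the driver jar copied to the cluster later in this section) looks like:
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.27</version>
</dependency>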
2) Read data from MySQL
package com.atguigu.sparksql;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Properties;
public class Test09_Table {
    public static void main(String[] args) {
        //1. Create the configuration object
        SparkConf conf = new SparkConf().setAppName("sparksql").setMaster("local[*]");
        //2. Get the SparkSession
        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        //3. Business logic
        Dataset<Row> json = spark.read().json("input/user.json");
        // Connection properties
        Properties properties = new Properties();
        properties.setProperty("user", "root");
        properties.setProperty("password", "000000");
        // json.write()
        //         // for a table, the save mode decides between appending and overwriting rows
        //         .mode(SaveMode.Append)
        //         .jdbc("jdbc:mysql://hadoop102:3306", "gmall.testInfo", properties);
        Dataset<Row> jdbc = spark.read()
                .jdbc("jdbc:mysql://hadoop102:3306", "gmall.testInfo", properties);
        jdbc.show();
        //4. Close the SparkSession
        spark.close();
    }
}
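If Spark does not locate the MySQL driver class on its own, it can be named explicitly through the same Properties object (a sketch; com.mysql.jdbc.Driver is the class shipped with the 5.1.x connector used in this section):
        // optional: set the JDBC driver class explicitly
        properties.setProperty("driver", "com.mysql.jdbc.Driver");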
SparkSQL can use either the embedded Hive (the Hive that ships with Spark, usable out of the box) or an external Hive. In enterprise development, an external Hive is the usual choice.
1) Copy the MySQL JDBC driver into the jars directory of spark-yarn
[atguigu@hadoop102 spark-yarn]$ cp /opt/software/mysql-connector-java-5.1.27-bin.jar /opt/module/spark-yarn/jars
2) Copy hive-site.xml into the conf directory of spark-yarn
[atguigu@hadoop102 spark-yarn]$ cp /opt/module/hive/conf/hive-site.xml /opt/module/spark-yarn/conf
3) Start the spark-sql client
[atguigu@hadoop102 spark-yarn]$ bin/spark-sql --master yarn
spark-sql (default)> show tables;
1) Add the dependency
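The pom.xml entry is again omitted; what enableHiveSupport() needs on the classpath is the spark-hive module (plus the MySQL driver for the metastore). A sketch, where the Scala suffix and version are assumptions that must match your Spark build:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.12</artifactId>
    <version>${spark.version}</version>
</dependency>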
2) Copy hive-site.xml into the resources directory (if you also need to access Hadoop, copy hdfs-site.xml, core-site.xml, and yarn-site.xml as well)
3) Code implementation
package com.atguigu.sparksql;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
public class Test10_Hive {
    public static void main(String[] args) {
        System.setProperty("HADOOP_USER_NAME", "atguigu");
        //1. Create the configuration object
        SparkConf conf = new SparkConf().setAppName("sparksql").setMaster("local[*]");
        //2. Get the SparkSession
        SparkSession spark = SparkSession.builder()
                .enableHiveSupport() // enable Hive support
                .config(conf).getOrCreate();
        //3. Business logic
        spark.sql("show tables").show();
        spark.sql("create table user_info(name String,age bigint)");
        spark.sql("insert into table user_info values('zhangsan',10)");
        spark.sql("select * from user_info").show();
        //4. Close the SparkSession
        spark.close();
    }
}
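Note that the create table statement fails on a second run because user_info already exists; adding IF NOT EXISTS keeps the example re-runnable:
        // re-runnable variant of the DDL statement
        spark.sql("create table if not exists user_info(name String, age bigint)");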