This article demonstrates, through code, how Spark reads various kinds of data sources: Hive, JDBC, JSON, and Parquet.
1 Reading Hive data with Spark
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;

/**
 * Hive data source
 * @author Administrator
 */
public class HiveDataSource {

    @SuppressWarnings("deprecation")
    public static void main(String[] args) {
        // First, create the SparkConf
        SparkConf conf = new SparkConf()
                .setAppName("HiveDataSource");
        // Create the JavaSparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Create the HiveContext; note that it takes a SparkContext as its argument, not a JavaSparkContext
        HiveContext hiveContext = new HiveContext(sc.sc());

        // Feature 1: HiveContext's sql() method can execute any HiveQL statement that Hive itself can execute
        // Drop the student_infos table if it already exists
        hiveContext.sql("DROP TABLE IF EXISTS student_infos");
        // Create the student_infos table if it does not exist
        hiveContext.sql("CREATE TABLE IF NOT EXISTS student_infos (name STRING, age INT)");
        // Load the basic student information into student_infos
        hiveContext.sql("LOAD DATA "
                + "LOCAL INPATH '/usr/local/spark-study/resources/student_infos.txt' "
                + "INTO TABLE student_infos");

        // Load data into student_scores the same way
        hiveContext.sql("DROP TABLE IF EXISTS student_scores");
        hiveContext.sql("CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT)");
        hiveContext.sql("LOAD DATA "
                + "LOCAL INPATH '/usr/local/spark-study/resources/student_scores.txt' "
                + "INTO TABLE student_scores");

        // Feature 2: sql() returns a DataFrame that can be used for further queries
        // Join the two tables and select the students with a score of at least 80
        DataFrame goodStudentsDF = hiveContext.sql("SELECT si.name, si.age, ss.score "
                + "FROM student_infos si "
                + "JOIN student_scores ss ON si.name=ss.name "
                + "WHERE ss.score>=80");

        // Feature 3: a DataFrame (whose underlying RDD elements are Rows) can be saved into a Hive table
        // Save the data in the DataFrame into the good_student_infos table
        hiveContext.sql("DROP TABLE IF EXISTS good_student_infos");
        goodStudentsDF.saveAsTable("good_student_infos");

        // Feature 4: the table() method creates a DataFrame directly from a Hive table
        // Create a DataFrame from the good_student_infos table and print its rows
        Row[] goodStudentRows = hiveContext.table("good_student_infos").collect();
        for(Row goodStudentRow : goodStudentRows) {
            System.out.println(goodStudentRow);
        }

        sc.close();
    }

}
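The example above uses the Spark 1.x API (HiveContext and DataFrame). On Spark 2.x and later, HiveContext is deprecated in favor of SparkSession with Hive support enabled. The following is only a minimal sketch of the same query on the newer API, assuming Hive support was compiled in and hive-site.xml is reachable; the class name is illustrative:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Minimal sketch: the same Hive query on Spark 2.x+, assuming Hive support is available.
public class HiveDataSourceSparkSession {

    public static void main(String[] args) {
        // enableHiveSupport() connects the session to the Hive metastore (hive-site.xml must be on the classpath)
        SparkSession spark = SparkSession.builder()
                .appName("HiveDataSource")
                .enableHiveSupport()
                .getOrCreate();

        // spark.sql() plays the role that hiveContext.sql() played in the 1.x example
        Dataset<Row> goodStudentsDF = spark.sql(
                "SELECT si.name, si.age, ss.score "
                + "FROM student_infos si "
                + "JOIN student_scores ss ON si.name = ss.name "
                + "WHERE ss.score >= 80");

        goodStudentsDF.show();
        spark.stop();
    }

}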
2 Reading a JDBC data source with Spark
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * JDBC data source
 * @author Administrator
 */
public class JDBCDataSource {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("JDBCDataSource");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Summary of the JDBC data source workflow:
        // 1. Load the MySQL tables into DataFrames with SQLContext's read API
        // 2. Convert the DataFrames to RDDs and process them with Spark Core operators
        // 3. Write the results back to MySQL, HBase, Redis, or another store / cache via foreach()

        // Load the two MySQL tables into separate DataFrames
        Map<String, String> options = new HashMap<String, String>();
        options.put("url", "jdbc:mysql://spark1:3306/testdb");
        options.put("dbtable", "student_infos");
        DataFrame studentInfosDF = sqlContext.read().format("jdbc")
                .options(options).load();

        options.put("dbtable", "student_scores");
        DataFrame studentScoresDF = sqlContext.read().format("jdbc")
                .options(options).load();

        // Convert both DataFrames to JavaPairRDDs and join them
        JavaPairRDD<String, Tuple2<Integer, Integer>> studentsRDD =
                studentInfosDF.javaRDD().mapToPair(
                        new PairFunction<Row, String, Integer>() {

                            private static final long serialVersionUID = 1L;

                            @Override
                            public Tuple2<String, Integer> call(Row row) throws Exception {
                                return new Tuple2<String, Integer>(row.getString(0),
                                        Integer.valueOf(String.valueOf(row.get(1))));
                            }

                        })
                .join(studentScoresDF.javaRDD().mapToPair(
                        new PairFunction<Row, String, Integer>() {

                            private static final long serialVersionUID = 1L;

                            @Override
                            public Tuple2<String, Integer> call(Row row) throws Exception {
                                return new Tuple2<String, Integer>(String.valueOf(row.get(0)),
                                        Integer.valueOf(String.valueOf(row.get(1))));
                            }

                        }));

        // Convert the JavaPairRDD to a JavaRDD of Rows
        JavaRDD<Row> studentRowsRDD = studentsRDD.map(
                new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Row call(
                            Tuple2<String, Tuple2<Integer, Integer>> tuple) throws Exception {
                        return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
                    }

                });

        // Keep only the rows with a score above 80
        JavaRDD<Row> filteredStudentRowsRDD = studentRowsRDD.filter(
                new Function<Row, Boolean>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Boolean call(Row row) throws Exception {
                        if(row.getInt(2) > 80) {
                            return true;
                        }
                        return false;
                    }

                });

        // Convert back to a DataFrame
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);

        DataFrame studentsDF = sqlContext.createDataFrame(filteredStudentRowsRDD, structType);

        Row[] rows = studentsDF.collect();
        for(Row row : rows) {
            System.out.println(row);
        }

        // Save the data in the DataFrame to a MySQL table
        // This pattern is very common in practice: the target might be MySQL, HBase, or a Redis cache
        studentsDF.javaRDD().foreach(new VoidFunction<Row>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Row row) throws Exception {
                String sql = "insert into good_student_infos values("
                        + "'" + String.valueOf(row.getString(0)) + "',"
                        + Integer.valueOf(String.valueOf(row.get(1))) + ","
                        + Integer.valueOf(String.valueOf(row.get(2))) + ")";

                Class.forName("com.mysql.jdbc.Driver");

                Connection conn = null;
                Statement stmt = null;
                try {
                    conn = DriverManager.getConnection(
                            "jdbc:mysql://spark1:3306/testdb", "", "");
                    stmt = conn.createStatement();
                    stmt.executeUpdate(sql);
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    if(stmt != null) {
                        stmt.close();
                    }
                    if(conn != null) {
                        conn.close();
                    }
                }
            }

        });

        sc.close();
    }

}
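Note that the foreach() at the end opens a new JDBC connection for every single row. Spark's DataFrameWriter also has a built-in jdbc() sink (available from Spark 1.4), which is usually simpler when the target is a relational database. The following is only a minimal sketch of that alternative; the credentials below are placeholders:

import java.util.Properties;

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SaveMode;

// Minimal sketch: write the filtered DataFrame back to MySQL via DataFrameWriter.jdbc()
// instead of opening one connection per row. User and password are placeholders.
public class JdbcWriteSketch {

    public static void writeBack(DataFrame studentsDF) {
        Properties props = new Properties();
        props.put("user", "root");          // placeholder credentials
        props.put("password", "password");
        props.put("driver", "com.mysql.jdbc.Driver");

        // Appends the rows of the DataFrame to the good_student_infos table
        studentsDF.write()
                .mode(SaveMode.Append)
                .jdbc("jdbc:mysql://spark1:3306/testdb", "good_student_infos", props);
    }

}

When the hand-rolled approach is still needed (for example when writing to HBase or Redis), foreachPartition() is generally preferred over foreach(), so that one connection can be shared by all rows in a partition instead of being created per row.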
3 Reading JSON data with Spark
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * JSON data source
 * @author Administrator
 */
public class JSONDataSource {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("JSONDataSource");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Create a DataFrame from a JSON file
        DataFrame studentScoresDF = sqlContext.read().json(
                "hdfs://spark1:9000/spark-study/students.json");

        // Register the score DataFrame as a temporary table, then query the names of
        // students with a score of at least 80 (run SQL against the temporary table)
        studentScoresDF.registerTempTable("student_scores");
        DataFrame goodStudentScoresDF = sqlContext.sql(
                "select name,score from student_scores where score>=80");

        // Convert the DataFrame to an RDD and apply a transformation
        List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(
                new Function<Row, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public String call(Row row) throws Exception {
                        return row.getString(0);
                    }

                }).collect();

        // Create a DataFrame from a JavaRDD<String> that contains JSON strings
        List<String> studentInfoJSONs = new ArrayList<String>();
        studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");
        studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");
        studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
        JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
        DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);

        // Register the student info DataFrame as a temporary table,
        // then query the basic information of the students who scored at least 80
        studentInfosDF.registerTempTable("student_infos");

        String sql = "select name,age from student_infos where name in (";
        for(int i = 0; i < goodStudentNames.size(); i++) {
            sql += "'" + goodStudentNames.get(i) + "'";
            if(i < goodStudentNames.size() - 1) {
                sql += ",";
            }
        }
        sql += ")";

        DataFrame goodStudentInfosDF = sqlContext.sql(sql);

        // Convert the two DataFrames to JavaRDDs, map them to JavaPairRDDs, then join them
        JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD =
                goodStudentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(Row row) throws Exception {
                        return new Tuple2<String, Integer>(row.getString(0),
                                Integer.valueOf(String.valueOf(row.getLong(1))));
                    }

                }).join(goodStudentInfosDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(Row row) throws Exception {
                        return new Tuple2<String, Integer>(row.getString(0),
                                Integer.valueOf(String.valueOf(row.getLong(1))));
                    }

                }));

        // Convert the joined RDD holding the full information of the good students to a JavaRDD<Row>
        JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(
                new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Row call(
                            Tuple2<String, Tuple2<Integer, Integer>> tuple) throws Exception {
                        return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
                    }

                });

        // Create a schema and convert the JavaRDD<Row> to a DataFrame
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);

        DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);

        // Save the full information of the good students to an external JSON file
        goodStudentsDF.write().format("json").save("hdfs://spark1:9000/spark-study/good-students");
    }

}
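Two details about the JSON reader are worth noting: it expects JSON Lines input (one self-contained JSON object per line, not a single pretty-printed array), and whole-number fields are inferred as LongType during schema inference, which is why the example reads the score and age with row.getLong(1) before converting them to Integer. The following is only a small sketch for confirming that; the sample record in the comment is purely illustrative, not the actual contents of students.json:

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

// Minimal sketch: inspect what the JSON reader infers. Assumes the same SQLContext and a
// students.json in JSON Lines form, e.g. one object per line such as {"name":"Leo", "score":85}
// (record purely illustrative).
public class JsonSchemaCheck {

    public static void printInferredSchema(SQLContext sqlContext) {
        DataFrame df = sqlContext.read().json("hdfs://spark1:9000/spark-study/students.json");
        // Whole-number JSON fields typically come back as long in the inferred schema,
        // hence the getLong() calls in the example above
        df.printSchema();
    }

}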
4 Reading Parquet data with Spark
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

/**
 * Parquet data source: loading data programmatically
 * @author Administrator
 */
public class ParquetLoadData {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("ParquetLoadData");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Read the Parquet file and create a DataFrame
        DataFrame usersDF = sqlContext.read().parquet(
                "hdfs://spark1:9000/spark-study/users.parquet");

        // Register the DataFrame as a temporary table, then query the required data with SQL
        usersDF.registerTempTable("users");
        DataFrame userNamesDF = sqlContext.sql("select name from users");

        // Apply a transformation to the query result, process the data, and print it
        List<String> userNames = userNamesDF.javaRDD().map(new Function<Row, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public String call(Row row) throws Exception {
                return "Name: " + row.getString(0);
            }

        }).collect();

        for(String userName : userNames) {
            System.out.println(userName);
        }
    }

}
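The example above only covers the load side; writing Parquet is symmetric and uses the same DataFrameWriter API already shown for JSON. The following is only a minimal sketch, and the output path is a placeholder:

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SaveMode;

// Minimal sketch: the write side of the Parquet data source. The output path is a placeholder.
public class ParquetSaveSketch {

    public static void saveUserNames(DataFrame userNamesDF) {
        // Overwrites any existing output; Parquet stores the schema with the data,
        // so the result can later be read back with sqlContext.read().parquet(...)
        userNamesDF.write()
                .mode(SaveMode.Overwrite)
                .parquet("hdfs://spark1:9000/spark-study/user-names-parquet");
    }

}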