Spark SQL mainly operates on DataFrames, and a DataFrame itself provides load and save operations.
Load: creates a DataFrame from a data source.
Save: writes the data in a DataFrame out to a file. In both directions a format can be specified, to indicate the type of file we want to read and the type of file we want to write.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
import java.util.List;
/**
* Spark SQL read and write operations
* @author DT_大数据梦工厂
*/
public class SparkSQLLoadSaveOps {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLLoadSaveOps");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext= new SQLContext(sc);
/**
* read() returns a DataFrameReader; load() then reads the data in
*/
DataFrame peopleDF = sqlContext.read().format("json").load("hdfs://master:9000/user/people.json");
/**
* Operate on the DataFrame directly.
* JSON is a self-describing format. How does Spark determine the schema when reading JSON?
* By scanning the whole JSON input; only after the scan is the metadata known.
*/
//Use mode to specify that the output is appended: new files are created to append the data
peopleDF.select("name").write().mode(SaveMode.Append).save("/user/personNames");
}
}
Note:
Debugging: if the following error appears when running on Windows:
ERROR Shell: Failed to locate the winutils binary in the hadoop binary path
java.io.IOException: Could not locate executable null\bin\winutils.exe in the Hadoop binaries.
you need to add the corresponding winutils binaries under hadoop/bin; see:
http://www.srccodes.com/p/article/39/error-util-shell-failed-locate-winutils-binary-hadoop-binary-path
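Alternatively, as the later examples in this section do, you can point hadoop.home.dir at a local directory that contains bin\winutils.exe before the SparkContext is created. A minimal sketch, assuming winutils.exe has been placed under G:/hadoop-2.6.0/bin (the path is only an example):
//Set hadoop.home.dir before any Hadoop classes are touched, so that Shell can find bin\winutils.exe on Windows
System.setProperty("hadoop.home.dir", "G:/hadoop-2.6.0");
SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLLoadSaveOps");
JavaSparkContext sc = new JavaSparkContext(conf);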
1. The read method returns a DataFrameReader, which is used to read data. In SQLContext.scala:
/**
* :: Experimental ::
* Returns a [[DataFrameReader]] that can be used to read data in as a [[DataFrame]].
* {{{
* sqlContext.read.parquet("/path/to/file.parquet")
* sqlContext.read.schema(schema).json("/path/to/file.json")
* }}}
*
* @group genericdata
* @since 1.4.0
*/
@Experimental
//Creates a DataFrameReader instance, i.e. returns a DataFrameReader reference
def read: DataFrameReader = new DataFrameReader(this)
2. Then call format on the DataFrameReader to specify the format of the file to be read.
/**
* Specifies the input data source format.
*
* @since 1.4.0
*/
def format(source: String): DataFrameReader = {
this.source = source
this
}
3. The load method of DataFrameReader turns the input at the given path into a DataFrame.
/**
* Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by
* a local or distributed file system).
*
* @since 1.4.0
*/
// TODO: Remove this one in Spark 2.0.
def load(path: String): DataFrame = {
option("path", path).load()
}
At this point the read side is complete, and we can operate on the DataFrame.
Next comes the write side.
1. Call select on the DataFrame to pick out columns.
/**
* Selects a set of columns. This is a variant of `select` that can only select
* existing columns using column names (i.e. cannot construct expressions).
*
* {{{
* // The following two are equivalent:
* df.select("colA", "colB")
* df.select($"colA", $"colB")
* }}}
* @group dfops
* @since 1.3.0
*/
@scala.annotation.varargs
def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)) : _*)
/**
* :: Experimental ::
* Interface for saving the content of the [[DataFrame]] out into external storage.
*
* @group output
* @since 1.4.0
*/
@Experimental
def write: DataFrameWriter = new DataFrameWriter(this)
/**
* Specifies the behavior when data or table already exists. Options include:
* Overwrite replaces any existing data
* - `SaveMode.Overwrite`: overwrite the existing data.
* //new files are created and the data is appended
* - `SaveMode.Append`: append the data.
* - `SaveMode.Ignore`: ignore the operation (i.e. no-op).
* - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
*
* @since 1.4.0
*/
def mode(saveMode: SaveMode): DataFrameWriter = {
this.mode = saveMode
this
}
/**
* Saves the content of the [[DataFrame]] at the specified path.
*
* @since 1.4.0
*/
def save(path: String): Unit = {
this.extraOptions += ("path" -> path)
save()
}
The Spark SQL read/write flow:
/**
* Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by
* a local or distributed file system).
*
* @since 1.4.0
*/
// TODO: Remove this one in Spark 2.0.
def load(path: String): DataFrame = {
//option returns the DataFrameReader itself, so load() can be chained
option("path", path).load()
}
Tracing into the load source code:
2. The load() method in DataFrameReader resolves the configured source and turns the input into a DataFrame.
/**
* Loads input in as a [[DataFrame]], for data sources that don't require a path (e.g. external
* key-value stores).
*
* @since 1.4.0
*/
def load(): DataFrame = {
//Resolve the configured source into a concrete data source
val resolved = ResolvedDataSource(
sqlContext,
userSpecifiedSchema = userSpecifiedSchema,
partitionColumns = Array.empty[String],
provider = source,
options = extraOptions.toMap)
DataFrame(sqlContext, LogicalRelation(resolved.relation))
}
/**
* Specifies the input data source format.
*
* @since 1.4.0
*/
def format(source: String): DataFrameReader = {
this.source = source //the format name, e.g. "json" or "parquet"
this
}
/**
* :: Experimental ::
* Interface for saving the content of the [[DataFrame]] out into external storage.
*
* @group output
* @since 1.4.0
*/
@Experimental
def write: DataFrameWriter = new DataFrameWriter(this)
/**
* :: Experimental ::
* Interface used to write a [[DataFrame]] to external storage systems (e.g. file systems,
* key-value stores, etc). Use [[DataFrame.write]] to access this.
*
* @since 1.4.0
*/
@Experimental
final class DataFrameWriter private[sql](df: DataFrame) {
/**
* Specifies the behavior when data or table already exists. Options include:
* - `SaveMode.Overwrite`: overwrite the existing data.
* - `SaveMode.Append`: append the data.
* - `SaveMode.Ignore`: ignore the operation (i.e. no-op).
* //the default option
* - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
*
* @since 1.4.0
*/
def mode(saveMode: SaveMode): DataFrameWriter = {
this.mode = saveMode
this
}
/**
* Specifies the behavior when data or table already exists. Options include:
* - `overwrite`: overwrite the existing data.
* - `append`: append the data.
* - `ignore`: ignore the operation (i.e. no-op).
* - `error`: default option, throw an exception at runtime.
*
* @since 1.4.0
*/
def mode(saveMode: String): DataFrameWriter = {
this.mode = saveMode.toLowerCase match {
case "overwrite" => SaveMode.Overwrite
case "append" => SaveMode.Append
case "ignore" => SaveMode.Ignore
case "error" | "default" => SaveMode.ErrorIfExists
case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " +
"Accepted modes are 'overwrite', 'append', 'ignore', 'error'.")
}
this
}
/**
* Saves the content of the [[DataFrame]] at the specified path.
*
* @since 1.4.0
*/
def save(path: String): Unit = {
this.extraOptions += ("path" -> path)
save()
}
/**
* Saves the content of the [[DataFrame]] as the specified table.
*
* @since 1.4.0
*/
def save(): Unit = {
ResolvedDataSource(
df.sqlContext,
source,
partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
mode,
extraOptions.toMap,
df)
}
private var source: String = df.sqlContext.conf.defaultDataSourceName
The default value of DEFAULT_DATA_SOURCE_NAME is parquet.
// This is used to set the default data source
val DEFAULT_DATA_SOURCE_NAME = stringConf("spark.sql.sources.default",
defaultValue = Some("org.apache.spark.sql.parquet"),
doc = "The default data source to use in input/output.")
/**
* Returns the object itself.
* @group basic
* @since 1.3.0
*/
// This is declared with parentheses to prevent the Scala compiler from treating
// `rdd.toDF("1")` as invoking this toDF and then apply on the returned DataFrame.
def toDF(): DataFrame = this
/**
* Displays the [[DataFrame]] in a tabular form. For example:
* {{{
* year month AVG('Adj Close) MAX('Adj Close)
* 1980 12 0.503218 0.595103
* 1981 01 0.523289 0.570307
* 1982 02 0.436504 0.475256
* 1983 03 0.410516 0.442194
* 1984 04 0.450090 0.483521
* }}}
* @param numRows Number of rows to show
* @param truncate Whether truncate long strings. If true, strings more than 20 characters will
* be truncated and all cells will be aligned right
*
* @group action
* @since 1.5.0
*/
// scalastyle:off println
def show(numRows: Int, truncate: Boolean): Unit = println(showString(numRows, truncate))
// scalastyle:on println
Tracing into showString, shown below: showString triggers an action (take) to collect the data.
/**
* Compose the string representing rows for output
* @param _numRows Number of rows to show
* @param truncate Whether truncate long strings and align cells right
*/
private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
val numRows = _numRows.max(0)
val sb = new StringBuilder
val takeResult = take(numRows + 1)
val hasMoreData = takeResult.length > numRows
val data = takeResult.take(numRows)
val numCols = schema.fieldNames.length
1. Historically, the industry's big-data analytics pipelines have generally followed one of two patterns:
a) Data Source -> HDFS -> MR/Hive/Spark (effectively ETL) -> HDFS Parquet -> Spark SQL/Impala -> ResultService (the results may be stored in a DB and may also be exposed as a data service over JDBC/ODBC);
b) Data Source -> real-time updates into HBase/DB -> export to Parquet -> Spark SQL/Impala -> ResultService (as above, stored in a DB or exposed over JDBC/ODBC);
The second pattern can be replaced entirely by Kafka + Spark Streaming + Spark SQL (internally, storing the data as Parquet is strongly recommended).
2. The preferred pipeline:
DataSource -> Kafka -> Spark Streaming -> Parquet -> Spark SQL (ML, GraphX, etc.) -> Parquet -> other data mining workloads.
Java version:
package com.dt.sparksql;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
public class SparkSQLParquetOps {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLParquetOps");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
//Read the data
DataFrame usersDF = sqlContext.read().parquet("G:/users.parquet");
//Register as a temporary table for subsequent SQL queries
usersDF.registerTempTable("users");
//Perform multi-dimensional analysis of the data
DataFrame result = sqlContext.sql("select * from users");
//Process the result, including converting the DataFrame to an RDD and persisting the result
List<Row> listRow = result.javaRDD().collect();
for(Row row : listRow){
System.out.println(row);
}
}
}
Another version:
package com.dt.spark.SparkApps.sql;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
public class SparkSQLParquetOps {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLParquetOps");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
DataFrame usersDF = sqlContext.read().parquet("E:\\users.parquet");
/**
* Register as a temporary table for subsequent SQL queries
*/
usersDF.registerTempTable("users");
/**
* Perform multi-dimensional analysis of the data
*/
DataFrame result = sqlContext.sql("select * from users");
JavaRDD<String> resultRDD = result.javaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) throws Exception {
return "The name is : " + row.getAs("name");
}
});
/**
* Step 6: process the result, including converting the DataFrame to an RDD and persisting the result
*/
List<String> listRow = resultRDD.collect();
for(String row : listRow){
System.out.println(row);
}
}
}
Parquet file operations, Scala version:
val sqlContext = new SQLContext(sc)
val usersDF = sqlContext.read.parquet("/user/users.parquet")
usersDF.show()
usersDF.registerTempTable("users")
val result = sqlContext.sql("select * from users")
result.rdd.collect.foreach(println)
Result:
[Alyssa,null,ArrayBuffer(3, 9, 15, 20)]
[Ben,red,ArrayBuffer()]
1. If HDFS is the de facto standard for distributed file storage in the big-data era, Parquet is the de facto standard for file storage formats.
2. Faster: comparing Spark SQL over plain files such as CSV with Spark SQL over Parquet, Parquet is roughly 10x faster in the vast majority of cases (and in some cases where a job over plain files cannot even run successfully on Spark, switching to Parquet often lets it complete).
3. Parquet's compression support is very stable and mature. Jobs whose compression handling fails in Spark SQL over plain files (for example with lost tasks or lost executors) can often complete normally when the data is stored as Parquet.
4. Greatly reduced disk I/O: Parquet typically cuts storage space by about 75%, which drastically reduces the amount of input data Spark SQL has to process; in Spark 1.6.x, pushed-down filters can in some cases further reduce disk I/O and memory usage substantially.
5. Spark 1.6.x plus Parquet greatly increases scan throughput and therefore lookup speed; Spark 1.6 is roughly twice as fast as Spark 1.5.x here, and CPU usage when operating on Parquet was also heavily optimized in Spark 1.6.x.
6. Using Parquet also helps Spark's scheduling and execution: our tests show that with Parquet, Spark can reduce the cost of stage execution and optimize the execution path.
1. What is the underlying layout of columnar storage? Conceptually it is a tree-shaped data structure, backed internally by a table with metadata.
2. A Parquet file has three core components:
a) Storage Format: Parquet defines the concrete data types and on-disk storage format;
b) Object Model Converters: responsible for mapping between the data objects of a computation framework and the concrete data types in a Parquet file;
c) Object Models: Parquet has its own Object Model for the stored format. Other systems such as Avro have their own Object Models, but when Parquet processes data in those formats it stores it using its own Object Model.
After the mapping is done, Parquet applies its own column encoding and then writes the data out as a Parquet-format file.
3. Modules
The parquet-format project contains format specifications and Thrift definitions of metadata required to properly read Parquet files.
The parquet-mr project contains multiple sub-modules, which implement the core components of reading and writing a nested, column-oriented data stream, map this core onto the parquet format, and provide Hadoop Input/Output Formats, Pig loaders, and other java-based utilities for interacting with Parquet.
The parquet-compatibility project contains compatibility tests that can be used to verify that implementations in different languages can read and write each other’s files.
4. An example:
message AddressBook {
required string owner;
repeated string ownerPhoneNumbers;
repeated group contacts {
required string name;
optional string phoneNumber;
}
}
required (appears exactly once), optional (appears 0 or 1 times), repeated (appears 0 or more times)
In this schema each record represents one person's AddressBook. There is exactly one owner; the owner can have zero or more ownerPhoneNumbers and zero or more contacts. Each contact has exactly one name, and a contact's phoneNumber is optional.
First: as far as storing the data itself is concerned, only the leaf nodes matter; here the leaf nodes are owner, ownerPhoneNumbers, name and phoneNumber.
Second: logically, the schema is essentially a table:
AddressBook
owner    ownerPhoneNumbers    contacts
                              name    phoneNumber
Third: within a Parquet file the data is split into Row Groups (each containing many columns, and each column carrying several very important properties such as the Repetition Level and Definition Level).
Fourth: columns are stored in Parquet as Pages; a Page holds the Repetition Levels, Definition Levels and the data itself.
Fifth: the Row Group is the buffering unit for reads and writes in Parquet, so its configuration strongly affects Parquet's speed and efficiency. For log analysis we generally recommend setting the Row Group buffer to about 256MB; many deployments use about 1GB. For maximum efficiency it is strongly recommended to keep the HDFS block size consistent with the Row Group size.
This is the record shredding and assembly algorithm.
Sixth: at storage time the tree structure is turned into a flat two-dimensional table through a clever encoding algorithm:
Repetition Level    Definition Level    Value
1                   2                   18610086859
0                   1                   "Spark"
0                   0                   NULL
See the Parquet website:
http://parquet.apache.org/documentation/latest/
Key concepts in Parquet:
1 Block
2 File
3 Column Chunk
When data is stored column by column, each column's data is split into multiple column chunks. The logical storage view is the Row Group: the column chunks of all columns together form a Row Group, and a Row Group is stored as Pages.
The Page concept: logically, each column chunk is divided into Pages, and the Page is the smallest unit of compression and encoding. A Parquet file contains at least one Row Group. Each Row Group usually contains several column chunks; each column corresponds to exactly one column chunk within the Row Group, and each column chunk contains one or more Pages. In effect, one Parquet data block (a Block) can be equated with one Row Group; this is completely different from an HDFS block.
Parquet also contains metadata, of several kinds:
1 file metadata
2 Row Group metadata
3 column chunk metadata
Serialization and deserialization account for 60-80% of the time spent reading and writing Parquet files.
See http://blog.csdn.net/slq1023/article/details/51051592
Spark SQL and Parquet from the API point of view (diagram):
Parquet source code flow diagram:
see http://blog.csdn.net/slq1023/article/details/51051522
Like ProtocolBuffer, Avro, and Thrift, Parquet also supports schema evolution. Users can start with a simple schema, and gradually add more columns to the schema as needed. In this way, users may end up with multiple Parquet files with different but mutually compatible schemas. The Parquet data source is now able to automatically detect this case and merge schemas of all these files.
Since schema merging is a relatively expensive operation, and is not a necessity in most cases, we turned it off by default starting from 1.5.0. You may enable it by
1. setting data source option mergeSchema to true when reading Parquet files (as shown in the examples below), or
2. setting the global SQL option spark.sql.parquet.mergeSchema to true.
// This is used to implicitly convert an RDD to a DataFrame.
import spark.implicits._
// Create a simple DataFrame, store into a partition directory
val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square")
squaresDF.write.parquet("data/test_table/key=1")
// Create another DataFrame in a new partition directory,
// adding a new column and dropping an existing column
val cubesDF = spark.sparkContext.makeRDD(6 to 10).map(i => (i, i * i * i)).toDF("value", "cube")
cubesDF.write.parquet("data/test_table/key=2")
// Read the partitioned table
val mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
mergedDF.printSchema()
// The final schema consists of all 3 columns in the Parquet files together
// with the partitioning column appeared in the partition directory paths
// root
// |-- value: int (nullable = true)
// |-- square: int (nullable = true)
// |-- cube: int (nullable = true)
// |-- key : int (nullable = true)
See the official documentation:
http://spark.apache.org/docs/latest/sql-programming-guide.html
The example uses the df.write method to write the DataFrame's data to HDFS in Parquet format.
Let's interpret this example from the source code's perspective.
In the DataFrame.scala class we can find the write method:
/**
* :: Experimental ::
* Interface for saving the content of the [[DataFrame]] out into external storage.
*
* @group output
* @since 1.4.0
*/
@Experimental
def write: DataFrameWriter = new DataFrameWriter(this)
As you can see, DataFrame's write method simply creates a DataFrameWriter instance.
In the DataFrameWriter class we can find the parquet method:
/**
* Saves the content of the [[DataFrame]] in Parquet format at the specified path.
* This is equivalent to:
* {{{
* format("parquet").save(path)
* }}}
*
* @since 1.4.0
*/
def parquet(path: String): Unit = format("parquet").save(path)
So the parquet method is just a shortcut for format("parquet").save(path).
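As a quick illustration, the write from the first example could be expressed either way; a minimal sketch reusing peopleDF and the /user/personNames path from above:
//The shortcut and the explicit format()/save() chain are equivalent:
peopleDF.select("name").write().mode(SaveMode.Append).format("parquet").save("/user/personNames");
peopleDF.select("name").write().mode(SaveMode.Append).parquet("/user/personNames");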
The source of the format method is as follows:
/**
* Specifies the underlying output data source. Built-in options include "parquet", "json", etc.
*
* @since 1.4.0
*/
def format(source: String): DataFrameWriter = {
this.source = source
this
}
The format method simply records the "parquet" format name on the writer and returns the writer itself; the save operation is then performed.
/**
* Saves the content of the [[DataFrame]] at the specified path.
*
* @since 1.4.0
*/
def save(path: String): Unit = {
this.extraOptions += ("path" -> path)
save()
}
The save(path) operation records the path into extraOptions:
private var extraOptions = new scala.collection.mutable.HashMap[String, String]
As you can see, extraOptions is a mutable HashMap.
save(path) then delegates to the no-argument save() method:
/**
* Saves the content of the [[DataFrame]] as the specified table.
*
* @since 1.4.0
*/
def save(): Unit = {
ResolvedDataSource(
df.sqlContext,
source,
partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
mode,
extraOptions.toMap,
df)
}
The save() method mainly just calls the apply method of ResolvedDataSource:
/** Create a [[ResolvedDataSource]] for saving the content of the given DataFrame. */
def apply(
sqlContext: SQLContext, //corresponds to df.sqlContext in save()
provider: String, //corresponds to source in save(), i.e. the "parquet" format name
partitionColumns: Array[String],
mode: SaveMode,
options: Map[String, String],
data: DataFrame): ResolvedDataSource = {
if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
throw new AnalysisException("Cannot save interval data type into external storage.")
}
val clazz: Class[_] = lookupDataSource(provider)
val relation = clazz.newInstance() match {
case dataSource: CreatableRelationProvider =>
dataSource.createRelation(sqlContext, mode, options, data)
case dataSource: HadoopFsRelationProvider =>
// Don't glob path for the write path. The contracts here are:
// 1. Only one output path can be specified on the write path;
// 2. Output path must be a legal HDFS style file system path;
// 3. It's OK that the output path doesn't exist yet;
val caseInsensitiveOptions = new CaseInsensitiveMap(options)
val outputPath = {
val path = new Path(caseInsensitiveOptions("path"))
val fs = path.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
path.makeQualified(fs.getUri, fs.getWorkingDirectory)
}
val caseSensitive = sqlContext.conf.caseSensitiveAnalysis
PartitioningUtils.validatePartitionColumnDataTypes(
data.schema, partitionColumns, caseSensitive)
val equality = columnNameEquality(caseSensitive)
val dataSchema = StructType(
data.schema.filterNot(f => partitionColumns.exists(equality(_, f.name))))
val r = dataSource.createRelation(
sqlContext,
Array(outputPath.toString),
Some(dataSchema.asNullable),
Some(partitionColumnsSchema(data.schema, partitionColumns, caseSensitive)),
caseInsensitiveOptions)
// For partitioned relation r, r.schema's column ordering can be different from the column
// ordering of data.logicalPlan (partition columns are all moved after data column). This
// will be adjusted within InsertIntoHadoopFsRelation.
sqlContext.executePlan(
InsertIntoHadoopFsRelation(
r,
data.logicalPlan,
mode)).toRdd
r
case _ =>
sys.error(s"${clazz.getCanonicalName} does not allow create table as select.")
}
ResolvedDataSource(clazz, relation)
}
}
The source field used in save() is defined as:
private var source: String = df.sqlContext.conf.defaultDataSourceName
And defaultDataSourceName in SQLContext's conf is:
private[spark] def defaultDataSourceName: String = getConf(DEFAULT_DATA_SOURCE_NAME)
In SQLConf.scala we can see:
// This is used to set the default data source
val DEFAULT_DATA_SOURCE_NAME = stringConf("spark.sql.sources.default",
defaultValue = Some("org.apache.spark.sql.parquet"),
doc = "The default data source to use in input/output.")
That is, the default data source is parquet.
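To make this concrete, here is a minimal sketch (the paths are illustrative only) showing that load/save without an explicit format() fall back to Parquet, and that the default can be changed through spark.sql.sources.default:
//With no format() call, read().load() and write().save() use spark.sql.sources.default, which defaults to Parquet
DataFrame usersDefaultDF = sqlContext.read().load("G:/users.parquet");
usersDefaultDF.write().save("G:/usersCopy");
//The default can be overridden, for example switching it to JSON:
sqlContext.setConf("spark.sql.sources.default", "json");
usersDefaultDF.write().save("G:/usersAsJson");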
parquet.block.size is essentially the size after compression. When data is read it may still be encoded; inside a Page there are the repetition levels, the definition levels and the data itself.
In Java this binary data is just a byte stream.
Parquet is quite memory hungry: it uses high compression ratios and a lot of caching.
After decompression the data is typically 5-10 times larger than before.
A block size of 256MB is used by default here.
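A minimal sketch of tuning the row-group ("block") size before writing Parquet, following the roughly 256MB recommendation above; parquet.block.size is a Hadoop-level Parquet setting, and sc/personsDF refer to the SchemaOps example that follows:
//Larger row groups improve scan efficiency at the cost of more write-side memory
sc.hadoopConfiguration().setInt("parquet.block.size", 256 * 1024 * 1024);
personsDF.write().parquet("G:/data/test_table/key=1");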
Java version:
package com.dt.sparksql;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
public class SchemaOps {
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "G:/datarguru spark/tool/hadoop-2.6.0");
SparkConf conf = new SparkConf().setMaster("local").setAppName("SchemaOps");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
// Create a simple DataFrame, stored into a partition directory
JavaRDD<Integer> lines = sc.parallelize(Arrays.asList(1,2,3,4,5));
PairFunction<Integer,Integer,Integer> df = new PairFunction<Integer,Integer,Integer>() {
@Override
public Tuple2 call(Integer x) throws Exception {
return new Tuple2(x,x * 2);
}
};
JavaPairRDD<Integer,Integer> pairs = lines.mapToPair(df);
/**
* Step 1: build an RDD of Row on top of the base RDD
*/
JavaRDD<Row> personsRDD = pairs.map(new Function<Tuple2<Integer,Integer>, Row>() {
@Override
public Row call(Tuple2<Integer, Integer> integerIntegerTuple2) throws Exception {
return RowFactory.create(integerIntegerTuple2._1,integerIntegerTuple2._2);
}
});
/**
* Step 2: build the DataFrame metadata dynamically. In general, the number of columns and the concrete type of each
* column may come from a JSON file or from a database.
* Specify the types here.
*/
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("single",DataTypes.IntegerType,true));
structFields.add(DataTypes.createStructField("Double", DataTypes.IntegerType,true));
/**
* Build the StructType that describes the metadata of the final DataFrame
*/
StructType structType = DataTypes.createStructType(structFields);
/**
* Step 3: build the DataFrame from the metadata constructed above together with the Row RDD
*/
DataFrame personsDF = sqlContext.createDataFrame(personsRDD, structType);
personsDF.write().parquet("G:/data/test_table/key=1");
// Create a simple DataFrame, stored into a partition directory
JavaRDD<Integer> lines1 = sc.parallelize(Arrays.asList(6,7,8,9,10));
PairFunction<Integer,Integer,Integer> df3 = new PairFunction<Integer,Integer,Integer>() {
@Override
public Tuple2 call(Integer x) throws Exception {
return new Tuple2(x,x * 3);
}
};
JavaPairRDD<Integer,Integer> pairs1 = lines1.mapToPair(df3);
/**
* Step 1: build an RDD of Row on top of the base RDD
*/
JavaRDD<Row> personsRDD1 = pairs1.map(new Function<Tuple2<Integer, Integer>, Row>() {
@Override
public Row call(Tuple2<Integer, Integer> integerIntegerTuple2) throws Exception {
return RowFactory.create(integerIntegerTuple2._1,integerIntegerTuple2._2);
}
});
/**
* Step 2: build the DataFrame metadata dynamically. In general, the number of columns and the concrete type of each
* column may come from a JSON file or from a database.
* Specify the types here.
*/
List<StructField> structFields1 = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("single",DataTypes.IntegerType,true));
structFields.add(DataTypes.createStructField("triple",DataTypes.IntegerType,true));
/**
* Build the StructType that describes the metadata of the final DataFrame
*/
StructType structType1 = DataTypes.createStructType(structFields1);
/**
* Step 3: build the DataFrame from the metadata constructed above together with the Row RDD
*/
DataFrame personsDF1 = sqlContext.createDataFrame(personsRDD1,structType1);
personsDF1.write().parquet("G:/data/test_table/key=2");
DataFrame df4 = sqlContext.read().option("mergeSchema","true").parquet("G:/data/test_table");
df4.printSchema();
}
}
Hive also has pushdown. Pushdown can drastically reduce the amount of data read and therefore greatly improves processing efficiency.
Spark SQL implements pushdown, and implementing pushdown over Parquet files is especially significant.
Pushdown is a SQL optimization technique usually applied to queries. A typical scenario, illustrated by the sketch after this paragraph:
Consider a DataFrame query such as df.select(a,b,c).filter(by a).filter(by b).select(c).filter(by c). In the optimizer phase the multiple filters need to be merged (CombineFilters) and the operators reordered, for example moving some filters ahead of selects and other operators (PushPredicateThroughAggregate/Generate/Join/Project). Before a filter the query has to touch a large amount of data, while after the filter only a small portion remains; the optimizer therefore wants to operate on that small portion from the very beginning instead of loading all of the data only to throw most of it away.
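A minimal sketch of observing this, assuming the usersDF loaded from users.parquet earlier; explain(true) prints the parsed, analyzed, optimized and physical plans, where the combined and pushed-down filters can be seen:
//Chained filters: the optimizer merges them (CombineFilters) and pushes them toward the data source
DataFrame filtered = usersDF
.select(usersDF.col("name"), usersDF.col("favorite_color"))
.filter(usersDF.col("favorite_color").isNotNull())
.filter(usersDF.col("name").notEqual("Ben"));
filtered.explain(true);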
Pushdown has both a SQL/logical-plan side and a physical-execution side.
On the logical side, Spark SQL and Hive each have their own implementation.
Now let's look at the QueryExecution source:
/**
* The primary workflow for executing relational queries using Spark. Designed to allow easy
* access to the intermediate phases of query execution for developers.
*
* While this is not a public class, we should avoid changing the function names for the sake of
* changing them, because a lot of developers use the feature for debugging.
*/
class QueryExecution(val sqlContext: SQLContext, val logical: LogicalPlan) {
def assertAnalyzed(): Unit = sqlContext.analyzer.checkAnalysis(analyzed)
lazy val analyzed: LogicalPlan = sqlContext.analyzer.execute(logical)
lazy val withCachedData: LogicalPlan = {
assertAnalyzed()
sqlContext.cacheManager.useCachedData(analyzed)
}
lazy val optimizedPlan: LogicalPlan = sqlContext.optimizer.execute(withCachedData)
lazy val sparkPlan: SparkPlan = {
SQLContext.setActive(sqlContext)
sqlContext.planner.plan(optimizedPlan).next()
}
// executedPlan should not be used to initialize any SparkPlan. It should be
// only used for execution.
lazy val executedPlan: SparkPlan = sqlContext.prepareForExecution.execute(sparkPlan)
/** Internal version of the RDD. Avoids copies and has no schema */
lazy val toRdd: RDD[InternalRow] = executedPlan.execute()
protected def stringOrError[A](f: => A): String =
try f.toString catch { case e: Throwable => e.toString }
def simpleString: String = {
s"""== Physical Plan ==
|${stringOrError(executedPlan)}
""".stripMargin.trim
}
override def toString:String = {
def output =
analyzed.output.map(o =>s"${o.name}:${o.dataType.simpleString}").mkString(", ")
s"""== Parsed Logical Plan ==
|${stringOrError(logical)}
|== Analyzed Logical Plan ==
|${stringOrError(output)}
|${stringOrError(analyzed)}
|== Optimized Logical Plan ==
|${stringOrError(optimizedPlan)}
|== Physical Plan ==
|${stringOrError(executedPlan)}
""".stripMargin.trim
}
}
In its implementation, QueryExecution chains these stages together into a single workflow.
The translation of a SQL statement goes through:
1 basic syntax translation
2 parsing
3 optimization
4 logical plan
5 physical execution plan
6 execution on the engine
For example, for a query like df.select(a,b,c).filter(by a).filter(by b).select(c).filter(by c), a syntax tree is produced before execution, then analyzed and optimized; in the optimization phase the filters are merged, and the merge takes the order of the filters into account.
Now let's look at the Optimizer source in spark.sql.catalyst:
package org.apache.spark.sql.catalyst.optimizer
abstract class Optimizer extends RuleExecutor[LogicalPlan]
object DefaultOptimizer extends Optimizer {
val batches=
// SubQueries are only needed for analysis and can be removed before execution.
Batch("Remove SubQueries",FixedPoint(100),
EliminateSubQueries) ::
Batch("Aggregate",FixedPoint(100),
ReplaceDistinctWithAggregate,
RemoveLiteralFromGroupExpressions) ::
Batch("Operator Optimizations",FixedPoint(100),
// Operator push down
SetOperationPushDown,
SamplePushDown,
PushPredicateThroughJoin,
PushPredicateThroughProject,
PushPredicateThroughGenerate,
PushPredicateThroughAggregate,
ColumnPruning,
// Operator combine
ProjectCollapsing,
CombineFilters,
CombineLimits,
// Constant folding
NullPropagation,
OptimizeIn,
ConstantFolding,
LikeSimplification,
BooleanSimplification,
RemoveDispensableExpressions,
SimplifyFilters,
SimplifyCasts,
SimplifyCaseConversionExpressions) ::
Batch("Decimal Optimizations",FixedPoint(100),
DecimalAggregates) ::
Batch("LocalRelation",FixedPoint(100),
ConvertToLocalRelation) :: Nil
}
Pushdown means moving operations down to the leaf nodes of the plan, which is why it is called predicate pushdown (Predicate.pushdown); once an operation sits at a leaf node, it is executed at the data source.
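For Parquet in particular, predicate pushdown is controlled by the spark.sql.parquet.filterPushdown option (on by default since Spark 1.5). A minimal sketch of toggling it and checking the effect in the plan, reusing the users.parquet file from earlier:
//Enable (or disable, for comparison) Parquet filter pushdown before running the query
sqlContext.setConf("spark.sql.parquet.filterPushdown", "true");
DataFrame pushed = sqlContext.read().parquet("G:/users.parquet").filter("name is not null");
pushed.explain(true);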
The following diagram illustrates the pushdown process:
(Pushdown process diagram)
For a detailed walkthrough of the Spark SQL + Parquet pushdown implementation, see:
http://flykobe.com/index.php/2016/03/02/sparksql-parquet-pushdown/?utm_source=tuicool&utm_medium=referral
This lesson works through join operations in Spark SQL hands-on:
package com.dt.sparksql;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
public class SparkSQLWithJoin {
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "G:/datarguru spark/tool/hadoop-2.6.0");
SparkConf conf = new SparkConf().setMaster("local").setAppName("SchemaOps");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
//Create a DataFrame from the JSON file data source
DataFrame peopleDF = sqlContext.read().json("G:/people.json");
//Register the JSON-based DataFrame as a temporary table
peopleDF.registerTempTable("peopleScores");
//Query the people whose score is greater than 90
DataFrame excellentScoresDF = sqlContext.sql("select * from peopleScores where score > 90");
/*
* Convert the DataFrame into an RDD and use a map operation to collect the names of everyone scoring above 90
*/
List<String> excellentScoreNameList = excellentScoresDF.javaRDD().map(new Function<Row, String>() {
@Override
public String call(Row row) throws Exception {
return row.getAs("name");
}
}).collect();
//Dynamically assemble the JSON content
List<String> peopleInformations = new ArrayList<String>();
peopleInformations.add("{\"name\":\"Michael\", \"age\":20}");
peopleInformations.add("{\"name\":\"Andy\", \"age\":17}");
peopleInformations.add("{\"name\":\"Justin\", \"age\":19}");
//Build a DataFrame from an RDD whose elements are JSON strings
JavaRDD<String> peopleInformationsRDD = sc.parallelize(peopleInformations);
DataFrame peopleInformationsDF = sqlContext.read().json(peopleInformationsRDD);
//Register as a temporary table
peopleInformationsDF.registerTempTable("peopleInformations");
String sqlText = "select name,age from peopleInformations where name in (";
for(int i =0; i< excellentScoreNameList.size(); i++){
sqlText += "'" + excellentScoreNameList.get(i) + "'";
if(i < excellentScoreNameList.size()-1){
sqlText += ",";
}
}
sqlText +=")";
DataFrame excellentNameAgeDF = sqlContext.sql(sqlText);
JavaPairRDD<String, Tuple2<Integer, Integer>> resultRDD = excellentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Row row) throws Exception {
return new Tuple2<String, Integer>((String) row.getAs("name"), (int)row.getLong(1));
}
}).join(excellentNameAgeDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Row row) throws Exception {
return new Tuple2<String, Integer>((String) row.getAs("name"), (int)row.getLong(1));
}
}));
JavaRDD<Row> resultRowRDD = resultRDD.map(new Function<Tuple2<String,Tuple2<Integer,Integer>>, Row>() {
@Override
public Row call(Tuple2<String, Tuple2<Integer, Integer>> tuple) throws Exception {
return RowFactory.create(tuple._1, tuple._2._2, tuple._2._1);
}
});
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
//Build the StructType that describes the metadata of the final DataFrame
StructType structType = DataTypes.createStructType(structFields);
DataFrame personsDF = sqlContext.createDataFrame(resultRowRDD, structType);
personsDF.show();
personsDF.write().format("json").save("G:/peopleResult");
}
}
Spark SQL can read from and write to traditional relational databases over JDBC. Reading produces a DataFrame directly, which can then be combined with the rich APIs of the Spark core for all kinds of processing.
Without going through Spark SQL, MySQL can also be accessed directly with RDDs; a sketch follows after this paragraph.
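A minimal sketch of that plain-RDD route using JdbcRDD, assuming the same MySQL database and nameandscore table used below, plus a numeric id column to partition on (the id column and its bounds are assumptions; JdbcRDD requires a query with exactly two '?' placeholders, and sc and the Function import refer to the SparkSQLJDBC2Mysql example that follows):
JavaRDD<String> jdbcRows = org.apache.spark.rdd.JdbcRDD.create(
sc,
new org.apache.spark.rdd.JdbcRDD.ConnectionFactory() {
public java.sql.Connection getConnection() throws Exception {
return java.sql.DriverManager.getConnection("jdbc:mysql://master:3306/spark", "root", "123");
}
},
"SELECT name, score FROM nameandscore WHERE id >= ? AND id <= ?",
1, 1000, 2, //lower bound, upper bound, number of partitions
new Function<java.sql.ResultSet, String>() {
public String call(java.sql.ResultSet rs) throws Exception {
return rs.getString(1) + "," + rs.getInt(2);
}
});
System.out.println(jdbcRows.collect());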
package com.dt.sparksql;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
public class SparkSQLJDBC2Mysql {
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "G:/datarguru spark/tool/hadoop-2.6.0");
SparkConf conf = new SparkConf().setMaster("local").setAppName("SchemaOps");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
/**
* 1. format("jdbc") tells Spark SQL that the data is obtained over JDBC; the JDBC backend is usually a database such as MySQL or Oracle.
* 2. The DataFrameReader option method passes in the information about the database being accessed:
*    url: the JDBC connection URL of the database
*    dbtable: which table to access
* 3. driver is the fully qualified class name of the JDBC driver Spark SQL uses to access the database.
* 4. The JDBC driver jar can be placed in Spark's lib directory, or specified as an extra jar when using spark-submit
*    (the JDBC jar is not needed at compile or packaging time).
*/
DataFrameReader reader = sqlContext.read().format("jdbc");
reader.option("url", "jdbc:mysql://master:3306/spark");
reader.option("dbtable", "nameandscore");
reader.option("driver", "com.mysql.jdbc.Driver");
reader.option("user", "root");
reader.option("password", "123");
/**
* In real enterprise environments, when the data in the database is very large (say one billion rows), processing it
* with the traditional DB alone usually requires splitting the work into many batches, e.g. 100 (limited by the
* capacity of a single server), and the processing logic can be very complex and hard to express on a traditional
* J2EE stack. Pulling the data out with Spark SQL and processing it in a distributed way solves this nicely, but
* loading data through Spark SQL takes time, so a caching layer such as Redis is often placed between Spark SQL and
* the DB, which can greatly speed up Spark's processing.
*/
DataFrame nameandscoremysqlDataSourceDF = reader.load();//DataFrame built from the MySQL table nameandscore
nameandscoremysqlDataSourceDF.show();
reader.option("dbtable", "nameandage");
DataFrame nameandagemysqlDataSourceDF=reader.load();//DataFrame built from the MySQL table nameandage
nameandagemysqlDataSourceDF.show();
//Join the two DataFrames on the name column by converting them to pair RDDs
JavaPairRDD<String, Tuple2<Integer, Integer>> resultRDD = nameandscoremysqlDataSourceDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Row row) throws Exception {
return new Tuple2<String, Integer>((String) row.getAs("name"), (int)row.getLong(1));
}
}).join(nameandagemysqlDataSourceDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Row row) throws Exception {
return new Tuple2<String, Integer>((String) row.getAs("name"), (int)row.getLong(1));
}
}));
JavaRDD<Row> resultRowRDD = resultRDD.map(new Function<Tuple2<String,Tuple2<Integer,Integer>>, Row>() {
@Override
public Row call(Tuple2<String, Tuple2<Integer, Integer>> tuple) throws Exception {
return RowFactory.create(tuple._1, tuple._2._2, tuple._2._1);
}
});
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
//Build the StructType that describes the metadata of the final DataFrame
StructType structType = DataTypes.createStructType(structFields);
DataFrame personsDF = sqlContext.createDataFrame(resultRowRDD, structType);
personsDF.show();
/**
* 1. When a DataFrame writes data back to the database after complex Spark SQL/Core/ML processing, permissions come
*    first: make sure the database has granted access to the user Spark SQL is running as.
* 2. When a DataFrame writes to a DB it generally does not write directly; instead it is converted to an RDD and the
*    RDD writes the data into the DB.
*/
personsDF.javaRDD().foreachPartition(new VoidFunction<Iterator<Row>>() {
@Override
public void call(Iterator<Row> t) throws SQLException {
Connection conn2mysql= null;
Statement statement = null;
try {
conn2mysql = DriverManager.getConnection("jdbc:mysql://master:3306/spark", "root", "123");
statement = conn2mysql.createStatement();
while(t.hasNext()){
String sql="insert into nameagescore (name,age,score) values (";
Row row = t.next();
String name = row.getAs("name");
int age = row.getInt(1);
int score = row.getInt(2);
sql +="'"+name+"',"+"'"+age+"',"+"'"+score+"')";
statement.execute(sql);
}
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
if(statement != null){
statement.close();
}
if(conn2mysql != null){
conn2mysql.close();
}
}
}
});
}
}
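As an aside, since Spark 1.4 the DataFrameWriter also offers a jdbc() sink that can replace the manual foreachPartition insert above for simple cases. A minimal sketch reusing the same connection details and the nameagescore table (add import java.util.Properties and org.apache.spark.sql.SaveMode):
Properties connProps = new Properties();
connProps.put("user", "root");
connProps.put("password", "123");
//Append the rows of personsDF into the nameagescore table over JDBC
personsDF.write().mode(SaveMode.Append).jdbc("jdbc:mysql://master:3306/spark", "nameagescore", connProps);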
For spark-submit usage, see:
http://www.cnblogs.com/kevin19900306/p/5602563.html
See also:
http://heb.itcast.cn/news/20151229/16012088060.shtml
Data sources: the two files /home/richard/slq/Spark/people.txt and /home/richard/slq/spark/peoplescores.txt.
Contents of people.txt:
Michael 29
Andy 30
Justin 19
Contents of peoplescores.txt:
Michael 99
Andy 97
Justin 68
Note: the fields in people.txt and peoplescores.txt are separated by Tab characters.
Code:
See:
http://blog.csdn.net/slq1023/article/details/51108648
package com.dt.spark
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.hive.HiveContext
// Hands-on DataFrame operations against Hive, developed in Scala
object SparkSQL2Hive {
def main(args: Array[String]): Unit = {
val conf = new SparkConf //Create the SparkConf object
conf.setAppName("SparkSQL2Hive") //Set the application name
conf.setMaster("spark://master:7077") //Set the cluster master
val sc = new SparkContext(conf) //Create the SparkContext object
//In today's enterprise Spark development, Hive is used as the data warehouse in the vast majority of cases.
//Spark provides Hive support: through HiveContext, Spark can operate on Hive data directly.
//First: with HiveContext we can use both sql and hql to write SQL statements against Hive,
//including creating tables, dropping tables, loading data into tables, and building all kinds of SQL for CRUD on the table data.
//Second: a DataFrame's data can also be saved into the Hive data warehouse directly via saveAsTable.
//Third: HiveContext.table can load a Hive table directly and produce a DataFrame.
val hiveContext = new HiveContext(sc)
hiveContext.sql("use hive")
hiveContext.sql("DROP TABLE IF EXISTS people")
hiveContext.sql("CREATE TABLE IF NOT EXISTS people(name STRING,age INT)")
hiveContext.sql("LOAD DATA LOCAL INPATH '/opt/spark-2.0.0-bin-hadoop2.7/dataSource/people.txt' INTO TABLE people")
//Load local data into Hive (behind the scenes the data is actually copied)
//LOAD DATA INPATH can also be used to pull data from HDFS and the like into Hive (in that case the data is moved)
hiveContext.sql("DROP TABLE IF EXISTS peoplescores")
hiveContext.sql("CREATE TABLE IF NOT EXISTS peoplescores(name STRING,score INT)")
hiveContext.sql("LOAD DATA LOCAL INPATH '/opt/spark-2.0.0-bin-hadoop2.7/dataSource/peoplescores.txt' INTO TABLE peoplescores")
//Use HiveContext to join the two Hive tables directly and obtain the name, age and score of everyone scoring above 90
val resultDF = hiveContext.sql("SELECT pi.name,pi.age,ps.score FROM people pi JOIN peoplescores ps ON pi.name=ps.name WHERE ps.score > 90")
//saveAsTable creates a Hive managed table: where the data lives and the metadata are both managed by Hive
//When the table is dropped, its data is deleted along with it (it no longer exists on disk)
hiveContext.sql("DROP TABLE IF EXISTS peopleinformationresult")
resultDF.write.saveAsTable("peopleinformationresult")
//HiveContext's table method can read a Hive table directly and produce a DataFrame
//The data read this way can then be used for machine learning, graph computation, complex ETL, and so on
val dataFrameHive = hiveContext.table("peopleinformationresult")
dataFrameHive.show()
}
}
An earlier variant of the same program, targeting Spark 1.4.x (note the LEFT JOIN and the direct saveAsTable call):
package com.dt.spark
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.hive.HiveContext
// Hands-on DataFrame operations against Hive, developed in Scala
object SparkSQL2Hive {
def main(args: Array[String]): Unit = {
val conf = new SparkConf //Create the SparkConf object
conf.setAppName("SparkSQL2Hive") //Set the application name
conf.setMaster("spark://master:7077") //Set the cluster master
val sc = new SparkContext(conf) //Create the SparkContext object
//In today's enterprise Spark development, Hive is used as the data warehouse in the vast majority of cases.
//Spark provides Hive support: through HiveContext, Spark can operate on Hive data directly.
//First: with HiveContext we can use both sql and hql to write SQL statements against Hive,
//including creating tables, dropping tables, loading data into tables, and building all kinds of SQL for CRUD on the table data.
//Second: a DataFrame's data can also be saved into the Hive data warehouse directly via saveAsTable.
//Third: HiveContext.table can load a Hive table directly and produce a DataFrame.
val hiveContext = new HiveContext(sc)
hiveContext.sql("use hive")
hiveContext.sql("DROP TABLE IF EXISTS people")
hiveContext.sql("CREATE TABLE IF NOT EXISTS people(name STRING,age INT)")
hiveContext.sql("LOAD DATA LOCAL INPATH '/opt/spark-1.4.0-bin-hadoop2.6/dataSource/people.txt' INTO TABLE people")
//Load local data into Hive (behind the scenes the data is actually copied)
//LOAD DATA INPATH can also be used to pull data from HDFS and the like into Hive (in that case the data is moved)
hiveContext.sql("DROP TABLE IF EXISTS peoplescores")
hiveContext.sql("CREATE TABLE IF NOT EXISTS peoplescores(name STRING,score INT)")
hiveContext.sql("LOAD DATA LOCAL INPATH '/opt/spark-1.4.0-bin-hadoop2.6/dataSource/peoplescores.txt' INTO TABLE peoplescores")
//Use HiveContext to join the two Hive tables directly and obtain the name, age and score of everyone scoring above 90
val resultDF = hiveContext.sql("SELECT pi.name,pi.age,ps.score FROM people pi LEFT JOIN peoplescores ps ON pi.name=ps.name WHERE ps.score > 90")
//saveAsTable creates a Hive managed table: where the data lives and the metadata are both managed by Hive
//When the table is dropped, its data is deleted along with it (it no longer exists on disk)
hiveContext.sql("DROP TABLE IF EXISTS peopleinformationresult")
resultDF.saveAsTable("peopleinformationresult")
//HiveContext's table method can read a Hive table directly and produce a DataFrame
//The data read this way can then be used for machine learning, graph computation, complex ETL, and so on
val dataFrameHive = hiveContext.table("peopleinformationresult")
dataFrameHive.show()
}
}
Possible errors when running the program (for example when the output table or its warehouse path is left over from a previous run):
ERROR metadata.Hive: NoSuchObjectException(message:hive.peopleinformationresult table not found)
Exception in thread "main" java.lang.RuntimeException: path hdfs://master:9000/user/hive/warehouse/hive.db/peopleinformationresult already exists.