A previous article covered how Command statements are executed in Spark 1.x. In Spark 2 (and 3) parts of this code changed, and the standalone DataFrame class is gone (DataFrame is now just an alias for Dataset[Row]). In both versions the command's side effects are injected and executed eagerly when the LogicalPlan object is built, but the old path took a few detours through execute() and doExecute(); now queryExecution.executeCollect() is called directly, which is more straightforward.
// SparkSession
def sql(sqlText: String): DataFrame = {
  val tracker = new QueryPlanningTracker
  val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) {
    sessionState.sqlParser.parsePlan(sqlText)  // 1
  }
  Dataset.ofRows(self, plan, tracker)
}
// Dataset
// (the overload of ofRows that also takes a QueryPlanningTracker, called above, has the same structure)
def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = {
  val qe = sparkSession.sessionState.executePlan(logicalPlan)  // 2
  qe.assertAnalyzed()
  new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema))  // 3
}
@transient private[sql] val logicalPlan: LogicalPlan = {
  // For various commands (like DDL) and queries with side effects, we force query execution
  // to happen right away to let these side effects take place eagerly.
  val plan = queryExecution.analyzed match {
    case c: Command =>
      LocalRelation(c.output, withAction("command", queryExecution)(_.executeCollect()))  // 4
    case u @ Union(children) if children.forall(_.isInstanceOf[Command]) =>
      LocalRelation(u.output, withAction("command", queryExecution)(_.executeCollect()))
    case _ =>
      queryExecution.analyzed
  }
  if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) {
    plan.setTagValue(Dataset.DATASET_ID_TAG, id)
  }
  plan
}
For example, an ANALYZE TABLE statement parses into a plan like:
'AnalyzeTableStatement [rel_est_ds_table], false
queryExecution.executeCollect() resolves to ExecutedCommandExec.executeCollect(), which in turn invokes the run method of the wrapped Command, in this case AnalyzeTableCommand.
// ExecutedCommandExec
override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray

protected[sql] lazy val sideEffectResult: Seq[InternalRow] = {
  val converter = CatalystTypeConverters.createToCatalystConverter(schema)
  // cmd: RunnableCommand is the concrete command being executed
  cmd.run(sqlContext.sparkSession).map(converter(_).asInstanceOf[InternalRow])
}
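Because Command plans are materialized eagerly in logicalPlan, a statement like the one below has already run by the time sql() returns, without any action being called (a minimal sketch, reusing the table from the parsed plan above):

// Commands execute eagerly; no .collect()/.show() is needed afterwards.
val df = spark.sql("ANALYZE TABLE rel_est_ds_table COMPUTE STATISTICS")
// At this point AnalyzeTableCommand.run has already updated the metastore stats;
// df only wraps the command's (empty) result rows in a LocalRelation.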
ANALYZE computes statistics such as a table's size and row count and stores them in the table metadata, which is very important input for later CBO optimization. Counting rows at the partition level behaves much like an ordinary grouped count over the partition columns:
tableDf.filter(Column(filter)).groupBy(partitionColumns: _*).count()
Entry point for reading the statistics back from a plan:
val relationStats = spark.table(tbl).queryExecution.optimizedPlan.stats
// LogicalPlanStats
def stats: Statistics = statsCache.getOrElse {
  if (conf.cboEnabled) {
    statsCache = Option(BasicStatsPlanVisitor.visit(self))
  } else {
    statsCache = Option(SizeInBytesOnlyStatsPlanVisitor.visit(self))
  }
  statsCache.get
}
The LogicalPlanStats trait is mixed into every LogicalPlan. With CBO enabled, BasicStatsPlanVisitor is used to generate the stats; with CBO disabled, SizeInBytesOnlyStatsPlanVisitor is used instead.
LogicalPlanVisitor implements the visitor design pattern for computing stats: each kind of plan node has its own visit method. SizeInBytesOnlyStatsPlanVisitor only estimates the size of each operation, while BasicStatsPlanVisitor, once CBO is enabled, runs a dedicated estimate() for operations such as aggregate, join and project, producing sizeInBytes, rowCount and attributeStats.
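As a rough illustration of that visitor idea (a simplified sketch relying on Spark's catalyst classes; the real LogicalPlanVisitor has one visit method per node type):

import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Join, LogicalPlan, Statistics}

// Toy stats visitor: dispatch on the node type, fall back to a default estimate.
trait StatsVisitor {
  def visit(p: LogicalPlan): Statistics = p match {
    case j: Join      => visitJoin(j)
    case a: Aggregate => visitAggregate(a)
    case other        => default(other)
  }
  def visitJoin(j: Join): Statistics
  def visitAggregate(a: Aggregate): Statistics
  def default(p: LogicalPlan): Statistics
}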
LogicalRelation wraps the underlying relation together with its CatalogTable, and additionally carries the stats metadata, the output attributes, and an attributeMap of per-attribute information.
// SizeInBytesOnlyStatsPlanVisitor
override def default(p: LogicalPlan): Statistics = p match {
  case p: LeafNode => p.computeStats()
  case _: LogicalPlan => Statistics(sizeInBytes = p.children.map(_.stats.sizeInBytes).product)
}
For leaf nodes the estimate ultimately boils down to the size of a single row of data:
// LogicalRelation
override def computeStats(): Statistics = {
  catalogTable
    .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled || conf.planStatsEnabled)))
    .getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))
}
// CatalogStatistics: the table stats read from the catalog; example values seen in a debugger:
// stats = {Some@17701} "Some(CatalogStatistics(4464,Some(1000),Map()))"
// value = {CatalogStatistics@17709} "CatalogStatistics(4464,Some(1000),Map())"
// sizeInBytes = {BigInt@17711} "4464"
// rowCount = {Some@17712} "Some(1000)"
// colStats = {Map$EmptyMap$@17713} "Map$EmptyMap$" size = 0
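To see these statistics change on a concrete plan (a quick sketch; the table a is created in the example further below):

// Sketch: compare the optimized plan's Statistics before and after ANALYZE.
val before = spark.table("a").queryExecution.optimizedPlan.stats.sizeInBytes  // fallback path, no catalog stats yet
spark.sql("ANALYZE TABLE a COMPUTE STATISTICS")
val after = spark.table("a").queryExecution.optimizedPlan.stats
// after.sizeInBytes now comes from CatalogStatistics; with spark.sql.cbo.enabled=true,
// after.rowCount is populated as well (see toPlanStats below).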
/**
 * Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
 * on column names.
 */
def toPlanStats(planOutput: Seq[Attribute], planStatsEnabled: Boolean): Statistics = {
  if (planStatsEnabled && rowCount.isDefined) {
    val attrStats = AttributeMap(planOutput
      .flatMap(a => colStats.get(a.name).map(a -> _.toPlanStat(a.name, a.dataType))))
    // Estimate size as number of rows * row size.
    val size = EstimationUtils.getOutputSize(planOutput, rowCount.get, attrStats)
    Statistics(sizeInBytes = size, rowCount = rowCount, attributeStats = attrStats)
  } else {
    // When plan statistics are disabled or the table doesn't have other statistics,
    // we apply the size-only estimation strategy and only propagate sizeInBytes in statistics.
    Statistics(sizeInBytes = sizeInBytes)
  }
}
// EstimationUtils
def getOutputSize(
    attributes: Seq[Attribute],
    outputRowCount: BigInt,
    attrStats: AttributeMap[ColumnStat] = AttributeMap(Nil)): BigInt = {
  // Output size can't be zero, or sizeInBytes of BinaryNode will also be zero
  // (simple computation of statistics returns product of children).
  if (outputRowCount > 0) outputRowCount * getSizePerRow(attributes, attrStats) else 1
}
A concrete example. Create two test tables (b is partitioned) and load some data:
CREATE TABLE `a`(`id` INT, `name` STRING);
CREATE TABLE `b`(`id` INT, `name` STRING) partitioned by (dt string);
load data local inpath '/tmp/a' into table a;
insert overwrite table b partition(dt='20200305')
select id,name from a;
select * from a;
1 wankun
2 zhangsan
3 lisi
Before any ANALYZE runs, the metastore only holds the stats written by the load/insert themselves:
mysql> SELECT * FROM TABLE_PARAMS WHERE TBL_ID=12 AND PARAM_KEY not like '%schema%' ;
+--------+--------------------------+----------------+
| TBL_ID | PARAM_KEY | PARAM_VALUE |
+--------+--------------------------+----------------+
| 12 | COLUMN_STATS_ACCURATE | true |
| 12 | numFiles | 1 |
| 12 | spark.sql.create.version | 2.4.6-SNAPSHOT |
| 12 | totalSize | 27 |
| 12 | transient_lastDdlTime | 1583396359 |
+--------+--------------------------+----------------+
mysql> SELECT * FROM PARTITION_PARAMS ;
+---------+-----------------------+-------------+
| PART_ID | PARAM_KEY | PARAM_VALUE |
+---------+-----------------------+-------------+
| 2 | COLUMN_STATS_ACCURATE | false |
| 2 | numFiles | 2 |
| 2 | numRows | -1 |
| 2 | rawDataSize | -1 |
| 2 | totalSize | 27 |
| 2 | transient_lastDdlTime | 1583396367 |
+---------+-----------------------+-------------+
ANALYZE TABLE a COMPUTE STATISTICS NOSCAN;
ANALYZE TABLE b COMPUTE STATISTICS NOSCAN;
ANALYZE TABLE a COMPUTE STATISTICS;
ANALYZE TABLE b COMPUTE STATISTICS;
ANALYZE TABLE b partition (dt=20200305) COMPUTE STATISTICS;
After the analysis, two important statistics properties appear on the table: spark.sql.statistics.numRows and spark.sql.statistics.totalSize. (With NOSCAN the table is not scanned, so only the size is collected; the full ANALYZE also computes the row count.)
mysql> SELECT * FROM TABLE_PARAMS WHERE TBL_ID=12 AND PARAM_KEY not like '%schema%' ;
+--------+--------------------------------+----------------+
| TBL_ID | PARAM_KEY | PARAM_VALUE |
+--------+--------------------------------+----------------+
| 12 | COLUMN_STATS_ACCURATE | false |
| 12 | numFiles | 1 |
| 12 | numRows | -1 |
| 12 | rawDataSize | -1 |
| 12 | spark.sql.create.version | 2.4.6-SNAPSHOT |
| 12 | spark.sql.statistics.numRows | 3 |
| 12 | spark.sql.statistics.totalSize | 27 |
| 12 | totalSize | 27 |
| 12 | transient_lastDdlTime | 1583397179 |
+--------+--------------------------------+----------------+
9 rows in set (0.00 sec)
mysql> SELECT * FROM PARTITION_PARAMS ;
+---------+--------------------------------+-------------+
| PART_ID | PARAM_KEY | PARAM_VALUE |
+---------+--------------------------------+-------------+
| 2 | COLUMN_STATS_ACCURATE | false |
| 2 | numFiles | 2 |
| 2 | numRows | -1 |
| 2 | rawDataSize | -1 |
| 2 | spark.sql.statistics.numRows | 3 |
| 2 | spark.sql.statistics.totalSize | 27 |
| 2 | totalSize | 27 |
| 2 | transient_lastDdlTime | 1583399862 |
+---------+--------------------------------+-------------+
spark.sql("insert overwrite table user_part partition(year='2021') select id, name, true, cast(88.8 as decimal(13, 2)) from user1")
analyze table user_part partition(year='2021') compute statistics for columns id;
analyze table user_part partition(year='2021') compute statistics for columns name;
analyze table user_part partition(year='2021') compute statistics for columns is_student;
analyze table user_part partition(year='2021') compute statistics for columns salary;
0: jdbc:hive2://127.0.0.1:10000/default> desc extended user_part id;
+-----------------+-------------+
| info_name | info_value |
+-----------------+-------------+
| col_name | id |
| data_type | int |
| comment | NULL |
| min | 1 |
| max | 99 |
| num_nulls | 0 |
| distinct_count | 98 |
| avg_col_len | 4 |
| max_col_len | 4 |
| histogram | NULL |
+-----------------+-------------+
10 rows selected (0.146 seconds)
0: jdbc:hive2://127.0.0.1:10000/default> desc extended user_part name;
+-----------------+-------------+
| info_name | info_value |
+-----------------+-------------+
| col_name | name |
| data_type | string |
| comment | NULL |
| min | NULL |
| max | NULL |
| num_nulls | 0 |
| distinct_count | 65 |
| avg_col_len | 9 |
| max_col_len | 9 |
| histogram | NULL |
+-----------------+-------------+
10 rows selected (0.16 seconds)
0: jdbc:hive2://127.0.0.1:10000/default> desc extended user_part is_student;
+-----------------+-------------+
| info_name | info_value |
+-----------------+-------------+
| col_name | is_student |
| data_type | boolean |
| comment | NULL |
| min | true |
| max | true |
| num_nulls | 0 |
| distinct_count | 1 |
| avg_col_len | 1 |
| max_col_len | 1 |
| histogram | NULL |
+-----------------+-------------+
10 rows selected (0.133 seconds)
0: jdbc:hive2://127.0.0.1:10000/default> desc extended user_part salary;
+-----------------+----------------+
| info_name | info_value |
+-----------------+----------------+
| col_name | salary |
| data_type | decimal(13,2) |
| comment | NULL |
| min | 88.80 |
| max | 88.80 |
| num_nulls | 0 |
| distinct_count | 1 |
| avg_col_len | 8 |
| max_col_len | 8 |
| histogram | NULL |
+-----------------+----------------+
10 rows selected (0.182 seconds)
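The same column statistics can also be read back through the plan's Statistics (a sketch; it assumes CBO is enabled, otherwise attributeStats stays empty):

// Sketch: read the ColumnStat map off the optimized plan's Statistics.
spark.conf.set("spark.sql.cbo.enabled", "true")
val stats = spark.table("user_part").queryExecution.optimizedPlan.stats
stats.attributeStats.foreach { case (attr, col) =>
  println(s"${attr.name}: distinctCount=${col.distinctCount}, nullCount=${col.nullCount}, maxLen=${col.maxLen}")
}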
set spark.sql.statistics.histogram.enabled=true;
With this setting enabled, ANALYZE ... FOR COLUMNS also collects histogram information for each column. The resulting metastore entries look like:
TBL_ID | PARAM_KEY | PARAM_VALUE |
---|---|---|
97 | spark.sql.statistics.colStats.id.avgLen | 4 |
97 | spark.sql.statistics.colStats.id.distinctCount | 98 |
97 | spark.sql.statistics.colStats.id.histogram | xxx |
97 | spark.sql.statistics.colStats.id.max | 99 |
97 | spark.sql.statistics.colStats.id.maxLen | 4 |
97 | spark.sql.statistics.colStats.id.min | 1 |
97 | spark.sql.statistics.colStats.id.nullCount | 0 |
97 | spark.sql.statistics.colStats.id.version | 2 |
97 | spark.sql.statistics.colStats.is_student.avgLen | 1 |
97 | spark.sql.statistics.colStats.is_student.distinctCount | 1 |
97 | spark.sql.statistics.colStats.is_student.max | TRUE |
97 | spark.sql.statistics.colStats.is_student.maxLen | 1 |
97 | spark.sql.statistics.colStats.is_student.min | TRUE |
97 | spark.sql.statistics.colStats.is_student.nullCount | 0 |
97 | spark.sql.statistics.colStats.is_student.version | 2 |
97 | spark.sql.statistics.colStats.name.avgLen | 9 |
97 | spark.sql.statistics.colStats.name.distinctCount | 65 |
97 | spark.sql.statistics.colStats.name.maxLen | 9 |
97 | spark.sql.statistics.colStats.name.nullCount | 0 |
97 | spark.sql.statistics.colStats.name.version | 2 |
97 | spark.sql.statistics.colStats.salary.avgLen | 8 |
97 | spark.sql.statistics.colStats.salary.distinctCount | 1 |
97 | spark.sql.statistics.colStats.salary.histogram | xxx |
97 | spark.sql.statistics.colStats.salary.max | 88.8 |
97 | spark.sql.statistics.colStats.salary.maxLen | 8 |
97 | spark.sql.statistics.colStats.salary.min | 88.8 |
97 | spark.sql.statistics.colStats.salary.nullCount | 0 |
97 | spark.sql.statistics.colStats.salary.version | 2 |
97 | spark.sql.statistics.numRows | 99 |
97 | spark.sql.statistics.totalSize | 2071 |
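The histogram values (abbreviated as xxx above) are only collected when the setting is enabled before the ANALYZE statement runs; a minimal sketch:

// Sketch: enable equi-height histogram collection, then (re)run the column analysis.
// The number of bins is controlled by spark.sql.statistics.histogram.numBins.
spark.conf.set("spark.sql.statistics.histogram.enabled", "true")
spark.sql("ANALYZE TABLE user_part COMPUTE STATISTICS FOR COLUMNS id, salary")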
describe extended a;
describe extended b;
describe extended b partition (dt=20200305);