Note: the code below uses Scala 2.12.12, and the Spark version is the latest release, 3.0.1.
/**
* Returns a sort expression based on ascending order of the column.
* {
{
{
* df.sort(asc("dept"), desc("age"))
* }}}
*
* @group sort_funcs
* @since 1.3.0
*/
def asc(columnName: String): Column = Column(columnName).asc
/**
* Returns a sort expression based on ascending order of the column,
* and null values return before non-null values.
* {
{
{
* df.sort(asc_nulls_first("dept"), desc("age"))
* }}}
*
* @group sort_funcs
* @since 2.1.0
*/
def asc_nulls_first(columnName: String): Column = Column(columnName).asc_nulls_first
/**
* Returns a sort expression based on ascending order of the column,
* and null values appear after non-null values.
* {
{
{
* df.sort(asc_nulls_last("dept"), desc("age"))
* }}}
*
* @group sort_funcs
* @since 2.1.0
*/
def asc_nulls_last(columnName: String): Column = Column(columnName).asc_nulls_last
/**
* Returns a sort expression based on the descending order of the column.
* {
{
{
* df.sort(asc("dept"), desc("age"))
* }}}
*
* @group sort_funcs
* @since 1.3.0
*/
def desc(columnName: String): Column = Column(columnName).desc
/**
* Returns a sort expression based on the descending order of the column,
* and null values appear before non-null values.
* {
{
{
* df.sort(asc("dept"), desc_nulls_first("age"))
* }}}
*
* @group sort_funcs
* @since 2.1.0
*/
def desc_nulls_first(columnName: String): Column = Column(columnName).desc_nulls_first
/**
* Returns a sort expression based on the descending order of the column,
* and null values appear after non-null values.
* {
{
{
* df.sort(asc("dept"), desc_nulls_last("age"))
* }}}
*
* @group sort_funcs
* @since 2.1.0
*/
def desc_nulls_last(columnName: String): Column = Column(columnName).desc_nulls_last
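These six helpers compose directly inside `sort`/`orderBy`. Below is a minimal usage sketch of null placement; the `dept`/`age` DataFrame and the local SparkSession are illustrative, not part of the source above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{asc_nulls_first, desc}

val spark = SparkSession.builder().master("local[*]").appName("functions-demo").getOrCreate()
import spark.implicits._

// dept is nullable; age is not.
val df = Seq((Some("IT"), 30), (None, 25), (Some("HR"), 41)).toDF("dept", "age")

// Null depts sort to the top; within a dept, oldest first.
df.sort(asc_nulls_first("dept"), desc("age")).show()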
/**
* @group agg_funcs
* @since 1.3.0
*/
@deprecated("Use approx_count_distinct", "2.1.0")
def approxCountDistinct(e: Column): Column = approx_count_distinct(e)
/**
* @group agg_funcs
* @since 1.3.0
*/
@deprecated("Use approx_count_distinct", "2.1.0")
def approxCountDistinct(columnName: String): Column = approx_count_distinct(columnName)
/**
* @group agg_funcs
* @since 1.3.0
*/
@deprecated("Use approx_count_distinct", "2.1.0")
def approxCountDistinct(e: Column, rsd: Double): Column = approx_count_distinct(e, rsd)
/**
* @group agg_funcs
* @since 1.3.0
*/
@deprecated("Use approx_count_distinct", "2.1.0")
def approxCountDistinct(columnName: String, rsd: Double): Column = {
approx_count_distinct(Column(columnName), rsd)
}
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
* @group agg_funcs
* @since 2.1.0
*/
def approx_count_distinct(e: Column): Column = withAggregateFunction {
HyperLogLogPlusPlus(e.expr)
}
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
* @group agg_funcs
* @since 2.1.0
*/
def approx_count_distinct(columnName: String): Column = approx_count_distinct(column(columnName))
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
* @param rsd maximum relative standard deviation allowed (default = 0.05)
*
* @group agg_funcs
* @since 2.1.0
*/
def approx_count_distinct(e: Column, rsd: Double): Column = withAggregateFunction {
HyperLogLogPlusPlus(e.expr, rsd, 0, 0)
}
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
* @param rsd maximum relative standard deviation allowed (default = 0.05)
*
* @group agg_funcs
* @since 2.1.0
*/
def approx_count_distinct(columnName: String, rsd: Double): Column = {
approx_count_distinct(Column(columnName), rsd)
}
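A quick sketch of both arities, assuming the `spark` session from the sketch above; the `user_id` column is illustrative. Lowering `rsd` below the 0.05 default trades extra HyperLogLog++ memory for accuracy.

import org.apache.spark.sql.functions.{approx_count_distinct, col}
import spark.implicits._

val events = Seq(1, 2, 2, 3, 3, 3).toDF("user_id")

events.select(
  approx_count_distinct("user_id"),           // default rsd = 0.05
  approx_count_distinct(col("user_id"), 0.01) // tighter error bound, more memory
).show()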
/**
* Aggregate function: returns the average of the values in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def avg(e: Column): Column = withAggregateFunction { Average(e.expr) }
/**
* Aggregate function: returns the average of the values in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def avg(columnName: String): Column = avg(Column(columnName))
/**
* Aggregate function: returns a list of objects with duplicates.
*
* @note The function is non-deterministic because the order of collected results depends
* on the order of the rows which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.6.0
*/
def collect_list(e: Column): Column = withAggregateFunction { CollectList(e.expr) }
/**
* Aggregate function: returns a list of objects with duplicates.
*
* @note The function is non-deterministic because the order of collected results depends
* on the order of the rows which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.6.0
*/
def collect_list(columnName: String): Column = collect_list(Column(columnName))
/**
* Aggregate function: returns a set of objects with duplicate elements eliminated.
*
* @note The function is non-deterministic because the order of collected results depends
* on the order of the rows which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.6.0
*/
def collect_set(e: Column): Column = withAggregateFunction { CollectSet(e.expr) }
/**
* Aggregate function: returns a set of objects with duplicate elements eliminated.
*
* @note The function is non-deterministic because the order of collected results depends
* on the order of the rows which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.6.0
*/
def collect_set(columnName: String): Column = collect_set(Column(columnName))
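A minimal sketch contrasting the two collectors (illustrative data, `spark` session assumed in scope):

import org.apache.spark.sql.functions.{collect_list, collect_set}
import spark.implicits._

val people = Seq(("IT", "ann"), ("IT", "bob"), ("IT", "ann")).toDF("dept", "name")

people.groupBy("dept").agg(
  collect_list("name"), // keeps duplicates, e.g. [ann, bob, ann] (order not guaranteed)
  collect_set("name")   // duplicates removed, e.g. [ann, bob]
).show(truncate = false)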
/**
* Aggregate function: returns the Pearson Correlation Coefficient for two columns.
*
* @group agg_funcs
* @since 1.6.0
*/
def corr(column1: Column, column2: Column): Column = withAggregateFunction {
Corr(column1.expr, column2.expr)
}
/**
* Aggregate function: returns the Pearson Correlation Coefficient for two columns.
*
* @group agg_funcs
* @since 1.6.0
*/
def corr(columnName1: String, columnName2: String): Column = {
corr(Column(columnName1), Column(columnName2))
}
/**
* Aggregate function: returns the number of items in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def count(e: Column): Column = withAggregateFunction {
  e.expr match {
    // Turn count(*) into count(1)
    case s: Star => Count(Literal(1))
    case _ => Count(e.expr)
  }
}
/**
* Aggregate function: returns the number of items in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def count(columnName: String): TypedColumn[Any, Long] =
  count(Column(columnName)).as(ExpressionEncoder[Long]())
/**
* Aggregate function: returns the number of distinct items in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
@scala.annotation.varargs
def countDistinct(expr: Column, exprs: Column*): Column =
  // For usage like countDistinct("*"), we should let analyzer expand star and
  // resolve function.
  Column(UnresolvedFunction("count", (expr +: exprs).map(_.expr), isDistinct = true))
/**
* Aggregate function: returns the number of distinct items in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
@scala.annotation.varargs
def countDistinct(columnName: String, columnNames: String*): Column =
  countDistinct(Column(columnName), columnNames.map(Column.apply) : _*)
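A sketch of plain versus distinct counting on illustrative data; note how count("*") goes through the Star case above and becomes count(1):

import org.apache.spark.sql.functions.{count, countDistinct}
import spark.implicits._

val depts = Seq("IT", "IT", "HR").toDF("dept")

depts.select(
  count("*"),           // 3 -- rewritten to count(1) internally
  countDistinct("dept") // 2
).show()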
/**
* Aggregate function: returns the population covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_pop(column1: Column, column2: Column): Column = withAggregateFunction {
CovPopulation(column1.expr, column2.expr)
}
/**
* Aggregate function: returns the population covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_pop(columnName1: String, columnName2: String): Column = {
covar_pop(Column(columnName1), Column(columnName2))
}
/**
* Aggregate function: returns the sample covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_samp(column1: Column, column2: Column): Column = withAggregateFunction {
CovSample(column1.expr, column2.expr)
}
/**
* Aggregate function: returns the sample covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_samp(columnName1: String, columnName2: String): Column = {
covar_samp(Column(columnName1), Column(columnName2))
}
/**
* Aggregate function: returns the first value in a group.
*
* The function by default returns the first value it sees. It will return the first non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 2.0.0
*/
def first(e: Column, ignoreNulls: Boolean): Column = withAggregateFunction {
First(e.expr, ignoreNulls)
}
/**
* Aggregate function: returns the first value of a column in a group.
*
* The function by default returns the first value it sees. It will return the first non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 2.0.0
*/
def first(columnName: String, ignoreNulls: Boolean): Column = {
first(Column(columnName), ignoreNulls)
}
/**
* Aggregate function: returns the first value in a group.
*
* The function by default returns the first value it sees. It will return the first non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.3.0
*/
def first(e: Column): Column = first(e, ignoreNulls = false)
/**
* Aggregate function: returns the first value of a column in a group.
*
* The function by default returns the first value it sees. It will return the first non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.3.0
*/
def first(columnName: String): Column = first(Column(columnName))
/**
* Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
* or not, returns 1 for aggregated or 0 for not aggregated in the result set.
*
* @group agg_funcs
* @since 2.0.0
*/
def grouping(e: Column): Column = Column(Grouping(e.expr))
/**
* Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
* or not, returns 1 for aggregated or 0 for not aggregated in the result set.
*
* @group agg_funcs
* @since 2.0.0
*/
def grouping(columnName: String): Column = grouping(Column(columnName))
/**
* Aggregate function: returns the level of grouping, equals to
*
* {
{
{
* (grouping(c1) <<; (n-1)) + (grouping(c2) <<; (n-2)) + ... + grouping(cn)
* }}}
*
* @note The list of columns should match the grouping columns exactly, or be empty (meaning all
* the grouping columns).
*
* @group agg_funcs
* @since 2.0.0
*/
def grouping_id(cols: Column*): Column = Column(GroupingID(cols.map(_.expr)))
/**
* Aggregate function: returns the level of grouping, equals to
*
* {
{
{
* (grouping(c1) <<; (n-1)) + (grouping(c2) <<; (n-2)) + ... + grouping(cn)
* }}}
*
* @note The list of columns should match the grouping columns exactly.
*
* @group agg_funcs
* @since 2.0.0
*/
def grouping_id(colName: String, colNames: String*): Column = {
grouping_id((Seq(colName) ++ colNames).map(n => Column(n)) : _*)
}
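These are only meaningful under cube/rollup or grouping sets. A sketch with illustrative columns, spelling out the bit formula for n = 2 grouping columns:

import org.apache.spark.sql.functions.{grouping, grouping_id, sum}
import spark.implicits._

val sales = Seq(("IT", "US", 10), ("IT", "EU", 20), ("HR", "US", 5)).toDF("dept", "region", "amount")

// grouping_id = (grouping(dept) << 1) + grouping(region), so
// 0 = fully grouped, 1 = region rolled up, 2 = dept rolled up, 3 = grand total.
sales.cube("dept", "region")
  .agg(sum("amount"), grouping("dept"), grouping_id().as("gid"))
  .orderBy("gid")
  .show()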
/**
* Aggregate function: returns the kurtosis of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def kurtosis(e: Column): Column = withAggregateFunction { Kurtosis(e.expr) }
/**
* Aggregate function: returns the kurtosis of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def kurtosis(columnName: String): Column = kurtosis(Column(columnName))
/**
* Aggregate function: returns the last value in a group.
*
* The function by default returns the last value it sees. It will return the last non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 2.0.0
*/
def last(e: Column, ignoreNulls: Boolean): Column = withAggregateFunction {
new Last(e.expr, ignoreNulls)
}
/**
* Aggregate function: returns the last value of the column in a group.
*
* The function by default returns the last value it sees. It will return the last non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 2.0.0
*/
def last(columnName: String, ignoreNulls: Boolean): Column = {
last(Column(columnName), ignoreNulls)
}
/**
* Aggregate function: returns the last value in a group.
*
* The function by default returns the last value it sees. It will return the last non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.3.0
*/
def last(e: Column): Column = last(e, ignoreNulls = false)
/**
* Aggregate function: returns the last value of the column in a group.
*
* The function by default returns the last value it sees. It will return the last non-null
* value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
*
* @note The function is non-deterministic because its result depends on the order of the rows,
* which may be non-deterministic after a shuffle.
*
* @group agg_funcs
* @since 1.3.0
*/
def last(columnName: String): Column = last(Column(columnName), ignoreNulls = false)
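A sketch of ignoreNulls on illustrative data; without an explicit ordering, which row counts as "first" or "last" is not guaranteed after a shuffle:

import org.apache.spark.sql.functions.{first, last}
import spark.implicits._

val vals = Seq(("a", None), ("a", Some(1)), ("a", Some(2))).toDF("k", "v")

vals.groupBy("k").agg(
  first("v"),                     // may be null: whichever row arrives first wins
  first("v", ignoreNulls = true), // first non-null value seen
  last("v", ignoreNulls = true)   // last non-null value seen
).show()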
/**
* Aggregate function: returns the maximum value of the expression in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def max(e: Column): Column = withAggregateFunction { Max(e.expr) }
/**
* Aggregate function: returns the maximum value of the column in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def max(columnName: String): Column = max(Column(columnName))
/**
* Aggregate function: returns the average of the values in a group.
* Alias for avg.
*
* @group agg_funcs
* @since 1.4.0
*/
def mean(e: Column): Column = avg(e)
/**
* Aggregate function: returns the average of the values in a group.
* Alias for avg.
*
* @group agg_funcs
* @since 1.4.0
*/
def mean(columnName: String): Column = avg(columnName)
/**
* Aggregate function: returns the minimum value of the expression in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def min(e: Column): Column = withAggregateFunction { Min(e.expr) }
/**
* Aggregate function: returns the minimum value of the column in a group.
*
* @group agg_funcs
* @since 1.3.0
*/
def min(columnName: String): Column = min(Column(columnName))
/**
* Aggregate function: returns the skewness of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def skewness(e: Column): Column = withAggregateFunction { Skewness(e.expr) }
/**
* Aggregate function: returns the skewness of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def skewness(columnName: String): Column = skewness(Column(columnName))
/**
* Aggregate function: alias for `stddev_samp`.
*
* @group agg_funcs
* @since 1.6.0
*/
def stddev(e: Column): Column = withAggregateFunction { StddevSamp(e.expr) }
/**
* Aggregate function: alias for `stddev_samp`.
*
* @group agg_funcs
* @since 1.6.0
*/
def stddev(columnName: String): Column = stddev(Column(columnName))
/**
* Aggregate function: returns the sample standard deviation of
* the expression in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def stddev_samp(e: Column): Column = withAggregateFunction { StddevSamp(e.expr) }
/**
* Aggregate function: returns the sample standard deviation of
* the expression in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def stddev_samp(columnName: String): Column = stddev_samp(Column(columnName))
/**
* Aggregate function: returns the population standard deviation of
* the expression in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def stddev_pop(e: Column): Column = withAggregateFunction { StddevPop(e.expr) }
/**
* Aggregate function: returns the population standard deviation of
* the expression in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def stddev_pop(columnName: String): Column = stddev_pop(Column(columnName))
/**
* Aggregate function: returns the sum of all values in the expression.
*
* @group agg_funcs
* @since 1.3.0
*/
def sum(e: Column): Column = withAggregateFunction { Sum(e.expr) }
/**
* Aggregate function: returns the sum of all values in the given column.
*
* @group agg_funcs
* @since 1.3.0
*/
def sum(columnName: String): Column = sum(Column(columnName))
/**
* Aggregate function: returns the sum of distinct values in the expression.
*
* @group agg_funcs
* @since 1.3.0
*/
def sumDistinct(e: Column): Column = withAggregateFunction(Sum(e.expr), isDistinct = true)
/**
* Aggregate function: returns the sum of distinct values in the expression.
*
* @group agg_funcs
* @since 1.3.0
*/
def sumDistinct(columnName: String): Column = sumDistinct(Column(columnName))
/**
* Aggregate function: alias for `var_samp`.
*
* @group agg_funcs
* @since 1.6.0
*/
def variance(e: Column): Column = withAggregateFunction { VarianceSamp(e.expr) }
/**
* Aggregate function: alias for `var_samp`.
*
* @group agg_funcs
* @since 1.6.0
*/
def variance(columnName: String): Column = variance(Column(columnName))
/**
* Aggregate function: returns the unbiased variance of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def var_samp(e: Column): Column = withAggregateFunction { VarianceSamp(e.expr) }
/**
* Aggregate function: returns the unbiased variance of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def var_samp(columnName: String): Column = var_samp(Column(columnName))
/**
* Aggregate function: returns the population variance of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def var_pop(e: Column): Column = withAggregateFunction { VariancePop(e.expr) }
/**
* Aggregate function: returns the population variance of the values in a group.
*
* @group agg_funcs
* @since 1.6.0
*/
def var_pop(columnName: String): Column = var_pop(Column(columnName))
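A sketch contrasting the sample and population variants on illustrative data (the sample versions divide by n - 1, the population versions by n):

import org.apache.spark.sql.functions.{stddev_pop, stddev_samp, var_pop, var_samp}
import spark.implicits._

val nums = Seq(1.0, 2.0, 3.0, 4.0).toDF("x")

// Sum of squared deviations is 5, so var_samp = 5/3 ≈ 1.667 and var_pop = 5/4 = 1.25.
nums.select(var_samp("x"), var_pop("x"), stddev_samp("x"), stddev_pop("x")).show()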
/**
* Window function: returns the cumulative distribution of values within a window partition,
* i.e. the fraction of rows that are below the current row.
*
* {
{
{
* N = total number of rows in the partition
* cumeDist(x) = number of values before (and including) x / N
* }}}
*
* @group window_funcs
* @since 1.6.0
*/
def cume_dist(): Column = withExpr { new CumeDist }
/**
* Window function: returns the rank of rows within a window partition, without any gaps.
*
* The difference between rank and dense_rank is that dense_rank leaves no gaps in the ranking
* sequence when there are ties. That is, if you were ranking a competition using dense_rank
* and had three people tie for second place, you would say that all three were in second
* place and that the next person came in third. rank, by contrast, gives sequential numbers,
* so the person who finished just after the ties would register as coming in fifth.
*
* This is equivalent to the DENSE_RANK function in SQL.
*
* @group window_funcs
* @since 1.6.0
*/
def dense_rank(): Column = withExpr { new DenseRank }
/**
* Window function: returns the value that is `offset` rows before the current row, and
* `null` if there are fewer than `offset` rows before the current row. For example,
* an `offset` of one will return the previous row at any given point in the window partition.
*
* This is equivalent to the LAG function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lag(e: Column, offset: Int): Column = lag(e, offset, null)
/**
* Window function: returns the value that is `offset` rows before the current row, and
* `null` if there are fewer than `offset` rows before the current row. For example,
* an `offset` of one will return the previous row at any given point in the window partition.
*
* This is equivalent to the LAG function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lag(columnName: String, offset: Int): Column = lag(columnName, offset, null)
/**
* Window function: returns the value that is `offset` rows before the current row, and
* `defaultValue` if there are fewer than `offset` rows before the current row. For example,
* an `offset` of one will return the previous row at any given point in the window partition.
*
* This is equivalent to the LAG function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lag(columnName: String, offset: Int, defaultValue: Any): Column = {
lag(Column(columnName), offset, defaultValue)
}
/**
* Window function: returns the value that is `offset` rows before the current row, and
* `defaultValue` if there are fewer than `offset` rows before the current row. For example,
* an `offset` of one will return the previous row at any given point in the window partition.
*
* This is equivalent to the LAG function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lag(e: Column, offset: Int, defaultValue: Any): Column = withExpr {
Lag(e.expr, Literal(offset), Literal(defaultValue))
}
/**
* Window function: returns the value that is `offset` rows after the current row, and
* `null` if there are fewer than `offset` rows after the current row. For example,
* an `offset` of one will return the next row at any given point in the window partition.
*
* This is equivalent to the LEAD function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lead(columnName: String, offset: Int): Column = lead(columnName, offset, null)
/**
* Window function: returns the value that is `offset` rows after the current row, and
* `null` if there are fewer than `offset` rows after the current row. For example,
* an `offset` of one will return the next row at any given point in the window partition.
*
* This is equivalent to the LEAD function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lead(e: Column, offset: Int): Column = lead(e, offset, null)
/**
* Window function: returns the value that is `offset` rows after the current row, and
* `defaultValue` if there are fewer than `offset` rows after the current row. For example,
* an `offset` of one will return the next row at any given point in the window partition.
*
* This is equivalent to the LEAD function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lead(columnName: String, offset: Int, defaultValue: Any): Column = {
lead(Column(columnName), offset, defaultValue)
}
/**
* Window function: returns the value that is `offset` rows after the current row, and
* `defaultValue` if there are fewer than `offset` rows after the current row. For example,
* an `offset` of one will return the next row at any given point in the window partition.
*
* This is equivalent to the LEAD function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def lead(e: Column, offset: Int, defaultValue: Any): Column = withExpr {
Lead(e.expr, Literal(offset), Literal(defaultValue))
}
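lag and lead only work over a window with an ordering. A minimal sketch on an illustrative time series:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, lag, lead}
import spark.implicits._

val ticks = Seq(("A", 1, 10), ("A", 2, 12), ("A", 3, 9)).toDF("sym", "t", "price")
val w = Window.partitionBy("sym").orderBy("t")

ticks.select(
  col("sym"), col("t"), col("price"),
  lag("price", 1).over(w),     // previous price; null on the first row
  lead("price", 1, -1).over(w) // next price; defaultValue -1 past the last row
).show()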
/**
* Window function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window
* partition. For example, if `n` is 4, the first quarter of the rows will get value 1, the second
* quarter will get 2, the third quarter will get 3, and the last quarter will get 4.
*
* This is equivalent to the NTILE function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def ntile(n: Int): Column = withExpr { new NTile(Literal(n)) }
/**
* Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
*
* This is computed by:
* {
{
{
* (rank of row in its partition - 1) / (number of rows in the partition - 1)
* }}}
*
* This is equivalent to the PERCENT_RANK function in SQL.
*
* @group window_funcs
* @since 1.6.0
*/
def percent_rank(): Column = withExpr { new PercentRank }
/**
* Window function: returns the rank of rows within a window partition.
*
* The difference between rank and dense_rank is that dense_rank leaves no gaps in the ranking
* sequence when there are ties. That is, if you were ranking a competition using dense_rank
* and had three people tie for second place, you would say that all three were in second
* place and that the next person came in third. rank, by contrast, gives sequential numbers,
* so the person who finished just after the ties would register as coming in fifth.
*
* This is equivalent to the RANK function in SQL.
*
* @group window_funcs
* @since 1.4.0
*/
def rank(): Column = withExpr { new Rank }
/**
* Window function: returns a sequential number starting at 1 within a window partition.
*
* @group window_funcs
* @since 1.6.0
*/
def row_number(): Column = withExpr { RowNumber() }
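A sketch putting the ranking family side by side over one ordered window, on illustrative data (a single global window is fine for a demo, but it pulls all rows into one partition):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, cume_dist, dense_rank, ntile, percent_rank, rank, row_number}
import spark.implicits._

val scores = Seq(("a", 90), ("b", 90), ("c", 85), ("d", 80)).toDF("name", "score")
val w = Window.orderBy(col("score").desc)

scores.select(
  col("name"), col("score"),
  rank().over(w),         // 1, 1, 3, 4 -- gap after the tie
  dense_rank().over(w),   // 1, 1, 2, 3 -- no gap
  row_number().over(w),   // 1, 2, 3, 4 -- tie broken arbitrarily
  percent_rank().over(w), // (rank - 1) / (rows - 1)
  cume_dist().over(w),    // fraction of rows at or before the current one
  ntile(2).over(w)        // 1, 1, 2, 2
).show()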
//
// Non-aggregate functions
//
/**
* Creates a new array column. The input columns must all have the same data type.
*
* @group normal_funcs
* @since 1.4.0
*/
@scala.annotation.varargs
def array(cols: Column*): Column = withExpr { CreateArray(cols.map(_.expr)) }
/**
* Creates a new array column. The input columns must all have the same data type.
*
* @group normal_funcs
* @since 1.4.0
*/
@scala.annotation.varargs
def array(colName: String, colNames: String*): Column = {
array((colName +: colNames).map(col) : _*)
}
/**
* Creates a new map column. The input columns must be grouped as key-value pairs, e.g.
* (key1, value1, key2, value2, ...). The key columns must all have the same data type, and can't
* be null. The value columns must all have the same data type.
*
* @group normal_funcs
* @since 2.0.0
*/
@scala.annotation.varargs
def map(cols: Column*): Column = withExpr { CreateMap(cols.map(_.expr)) }
/**
* Creates a new map column. The array in the first column is used for keys. The array in the
* second column is used for values. All elements in the array for key should not be null.
*
* @group normal_funcs
* @since 2.4.0
*/
def map_from_arrays(keys: Column, values: Column): Column = withExpr {
MapFromArrays(keys.expr, values.expr)
}
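A sketch of the three constructors with illustrative columns:

import org.apache.spark.sql.functions.{array, col, lit, map, map_from_arrays}
import spark.implicits._

val ab = Seq((1, 2)).toDF("a", "b")

ab.select(
  array("a", "b"),                             // [1, 2]
  map(lit("x"), col("a"), lit("y"), col("b")), // {x -> 1, y -> 2}
  map_from_arrays(array(lit("x"), lit("y")), array(col("a"), col("b")))
).show(truncate = false)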
/**
* Marks a DataFrame as small enough for use in broadcast joins.
*
* The following example marks the right DataFrame for broadcast hash join using `joinKey`.
* {
{
{
* // left and right are DataFrames
* left.join(broadcast(right), "joinKey")
* }}}
*
* @group normal_funcs
* @since 1.5.0
*/
def broadcast[T](df: Dataset[T]): Dataset[T] = {
  Dataset[T](df.sparkSession,
    ResolvedHint(df.logicalPlan, HintInfo(strategy = Some(BROADCAST))))(df.exprEnc)
}
/**
* Returns the first column that is not null, or null if all inputs are null.
*
* For example, `coalesce(a, b, c)` will return a if a is not null,
* or b if a is null and b is not null, or c if both a and b are null but c is not null.
*
* @group normal_funcs
* @since 1.3.0
*/
@scala.annotation.varargs
def coalesce(e: Column*): Column = withExpr { Coalesce(e.map(_.expr)) }
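A sketch with a literal fallback, on illustrative data:

import org.apache.spark.sql.functions.{coalesce, col, lit}
import spark.implicits._

val ab = Seq((Some(1), None: Option[Int]), (None: Option[Int], Some(2))).toDF("a", "b")

// First non-null among a and b, falling back to a constant 0.
ab.select(coalesce(col("a"), col("b"), lit(0))).show()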
/**
* Creates a string column for the file name of the current Spark task.
*
* @group normal_funcs
* @since 1.6.0
*/
def input_file_name(): Column = withExpr { InputFileName() }
/**
* Return true iff the column is NaN.
*
* @group normal_funcs
* @since 1.6.0
*/
def isnan(e: Column): Column = withExpr { IsNaN(e.expr) }
/**
* Return true iff the column is null.
*
* @group normal_funcs
* @since 1.6.0
*/
def isnull(e: Column): Column = withExpr { IsNull(e.expr) }
/**
* A column expression that generates monotonically increasing 64-bit integers.
*
* The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
* The current implementation puts the partition ID in the upper 31 bits, and the record number
* within each partition in the lower 33 bits. The assumption is that the data frame has
* less than 1 billion partitions, and each partition has less than 8 billion records.
*
* As an example, consider a `DataFrame` with two partitions, each with 3 records.
* This expression would return the following IDs:
*
* {
{
{
* 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
* }}}
*
* @group normal_funcs
* @since 1.4.0
*/
@deprecated("Use monotonically_increasing_id()", "2.0.0")
def monotonicallyIncreasingId(): Column = monotonically_increasing_id()
/**
* A column expression that generates monotonically increasing 64-bit integers.
*
* The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
* The current implementation puts the partition ID in the upper 31 bits, and the record number
* within each partition in the lower 33 bits. The assumption is that the data frame has
* less than 1 billion partitions, and each partition has less than 8 billion records.
*
* As an example, consider a `DataFrame` with two partitions, each with 3 records.
* This expression would return the following IDs:
*
* {
{
{
* 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
* }}}
*
* @group normal_funcs
* @since 1.6.0
*/
def monotonically_increasing_id(): Column = withExpr { MonotonicallyIncreasingID() }
/**
* Returns col1 if it is not NaN, or col2 if col1 is NaN.
*
* Both inputs should be floating point columns (DoubleType or FloatType).
*
* @group normal_funcs
* @since 1.5.0
*/
def nanvl(col1: Column, col2: Column): Column = withExpr { NaNvl(col1.expr, col2.expr) }
/**
* Unary minus, i.e. negate the expression.
* {
{
{
* // Select the amount column and negates all values.
* // Scala:
* df.select( -df("amount") )
*
* // Java:
* df.select( negate(df.col("amount")) );
* }}}
*
* @group normal_funcs
* @since 1.3.0
*/
def negate(e: Column): Column = -e
/**
* Inversion of boolean expression, i.e. NOT.
* {
{
{
* // Scala: select rows that are not active (isActive === false)
* df.filter( !df("isActive") )
*
* // Java:
* df.filter( not(df.col("isActive")) );
* }}}
*
* @group normal_funcs
* @since 1.3.0
*/
def not(e: Column): Column = !e
/**
* Generate a random column with independent and identically distributed (i.i.d.) samples
* uniformly distributed in [0.0, 1.0).
*
* @note The function is non-deterministic in the general case.
*
* @group normal_funcs
* @since 1.4.0
*/
def rand(seed: Long): Column = withExpr { Rand(seed) }
/**
* Generate a random column with independent and identically distributed (i.i.d.) samples
* uniformly distributed in [0.0, 1.0).
*
* @note The function is non-deterministic in the general case.
*
* @group normal_funcs
* @since 1.4.0
*/
def rand(): Column = rand(Utils.random.nextLong)
/**
* Generate a column with independent and identically distributed (i.i.d.) samples from
* the standard normal distribution.
*
* @note The function is non-deterministic in the general case.
*
* @group normal_funcs
* @since 1.4.0
*/
def randn(seed: Long): Column = withExpr { Randn(seed) }
/**
* Generate a column with independent and identically distributed (i.i.d.) samples from
* the standard normal distribution.
*
* @note The function is non-deterministic in the general case.
*
* @group normal_funcs
* @since 1.4.0
*/
def randn(): Column = randn(Utils.random.nextLong)
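A sketch of the seeded forms; a fixed seed makes a run repeatable for a given partitioning, while the no-arg overloads draw a random seed:

import org.apache.spark.sql.functions.{rand, randn}

val ids = spark.range(5)

ids.select(
  rand(42), // uniform in [0.0, 1.0), seeded
  randn(42) // standard normal, seeded
).show()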
/**
* Partition ID.
*
* @note This is non-deterministic because it depends on data partitioning and task scheduling.
*
* @group normal_funcs
* @since 1.6.0
*/
def spark_partition_id(): Column = withExpr { SparkPartitionID() }
/**
* Computes the square root of the specified float value.
*
* @group math_funcs
* @since 1.3.0
*/
def sqrt(e: Column): Column = withExpr { Sqrt(e.expr) }
/**
* Computes the square root of the specified float value.
*
* @group math_funcs
* @since 1.5.0
*/
def sqrt(colName: String): Column = sqrt(Column(colName))
/**
* Creates a new struct column.
* If the input column is a column in a `DataFrame`, or a derived column expression
* that is named (i.e. aliased), its name would be retained as the StructField's name,
* otherwise, the newly generated StructField's name would be auto generated as
* `col` with a suffix `index + 1`, i.e. col1, col2, col3, ...
*
* @group normal_funcs
* @since 1.4.0
*/
@scala.annotation.varargs
def struct(cols: Column*): Column = withExpr { CreateStruct.create(cols.map(_.expr)) }
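A sketch of the field-naming rule with illustrative columns: named inputs keep their names, while the unnamed literal gets an auto-generated col3 (index + 1):

import org.apache.spark.sql.functions.{col, lit, struct}
import spark.implicits._

val ab = Seq((1, "x")).toDF("a", "b")

// Field names in the resulting struct: a, b, col3.
ab.select(struct(col("a"), col("b"), lit(3))).printSchema()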
/**
* Creates a new struct column that composes multiple input columns.
*
* @group normal_funcs
* @since 1.4.0
*/
@scala.annotation.varargs
def struct(colName: String, colNames