vi employees
1,George,nan
2,honey,nv
3,georgedage,nan
4,kangkang,nv
hdfs dfs -mkdir /second
hdfs dfs -put employees /second/
create external table employees(emp_no int,emp_name String,emp_gender String)
row format delimited fields terminated by ","
location "/second";
+------+----------+----------+
|emp_no| emp_name|emp_gender|
+------+----------+----------+
| 1| George| nan|
| 2| honey| nv|
| 3|georgedage| nan|
| 4| kangkang| nv|
+------+----------+----------+
按照性别分组，并打印出每组的所有名字。
+----------+-----------------+
|emp_gender| name|
+----------+-----------------+
| nv| honey|kangkang|
| nan|George|georgedage|
+----------+-----------------+
详细行转列:
https://blog.csdn.net/qq_41946557/article/details/102904642
package com.henu
import org.apache.spark.sql.SparkSession
/**
 * Row-to-column demo: groups the `employees` Hive table by gender and
 * joins every name in a group into one "|"-separated string using
 * Hive's `collect_set` + `concat_ws`.
 */
object HiveDemo {
  def main(args: Array[String]): Unit = {
    // Local 4-core session with Hive support enabled
    // (hive-site.xml must be on the classpath / in resources).
    val spark = SparkSession.builder()
      .master("local[4]")
      .appName("hd")
      .enableHiveSupport()
      .getOrCreate()
    spark.sparkContext.setLogLevel("error")

    spark.sql("use spark")

    // collect_set de-duplicates names within each gender group;
    // concat_ws joins them into a single "|"-separated string.
    val rowToColumnQuery =
      "select emp_gender,concat_ws('|'," +
        "collect_set(employees.emp_name)) name from employees group by emp_gender"
    val result = spark.sql(rowToColumnQuery)

    result.show()
    spark.stop()
  }
}
+----------+-----------------+
|emp_gender| name|
+----------+-----------------+
| nv| honey|kangkang|
| nan|George|georgedage|
+----------+-----------------+
使用UDAF
package com.henu
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType}
/**
 * UDAF that concatenates all emp_name values within a group into a single
 * comma-separated string (e.g. "George,georgedage").
 *
 * Registered in SQL as "aggr" and used as: aggr(emp_name) ... group by emp_gender.
 */
class NameUDAF extends UserDefinedAggregateFunction {

  /** Separator placed between names in the aggregated string. */
  private val Sep = ","

  /**
   * Appends `next` to the accumulated string `acc`, skipping null/empty
   * pieces so no stray separators appear.
   *
   * Fixes a bug in the original code: `merge` unconditionally appended
   * "," + nowValue, so merging a non-empty buffer with an empty partition
   * buffer produced a trailing comma; a null input name caused an NPE-like
   * "null" artifact. Both `update` and `merge` now share this safe path.
   */
  private def append(acc: String, next: String): String = {
    if (next == null || next.isEmpty) acc
    else if (acc == null || acc.isEmpty) next
    else acc + Sep + next
  }

  // Input: one string column — the employee name.
  override def inputSchema: StructType =
    StructType(List(StructField("emp_name", StringType)))

  // Aggregation buffer: the partial comma-joined string.
  override def bufferSchema: StructType =
    StructType(List(StructField("emp_name", StringType)))

  // Final result type.
  override def dataType: DataType = StringType

  // Same input rows always produce the same output string.
  override def deterministic: Boolean = true

  // Each buffer starts as the empty string.
  override def initialize(buffer: MutableAggregationBuffer): Unit =
    buffer.update(0, "")

  // Per-partition step: fold one input row's name into the buffer.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    buffer.update(0, append(buffer.getString(0), input.getString(0)))

  // Cross-partition step: combine two partial buffers.
  // Empty pieces are skipped, so no leading/trailing separator can occur.
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    buffer1.update(0, append(buffer1.getString(0), buffer2.getString(0)))

  // The accumulated buffer string is the final aggregate value.
  override def evaluate(buffer: Row): Any = buffer.getString(0)
}
package com.henu
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
 * Demo of the custom NameUDAF: for each gender, prints the head count and
 * a comma-separated list of the employee names in that group.
 */
object HiveDemo2 {
  def main(args: Array[String]): Unit = {
    // Quiet Spark's internal logging down to warnings.
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession.builder()
      .master("local[4]")
      .appName("hive")
      .enableHiveSupport()
      .getOrCreate()

    // Expose the UDAF to Spark SQL under the name "aggr".
    spark.udf.register("aggr", new NameUDAF())
    spark.sql("use spark")

    // Count per gender and collect all names in the group via the UDAF.
    val byGender =
      spark.sql("select emp_gender,count(*) num,aggr(emp_name) names from employees group by emp_gender")

    byGender.show()
    spark.stop()
  }
}
+----------+---+-----------------+
|emp_gender|num| names|
+----------+---+-----------------+
| nv| 2| honey,kangkang|
| nan| 2|George,georgedage|
+----------+---+-----------------+
记得将 hive-site.xml 放到 resources 中。如果搭建的是高可用集群，则还需要参考以下链接进行额外配置:
https://blog.csdn.net/qq_41946557/article/details/103457503