Test data:
people.json
{"name":"andy"}
{"name":"len"}
{"name":"marry","age":19}
{"name":"tom","age":29}
{"name":"mike","age":39}
people.txt
1,tang1,10
2,tang2,20
3,tang3,30
4,tang4,40
5,tang5,50
package bi.tag
import java.util.Properties
import bi.utils.{ConfigUtils, KoboldAppUtil, StringUtil}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.functions.{count, sum}
import org.apache.spark.sql._
import org.slf4j.LoggerFactory
/*
* Created by tang on 2019-08-07
*/
object TestSparkSql {
Logger.getLogger("org").setLevel(Level.WARN)
val logger = LoggerFactory.getLogger(TestSparkSql.getClass.getSimpleName)
val startTime: String = "2019-07-15"
val endTime: String = "2019-07-21"
println(s"开始时间: ${startTime}")
println(s"结束时间: ${endTime}")
def main(args: Array[String]): Unit = {
val spark = KoboldAppUtil.createSparkContext(TestSparkSql.getClass.getSimpleName)
//Returns the underlying SparkContext, used to create RDDs and manage cluster resources
val sc = spark.sparkContext
println("---数据处理开始---")
testJDBC(spark)
// testLocalFile(spark)
// testLocalJsonFile(spark)
// testHive(spark)
println("---数据处理结束---")
spark.close()
}
/**
* Data source: JDBC
* @param spark
*/
def testJDBC(spark: SparkSession): Unit = {
// Read data from the MySQL instance on machine 1
// val spark = SparkSession.builder().appName("MysqlQueryDemo").master("local").getOrCreate() // this also works
val jdbcDF = spark.read
.format("jdbc")
.option("driver", "com.mysql.jdbc.Driver")
.option("url", "jdbc:mysql://111.111.111.111:3306/ln?useUnicode=true&characterEncoding=utf-8")
.option("dbtable", "S_T_GROUP")
.option("user", "ree")
.option("password", "ajdcy3ZDinqd")
.load()
jdbcDF.show()
jdbcDF.createTempView("t_tmp_group")
val results = spark.sql("SELECT GROUP_NAME FROM t_tmp_group")
results.show()
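// A hedged sketch (not from the original post): the same table can also be read in parallel by splitting on a numeric column.
// The column name "ID" and the bounds below are hypothetical; partitionColumn/lowerBound/upperBound/numPartitions are standard Spark JDBC options.
// val parallelDF = spark.read
//   .format("jdbc")
//   .option("driver", "com.mysql.jdbc.Driver")
//   .option("url", "jdbc:mysql://111.111.111.111:3306/ln?useUnicode=true&characterEncoding=utf-8")
//   .option("dbtable", "S_T_GROUP")
//   .option("user", "ree")
//   .option("password", "ajdcy3ZDinqd")
//   .option("partitionColumn", "ID") // hypothetical numeric column
//   .option("lowerBound", "1")
//   .option("upperBound", "100000")
//   .option("numPartitions", "4")
//   .load()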
//
// val tmpfiled2 = spark.sql(
// """
// |select brand,regionname,branchname,workno,employeename,entrydate,historyoffercount from t_cal_tmp1
// """.stripMargin //https://blog.csdn.net/weixin_38750084/article/details/99643789
// )
//Push to the MySQL instance on machine 2. Note: the user needs TRUNCATE permission to use results.write.mode(SaveMode.Overwrite), because Overwrite uses TRUNCATE under the hood
logger.info("--------------Push to MySQL on machine 2----------------")
val prop = new Properties()
prop.setProperty("user", ConfigUtils.mysqlUser)
prop.setProperty("password", ConfigUtils.mysqlPw)
prop.setProperty("driver", ConfigUtils.mysqlDriver)
results.write.mode(SaveMode.Append).jdbc(ConfigUtils.mysqlUrl, "sparksql_hive_test", prop)
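// A hedged alternative (not in the original code): with TRUNCATE permission on the target table, Overwrite plus the
// JDBC "truncate" option (available since Spark 2.1) empties the table instead of dropping and recreating it,
// so the existing MySQL table definition and indexes are preserved.
// results.write
//   .mode(SaveMode.Overwrite)
//   .option("truncate", "true")
//   .jdbc(ConfigUtils.mysqlUrl, "sparksql_hive_test", prop)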
}
/**
* Data source: a local txt file on Linux
*
* @param spark
*/
def testLocalFile(spark: SparkSession): Unit = {
import org.apache.spark.sql.types._
// Create an RDD
val peopleRDD = spark.sparkContext.textFile("file:///var/lb/hadfs/spde_sop_bj/bi_table/tang/qt/people.txt")
// The schema is encoded in a string
val schemaString = "name age"
// Generate the schema based on the string of schema
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
// Convert records of the RDD (people) to Rows
val rowRDD = peopleRDD
.map(_.split(","))
.map(attributes => Row(attributes(1), attributes(2).trim)) // people.txt rows are id,name,age; keep name and age
// Apply the schema to the RDD
val peopleDF = spark.createDataFrame(rowRDD, schema)
// Creates a temporary view using the DataFrame
peopleDF.createOrReplaceTempView("people")
// SQL can be run over a temporary view created using DataFrames
val results = spark.sql("SELECT name,age FROM people") //results 只能get(0),只有一列值
// val results = spark.sql("SELECT name FROM people")//results 只能get(0),只有一列值
// The results of SQL queries are DataFrames and support all the normal RDD operations
// The columns of a row in the result can be accessed by field index or by field name
// results.map(attributes => "Name: " + attributes(0)).show()
results.show()
results.filter(row => row.get(0) == "tang1").show()
results.filter(row => Integer.valueOf(row.get(1).toString) >= 20).show()
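// A hedged alternative (not in the original post): infer the schema by reflection with a case class instead of building a StructType by hand.
// The case class must live outside this method (e.g. at the top level of the object), otherwise Spark cannot find a TypeTag for it:
//   case class Person(id: String, name: String, age: Int)
// import spark.implicits._
// val peopleDF2 = peopleRDD
//   .map(_.split(","))
//   .map(a => Person(a(0), a(1), a(2).trim.toInt))
//   .toDF()
// peopleDF2.show()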
}
/**
* Data source: a local JSON file on Linux (an HDFS path works as well)
*
* @param spark
*/
// {"name":"Michael"}
// {"name":"Andy", "age":30}
// {"name":"Justin", "age":19}
// {"name":"tang", "age":27}
// {"name":"zha", "age":27}
def testLocalJsonFile(spark: SparkSession): Unit = {
val df = spark.read.json("file:///var/lb/hadoop-hdfs/spride_sp_bj/bi_table/tang/qt/people.json")
import spark.implicits._
// Displays the content of the DataFrame to stdout
df.show()
df.select("name").show()
df.select("name", "age").show()
df.filter($"age" > 21).show()
df.groupBy("age").count().show()
// Register the DataFrame as a SQL temporary view
df.createOrReplaceTempView("people")
val sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()
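// A hedged variation (not in the original post): supplying the schema up front spares Spark the extra pass it needs to infer it from the JSON file.
// import org.apache.spark.sql.types._
// val jsonSchema = StructType(Seq(
//   StructField("name", StringType, nullable = true),
//   StructField("age", LongType, nullable = true)))
// val dfWithSchema = spark.read.schema(jsonSchema).json("file:///var/lb/hadoop-hdfs/spride_sp_bj/bi_table/tang/qt/people.json")
// dfWithSchema.show()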
}
/**
* Data source: Hive
* @param spark
*/
def testHive(spark: SparkSession): Unit = {
val tmpfiled1 = spark.sql(
s"""
|SELECT
| case
| when instr(b.BranchNo,'AIN')>0
| THEN 'AIEN'
| WHEN instr(b.BranchNo,'WRE')>0
| THEN 'WRE'
| WHEN instr(b.BranchNo,'AE')>0
| THEN 'AE'
| ELSE '无'
| END brand,
| d.Name RegionName,
| c.Name BranchName,
| b.WorkNo,
| t1.EmployeeName,
| b.EntryDate,
| e.fillindate lastOfferDate,
| f.count historyOfferCount
|FROM
| (
| SELECT
| employeeno,
| employeename
| FROM
| ods.ods_ain_KPI_RecomRecords
| WHERE
| FillinDate>='${startTime}'
| AND FillinDate<='${endTime}'
| AND PhaseId=10
| AND Role='顾'
| AND IsApprove=1
| AND IsCancel IS NULL
| GROUP BY
| employeeno,
| employeename )t1
|LEFT JOIN
| ods.ods_ain_base_employees b
|ON
| t1.EmployeeNo=b.Number
|LEFT JOIN
| ods.ods_ain_base_branchs c
|ON
| b.BranchNo=c.Number
|LEFT JOIN
| ods.ods_ain_base_regions d
|LEFT JOIN
| (
| SELECT
| EmployeeNo,
| collect_list(FillinDate)[size(collect_list(FillinDate))-1] FillinDate
| FROM
| ods.ods_aen_KPI_RecomRecords
| WHERE
| FillinDate>='${startTime}'
| AND FillinDate<='${endTime}'
| AND PhaseId=10
| AND IsApprove=1
| AND IsCancel IS NULL
| GROUP BY
| EmployeeNo) e
|ON
| t1.EmployeeNo=e.EmployeeNo
|LEFT JOIN
| (
| SELECT
| EmployeeNo,
| COUNT(*) COUNT
| FROM
| ods.ods_aien_KPI_RecomRecords
| WHERE
| PhaseId=10
| AND IsApprove=1
| AND IsCancel IS NULL
| GROUP BY
| EmployeeNo) f
|ON
| t1.EmployeeNo=f.EmployeeNo
|WHERE
| d.ManageBranchNos LIKE concat(concat("%{",b.branchno),"}%")
|ORDER BY
| BranchName
|
| """.stripMargin
).createTempView("t_cal_tmp1")
val tmpfiled2 = spark.sql(
"""
|select brand,regionname,branchname,workno,employeename,entrydate,historyoffercount from t_cal_tmp1
""".stripMargin
)
//Create a temporary view
tmpfiled2.createTempView("t_cal_tmp2")
logger.info("--------------Register temporary view t_cal_tmp2----------------")
//Write to Hive: Overwrite replaces existing data, Append adds to it
logger.info("--------------Write to Hive: tmp.sparksql_hive_test----------------")
tmpfiled2.write.mode(SaveMode.Overwrite).saveAsTable("tmp.sparksql_hive_test")
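// A hedged note (not in the original code): saveAsTable creates or replaces the table from the DataFrame schema,
// while insertInto writes into an existing Hive table, matching columns by position:
// tmpfiled2.write.mode(SaveMode.Overwrite).insertInto("tmp.sparksql_hive_test")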
//Push to MySQL
logger.info("--------------Push to MySQL: sparksql_hive_test1----------------")
val prop = new Properties()
prop.setProperty("user", ConfigUtils.mysqlUser)
prop.setProperty("password", ConfigUtils.mysqlPw)
prop.setProperty("driver", ConfigUtils.mysqlDriver)
tmpfiled2.write.mode(SaveMode.Append).jdbc(ConfigUtils.mysqlUrl, "sparksql_hive_test1", prop)
val employees = spark.sql(
"""
|select * from ods.ods_ain_base_employees
""".stripMargin
)
logger.info("--------------DataFrame的使用:groupBy/agg/selectExpr--------------")
/**
* groupby为对原DataFrame进行打包分组,agg为聚合(其操作包括max、min、std、sum、count)。
* selectExpr:可以对指定字段进行特殊处理
*/
val achieve_test = spark.sql(
"""
|select * from tmp.achieve_test
""".stripMargin
)
/**
* selectExpr applies expressions to specific columns:
* it can call UDFs on them, assign aliases, etc. It takes String expressions and returns a DataFrame.
* Reference: https://www.cnblogs.com/rxingyue/p/7113235.html
*/
//Equivalent to: select workno,employeename,count(effectivedate) count,sum(amount) amount1 from tmp.achieve_test group by workno,employeename
val res: DataFrame = achieve_test
.groupBy("workno", "employeename")
.agg(count("effectivedate") as "count", sum("amount") as "amount1")
.selectExpr("workno", "employeename", "count", "amount1") //展示结果字段
.where(" employeename='晋平'")
// .filter("")
res.show();
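// A hedged equivalent (not in the original code): the same aggregation written with a Map;
// the result columns are then named count(effectivedate) and sum(amount).
// val resViaMap = achieve_test
//   .groupBy("workno", "employeename")
//   .agg(Map("effectivedate" -> "count", "amount" -> "sum"))
//   .where("employeename='晋平'")
// resViaMap.show()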
logger.info("--------------DataFrame:join--------------")
//import spark.implicits._ to enable implicit RDD-to-DataFrame conversions (and the $"col" syntax)
import spark.implicits._
val dataset: Dataset[Row] = achieve_test.join(employees, $"employeename" === $"fullname", "left").where("employeename ='王兆月'")
//Print all rows
dataset.show();
//Print a single column
logger.info("--------------Print column employeename--------------")
dataset.select("employeename").show();
dataset.select(dataset.col("employeename")).show();
logger.info("--------------输出amount>980--------------")
//判断amount>980
dataset.select(dataset.col("amount").gt(980)).show();
//判断amount>993
dataset.select(dataset.col("amount").geq(993)).show();
//将age += 100
logger.info("--------------将age += 100--------------")
dataset.select(dataset.col("amount").plus(100)).show();
//Print the schema (metadata) as a tree
dataset.printSchema();
logger.info("--------------直接注册udf并使用--------------")
spark.udf.register("convertkey", (office: String, team: String) => {
var key =""
try {
key = office + "_" + team
}catch {
case e:Exception => e.printStackTrace()
}
key //最后一行要将结果返回
})
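// A hedged DataFrame-API equivalent (not in the original code): functions.udf builds a typed UDF
// that can be applied to Columns directly, without registering it by name first.
import org.apache.spark.sql.functions.udf
val convertKeyUdf = udf((office: String, team: String) => office + "_" + team)
achieve_test.select(convertKeyUdf($"office", $"team").as("office_team")).show()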
val convertkey = spark.sql(
"""
|select convertkey(office,team) from tmp.achieve_test
""".stripMargin
)
//convertkey.createTempView("t_convertkey")
convertkey.show()
}
}