一、Create the test tables t_user2, user_t, and t_result
1、The t_user2 table is defined as follows:
CREATE TABLE `t_user2` (
  `id` int(11) DEFAULT NULL COMMENT 'id',
  `name` varchar(64) DEFAULT NULL COMMENT 'username',
  `password` varchar(64) DEFAULT NULL COMMENT 'password',
  `age` int(11) DEFAULT NULL COMMENT 'age'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2、The user_t table is defined as follows:
CREATE TABLE `user_t` (
  `id` int(11) DEFAULT NULL COMMENT 'id',
  `name` varchar(64) DEFAULT NULL COMMENT 'name',
  `password` varchar(64) DEFAULT NULL COMMENT 'password',
  `address` varchar(64) DEFAULT NULL COMMENT 'address',
  `age` int(11) DEFAULT NULL COMMENT 'age'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
3、The t_result table is defined as follows:
CREATE TABLE `t_result` (
  `id` int(11) DEFAULT NULL COMMENT 'id',
  `name` varchar(64) DEFAULT NULL COMMENT 'name',
  `password` varchar(64) DEFAULT NULL COMMENT 'password',
  `address` varchar(64) DEFAULT NULL COMMENT 'address',
  `age` int(11) DEFAULT NULL COMMENT 'age'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
4、Insert the test data:
INSERT INTO `t_user2` VALUES (12, 'cassie', '1234562', 25);
INSERT INTO `t_user2` VALUES (11, 'zhangs', '123456', 25);
INSERT INTO `t_user2` VALUES (23, 'zhangs', '2321312', 34);
INSERT INTO `t_user2` VALUES (22, 'tom', 'sadfdsa', 23);
INSERT INTO `user_t` VALUES (1, 'zhangs', '123456', NULL, 25);
INSERT INTO `user_t` VALUES (2, 'zhangs', '123456', NULL, 252);
二、Create a Maven project and add the MySQL driver jar and the Spark jars
mysql-connector-java-5.1.24.jar
spark-assembly-1.6.2-hadoop2.6.0.jar
spark-examples-1.6.2-hadoop2.6.0.jar
Note: the pom.xml is as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <artifactId>scala-test</artifactId>
    <groupId>com.scala.mytest</groupId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.7</java.version>
        <cdh.version>cdh5.8.0</cdh.version>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.binary.version>2.10</scala.binary.version>
        <scala.version>2.10.4</scala.version>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.24</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.7</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.9.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-streams</artifactId>
            <version>0.10.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>0.9.0.1</version>
            <exclusions>
                <exclusion>
                    <groupId>com.sun.jdmk</groupId>
                    <artifactId>jmxtools</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>com.sun.jmx</groupId>
                    <artifactId>jmxri</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>javax.jms</groupId>
                    <artifactId>jms</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.4.4</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.20</version>
        </dependency>
        <dependency>
            <groupId>commons-beanutils</groupId>
            <artifactId>commons-beanutils</artifactId>
            <version>1.7.0</version>
        </dependency>
        <dependency>
            <groupId>commons-collections</groupId>
            <artifactId>commons-collections</artifactId>
            <version>3.1</version>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.5</version>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.3</version>
        </dependency>
        <dependency>
            <groupId>net.sf.ezmorph</groupId>
            <artifactId>ezmorph</artifactId>
            <version>1.0.6</version>
        </dependency>
        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <classifier>jdk15</classifier>
        </dependency>
    </dependencies>
</project>
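Note that the Spark jars listed above are attached to the project by hand; the pom only manages the remaining dependencies. If you prefer to let Maven handle Spark as well, the Maven Central artifacts org.apache.spark:spark-core_2.10:1.6.2 and org.apache.spark:spark-sql_2.10:1.6.2 cover the APIs used in the code below, and the assembly jar then becomes unnecessary.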
三、Create the Scala class for the job (runnable locally or on the cluster):
SparkSqlMysqlDatasource.scala
package sql
import java.util.Properties
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Production: submit the job with
 * spark-submit --class sql.SparkSqlMysqlDatasource --master yarn-cluster --executor-memory 2G --num-executors 2 --driver-memory 1g --executor-cores 1 /data1/e_heyutao/sparktest/sparkEnn.jar
 */
object SparkSqlMysqlDatasource {
  // Database configuration
  lazy val url = "jdbc:mysql://your_ip:3306/my_test"
  lazy val username = "root"
  lazy val password = "secret_password"

  def main(args: Array[String]) {
    // val sparkConf = new SparkConf().setAppName("sparkSqlTest").setMaster("local[2]").set("spark.app.id", "sql")
    val sparkConf = new SparkConf().setAppName("sparkSqlTest").setMaster("yarn-cluster").set("spark.app.id", "sqlTest")
    // Serialization settings
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.set("spark.kryoserializer.buffer", "256m")
    sparkConf.set("spark.kryoserializer.buffer.max", "2046m")
sparkConf.set("spark.akka.frameSize", "500")
sparkConf.set("spark.rpc.askTimeout", "30")
    // Create the SparkContext
    val sc = new SparkContext(sparkConf)
    // Create the SQLContext
    val sqlContext = new SQLContext(sc)
    // Import implicits so Spark SQL built-in functions can be used
    import sqlContext.implicits._

    // Build the JDBC connection string
    val uri = url + "?user=" + username + "&password=" + password + "&useUnicode=true&characterEncoding=UTF-8"
    val prop = new Properties()
    // Note: when running on the cluster, this line is required; without it the
    // job fails with an error that the MySQL driver cannot be found
    prop.put("driver", "com.mysql.jdbc.Driver")
    // Load the MySQL tables
    val df_test1: DataFrame = sqlContext.read.jdbc(uri, "user_t", prop)
    val df_test2: DataFrame = sqlContext.read.jdbc(uri, "t_user2", prop)

    // Select the needed columns from the DataFrame
    df_test2.select("id", "name", "age").collect()
      .foreach(row => {
        println("id " + row(0) + " ,name " + row(1) + ", age " + row(2))
      })
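    // collect() pulls every row back to the driver, which is fine for this
    // small test set but should be avoided on large tables.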
    // Register a temporary table
    df_test1.registerTempTable("temp_table")
    val total_sql = "select * from temp_table"
    val total_df: DataFrame = sqlContext.sql(total_sql)

    // Write the result back to the database
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "secret_password")
    total_df.write.mode("append").jdbc("jdbc:mysql://your_ip:3306/my_test?useUnicode=true&characterEncoding=UTF-8", "t_result", properties)
    /**
     * Note: the DataFrameWriter source shows what each mode string means:
     * def mode(saveMode: String): DataFrameWriter = {
     *   this.mode = saveMode.toLowerCase match {
     *     case "overwrite" => SaveMode.Overwrite
     *     case "append" => SaveMode.Append
     *     case "ignore" => SaveMode.Ignore
     *     case "error" | "default" => SaveMode.ErrorIfExists
     *     case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " +
     *       "Accepted modes are 'overwrite', 'append', 'ignore', 'error'.")
     *   }
     * }
     */
    // Group by name and compute the average age
    total_df.groupBy("name").avg("age").collect().foreach(x => {
      println("name " + x(0))
      println("age " + x(1))
    })
  }
}
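The final groupBy("name").avg("age") is the DataFrame-API spelling of an ordinary SQL aggregate. A minimal equivalent sketch, assuming the sqlContext and the temp_table registered in the listing above:
    // Same aggregation expressed in SQL against the registered temp table;
    // for user_t, zhangs averages (25 + 252) / 2 = 138.5.
    val avg_df: DataFrame = sqlContext.sql(
      "select name, avg(age) from temp_table group by name")
    avg_df.collect().foreach(row =>
      println("name " + row(0) + ", avg age " + row(1)))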
Output:
id 12 ,name cassie, age 25
id 11 ,name zhangs, age 25
id 23 ,name zhangs, age 34
id 22 ,name tom, age 23
name zhangs
age 138.5
四、Inspect the t_result table: the rows just read from MySQL have been inserted into it
Everything above has been tested and works; adapt it as needed. If anything is unclear, please leave a comment. Thanks!