Preface:
Spark offers three ways to read data over JDBC. When the table is very large (hundreds of millions of rows), use one of the partitioned read methods; otherwise the single-partition read can easily fail with an OOM error.
The full pom dependencies are attached at the end of this post.
1. Method signature and parameters
url: the JDBC connection URL
table: the name of the table to read
properties: the connection properties
def jdbc(url: String, table: String, properties: Properties): DataFrame
2. Code
import java.util.Properties

import org.apache.spark.sql.SparkSession
import org.junit.Test

class demo03 {
  // Create the SparkSession entry point
  private val spark: SparkSession = SparkSession.builder().appName("mysql").master("local[4]").getOrCreate()
  // Import the implicit conversions
  import spark.implicits._
  // Connection settings for the JDBC source
  val url = "jdbc:mysql://node03:3306/spark01"
  val table = "student"
  val user = "root"
  val password = "123"
  private val prop = new Properties()
  prop.setProperty("user", user)
  prop.setProperty("password", password)

  @Test
  // First read method: the whole table lands in a single partition
  def first(): Unit = {
    spark.read
      // specifying the format is optional here; .jdbc() already targets the jdbc source
      .format("jdbc")
      .jdbc(url, table, prop)
      .show()
  }
}
1. Method signature
def jdbc(
    url: String,
    table: String,
    predicates: Array[String],
    connectionProperties: Properties): DataFrame
Parameter documentation:
* @param url JDBC database url of the form `jdbc:subprotocol:subname`
* @param table Name of the table in the external database.
* @param predicates Condition in the where clause for each partition.
* @param connectionProperties JDBC database connection arguments, a list of arbitrary string
* tag/value. Normally at least a "user" and "password" property
* should be included. "fetchsize" can be used to control the
* number of rows per fetch.
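The documentation above mentions that a "fetchsize" property can be included alongside "user" and "password" to control the number of rows fetched per round trip. A minimal sketch, assuming an illustrative batch size of 1000 (a value you would tune for your own driver and table):

// Sketch: the value 1000 is an assumption, not taken from the original example
val prop = new java.util.Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "123")
prop.setProperty("fetchsize", "1000")  // rows fetched per round trip, per the docs quoted above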
2. Code
import java.util.Properties

import org.apache.spark.sql.SparkSession
import org.junit.Test

class demo03 {
  // Create the SparkSession entry point
  private val spark: SparkSession = SparkSession.builder().appName("mysql").master("local[4]").getOrCreate()
  // Import the implicit conversions
  import spark.implicits._
  // Connection settings for the JDBC source
  val url = "jdbc:mysql://node03:3306/spark01"
  val table = "student"
  val user = "root"
  val password = "123"
  private val prop = new Properties()
  prop.setProperty("user", user)
  prop.setProperty("password", password)

  @Test
  // Second read method: one partition per predicate (a column of any type can be used)
  def second(): Unit = {
    // 1. Build the predicates: each WHERE condition becomes one partition (here, ranges of age)
    val predicates = Array(
      "age < 30",
      "age >= 30 and age < 60",
      "age >= 60"
    )
    spark.read
      .format("jdbc")
      .jdbc(url, table, predicates, prop)
      .show()
  }
}
1. Method signature
def jdbc(
    url: String,
    table: String,
    columnName: String,
    lowerBound: Long,
    upperBound: Long,
    numPartitions: Int,
    connectionProperties: Properties): DataFrame
Parameter documentation:
* @param url JDBC database url of the form `jdbc:subprotocol:subname`.
* @param table Name of the table in the external database.
* @param columnName the name of a column of integral type that will be used for partitioning.
* @param lowerBound the minimum value of `columnName` used to decide partition stride.
* @param upperBound the maximum value of `columnName` used to decide partition stride.
* @param numPartitions the number of partitions. This, along with `lowerBound` (inclusive),
* `upperBound` (exclusive), form partition strides for generated WHERE
* clause expressions used to split the column `columnName` evenly. When
* the input is less than 1, the number is set to 1.
* @param connectionProperties JDBC database connection arguments, a list of arbitrary string
* tag/value. Normally at least a "user" and "password" property
* should be included. "fetchsize" can be used to control the
* number of rows per fetch.
2. Code
import java.util.Properties

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.junit.Test

class demo03 {
  // Create the SparkSession entry point
  private val spark: SparkSession = SparkSession.builder().appName("mysql").master("local[4]").getOrCreate()
  // Import the implicit conversions
  import spark.implicits._
  // Connection settings for the JDBC source
  val url = "jdbc:mysql://node03:3306/spark01"
  val table = "student"
  val user = "root"
  val password = "123"
  private val prop = new Properties()
  prop.setProperty("user", user)
  prop.setProperty("password", password)

  @Test
  // Third read method: partition bounds determined dynamically
  def third(): Unit = {
    // Query the min and max of age so the partition bounds come from the data instead of being hard-coded
    val array: Array[Row] = spark.read
      .jdbc(url, "(select min(age) as minAge, max(age) as maxAge from student) as t1", prop)
      .collect()
    // Extract the bounds; the partition column must be an integral type
    val min = array.head.getAs[Int]("minAge")
    val max = array.head.getAs[Int]("maxAge")
    val df: DataFrame = spark.read
      .format("jdbc")
      .jdbc(url, table, "age", min, max, 10, prop)
    // Show the query result
    df.show()
    // Print the number of partitions
    println("number of partitions: " + df.rdd.partitions.size)
  }
}
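For reference, the same partitioned read can also be written with the generic option API of the DataFrame reader. A minimal sketch, reusing the connection values from the example above (the bounds 1 and 60 are illustrative; in practice derive them from the data as shown):

// Sketch of the equivalent option-based read; partitionColumn, lowerBound,
// upperBound and numPartitions must all be set together.
val df = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://node03:3306/spark01")
  .option("dbtable", "student")
  .option("user", "root")
  .option("password", "123")
  .option("partitionColumn", "age")  // numeric partition column
  .option("lowerBound", "1")
  .option("upperBound", "60")
  .option("numPartitions", "10")
  .load()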
1. How the partitions are planned from the specified minimum and maximum values (from the Spark source):
val stride: Long = upperBound / numPartitions - lowerBound / numPartitions
val column = partitioning.column
var i: Int = 0
var currentValue: Long = lowerBound
var ans = new ArrayBuffer[Partition]()
while (i < numPartitions) {
  val lBound = if (i != 0) s"$column >= $currentValue" else null
  currentValue += stride
  val uBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
  val whereClause =
    if (uBound == null) {
      lBound
    } else if (lBound == null) {
      s"$uBound or $column is null"
    } else {
      s"$lBound AND $uBound"
    }
  ans += JDBCPartition(whereClause, i)
  i = i + 1
}
Partitioning walkthrough, using lowerBound = 1, upperBound = 60 and numPartitions = 3 as an example:
stride = 60/3 - 1/3 = 20 - 0 = 20
column = "age"
currentValue = 1

i = 0 (0 < 3):
lBound = null
currentValue = currentValue + stride = 21
uBound = "age < 21"
whereClause = "age < 21 or age is null"

i = 1 (1 < 3):
lBound = "age >= 21"
currentValue = currentValue + stride = 21 + 20 = 41
uBound = "age < 41"
whereClause = "age >= 21 AND age < 41"

i = 2 (2 < 3):
lBound = "age >= 41"
currentValue = 41 + 20 = 61
uBound = null
whereClause = "age >= 41"
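The walkthrough can be reproduced with a few lines of standalone Scala. A sketch using the same illustrative values (lowerBound = 1, upperBound = 60, numPartitions = 3), independent of any database:

import scala.collection.mutable.ArrayBuffer

object PartitionPlanDemo extends App {
  val lowerBound = 1L
  val upperBound = 60L
  val numPartitions = 3
  val column = "age"

  val stride = upperBound / numPartitions - lowerBound / numPartitions  // 20
  var currentValue = lowerBound
  val clauses = new ArrayBuffer[String]()
  for (i <- 0 until numPartitions) {
    val lBound = if (i != 0) s"$column >= $currentValue" else null
    currentValue += stride
    val uBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
    val whereClause =
      if (uBound == null) lBound
      else if (lBound == null) s"$uBound or $column is null"
      else s"$lBound AND $uBound"
    clauses += whereClause
  }
  clauses.zipWithIndex.foreach { case (c, i) => println(s"partition $i: $c") }
}

Output:
partition 0: age < 21 or age is null
partition 1: age >= 21 AND age < 41
partition 2: age >= 41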
Writing a DataFrame to MySQL over JDBC:
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.types.{FloatType, IntegerType, StringType, StructField, StructType}

object mysql {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[4]").appName("mysql").getOrCreate()
    import spark.implicits._
    // Schema constraint for the source file; the fields must match the table created in MySQL
    val schema = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType),
      StructField("gpa", FloatType)
    ))
    // 1. Read the data
    val df = spark.read
      .schema(schema)
      .option("sep", "\t")
      .csv("dataset/student")
    // 2. Transform the data
    val source: Dataset[Row] = df.where('age > 20).where('age < 30)
    // 3. Write the result to MySQL
    source.write
      .format("jdbc")
      .mode("overwrite")
      .option("url", "jdbc:mysql://node03:3306/spark01")
      .option("dbtable", "student")
      .option("user", "root")
      .option("password", "123")
      .save()
  }
}
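To check that the write succeeded, the table can be read back with the single-partition jdbc() call shown earlier. A short sketch, reusing the connection values from the example above:

// Sketch: read the freshly written table back and show a few rows
val prop = new java.util.Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "123")
spark.read
  .jdbc("jdbc:mysql://node03:3306/spark01", "student", prop)
  .show(10)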
Note:
To run the program locally you must also add the MySQL JDBC driver dependency:
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
The full pom dependencies mentioned in the preface:
<dependencies>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.1.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>