Setting the read parallelism of spark.read.jdbc

1. Read parallelism is set through predicates: each element of the array becomes the WHERE clause of one partition, so the partition count equals the array length. If you instead call the plain overload spark.read.jdbc(mySqlHelper.url, mysql_table, mySqlHelper.prop) without predicates, the parallelism is 1, as f2 in the code below shows.

    val ip = ""
    val user = ""
    val database = ""
    val password = ""
    val mySqlHelper = MySqlHelper(ip,database,user,password)
    def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").enableHiveSupport().getOrCreate()
     
        val mysql_table = ""

        def f1(): Unit ={
            val arr = ArrayBuffer[Int]()
            for(i <- 0 until 100){
                arr.append(i)
            }
            val predicates =arr.map(i=>{s"SHA1(fieldName)%100 = $i"}).toArray
            val starttime = System.currentTimeMillis()
            val a = spark.read.jdbc(mySqlHelper.url,mysql_table,predicates,mySqlHelper.prop)
            println(a.rdd.getNumPartitions)
            println(a.count())
            //a.show(false)
            val endtime = System.currentTimeMillis()
            println(endtime-starttime)
        }

        def f2(): Unit ={
            val starttime = System.currentTimeMillis()
            val a = spark.read.jdbc(mySqlHelper.url,mysql_table,mySqlHelper.prop)
            println(a.rdd.getNumPartitions)
            a.show(false)
            //        println(a.count())
            val endtime = System.currentTimeMillis()
            println(endtime-starttime) //
        }
        spark.stop()
    }
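The post never shows MySqlHelper. A minimal sketch of what it might look like, assuming it only needs to expose the JDBC url and the java.util.Properties that the calls above rely on (the constructor order matches MySqlHelper(ip, database, user, password); the port and driver class are assumptions):

    import java.util.Properties

    // Hypothetical helper matching the usage above (mySqlHelper.url / mySqlHelper.prop);
    // the original post does not show its definition.
    case class MySqlHelper(ip: String, database: String, user: String, password: String) {
        // Assumes MySQL's default port 3306 and the classic MySQL driver class.
        val url: String = s"jdbc:mysql://$ip:3306/$database?useSSL=false"
        val prop: Properties = {
            val p = new Properties()
            p.setProperty("user", user)
            p.setProperty("password", password)
            p.setProperty("driver", "com.mysql.jdbc.Driver")
            p
        }
    }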

2. Test results:

   At around one million rows the two approaches show no noticeable difference in read speed; at the tens-of-millions scale, f1 is dramatically faster, since 100 connections each fetch roughly 1% of the table instead of one connection fetching everything.
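For completeness, predicates is not the only way to get a partitioned JDBC read: Spark's DataFrameReader also has an overload that range-partitions a column into numPartitions slices. A sketch reusing the names above; the column "id" and the bounds are assumptions you would replace with real values for your table:

    // Range partitioning: Spark generates numPartitions WHERE clauses that split
    // [lowerBound, upperBound] into equal strides on the given column.
    // The bounds only control the stride; rows outside them are still read.
    val df = spark.read.jdbc(
        mySqlHelper.url,
        mysql_table,
        "id",        // partition column (numeric, date, or timestamp) -- assumed name
        0L,          // lowerBound
        10000000L,   // upperBound
        100,         // numPartitions
        mySqlHelper.prop
    )
    println(df.rdd.getNumPartitions) // 100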
