Connecting Spark to a MySQL database

Without foreachPartition, the write runs with a parallelism of 1: all of the data is processed in a single partition, so no matter how many resources you allocate, only one task does the work. The efficiency is predictably poor, and on even a moderately large table you will hit an OOM in no time.

That is why foreachPartition is needed: the job then runs as multiple tasks, and each task processes its own partition in its own thread.
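The number of tasks that run the foreachPartition body equals the number of partitions of the RDD, so the partition count is what actually controls how many concurrent writers (and MySQL connections) you get. A minimal sketch, using an illustrative RDD that is not part of the examples below:

val data = sc.parallelize(1 to 100)              // illustrative RDD, for demonstration only
println(data.partitions.length)                  // how many writer tasks foreachPartition would launch
val resized = data.repartition(4)                // e.g. cap the job at 4 concurrent MySQL connections
resized.foreachPartition(it => println(it.size)) // each task sees one partition's iterator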

 

package com.tv.sohu.spark.streaming.dm.webp2p

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.spark.{SparkConf, SparkContext}

object RDDtoMysql {

  case class Blog(name: String, count: Int)

  def myFun(iterator: Iterator[(String, Int)]): Unit = {
    var conn: Connection = null
    var ps: PreparedStatement = null
    val sql = "insert into blog(name, count) values (?, ?)"
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "root", "123456")
      // Prepare the statement once per partition, not once per record
      ps = conn.prepareStatement(sql)
      iterator.foreach(data => {
        ps.setString(1, data._1)
        ps.setInt(2, data._2)
        ps.executeUpdate()
      })
    } catch {
      case e: Exception => println("Mysql Exception: " + e.getMessage)
    } finally {
      if (ps != null) {
        ps.close()
      }
      if (conn != null) {
        conn.close()
      }
    }
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("RDDToMysql").setMaster("local")
    val sc = new SparkContext(conf)
    val data = sc.parallelize(List(("www", 10), ("iteblog", 20), ("com", 30)))
    data.foreachPartition(myFun)
  }

}

 

  In effect, foreachPartition walks over each partition of the RDD and calls an ordinary Scala function to write to the database. Before running the program, make sure the blog table exists in the database; it can be created with the following statement:

CREATE TABLE `blog` (
  `name` varchar(255) NOT NULL,
  `count` int(10) unsigned DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

  Then simply run the code above. Once it finishes, you can query the results in the database:

SELECT * FROM blog b;

www      10
iteblog  20
com      30

Example 1:

package com.tv.sohu.spark

import java.sql.{Connection, DriverManager, PreparedStatement}

import net.sf.json.JSONObject
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object SparkInsertIntoMysql {

  def main(args: Array[String]): Unit = {
//    var idate = args(0)
//    println(idate)
    val conf = new SparkConf()
    conf.setAppName("onlineNoUse").setMaster("local")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("hdfs://user/vetl/rawlog/onlineNoUse.txt")
    // Parse each JSON line into a tab-separated record
    val res: RDD[String] = lines.map(line => {
      val obj: JSONObject = JSONObject.fromObject(line)
      val date = obj.get("date")
      val uid = obj.get("uid")
      val reason = obj.get("reason")
      val sver = obj.get("sver")
      date + "\t" + uid + "\t" + reason + "\t" + sver
    })
    // Open one database connection per partition
    res.foreachPartition(partitionProcess)
  }

  def partitionProcess(partition: Iterator[String]): Unit = {
    var conn: Connection = null
    var ps: PreparedStatement = null
    val sql = "insert into user3_online_no_use(date,uid,reason,sver) values (?,?,?,?)"
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark",
        "user", "password")
      ps = conn.prepareStatement(sql)
      partition.foreach(eachRecordProcess(_, ps))
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      // Close resources exactly once, even when an exception is thrown
      if (ps != null) {
        ps.close()
      }
      if (conn != null) {
        conn.close()
      }
    }
  }

  def eachRecordProcess(record: String, ps: PreparedStatement): Int = {
    val fields = record.split("\t")
    ps.setString(1, fields(0)) // date
    ps.setString(2, fields(1)) // uid
    ps.setString(3, fields(2)) // reason
    ps.setString(4, fields(3)) // sver
    ps.executeUpdate()
  }

}
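Both examples call executeUpdate() once per record, which means one round trip to MySQL per row. A common refinement is to queue rows with addBatch() and flush them with executeBatch(). The sketch below is a batched variant of partitionProcess, not part of the original code; it reuses the table, columns, and credentials from the example above, and the batch size of 500 is an arbitrary choice:

import java.sql.{Connection, DriverManager, PreparedStatement}

// Sketch: batched writes within a partition (assumed batch size of 500)
def partitionProcessBatched(partition: Iterator[String]): Unit = {
  var conn: Connection = null
  var ps: PreparedStatement = null
  try {
    conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "user", "password")
    ps = conn.prepareStatement(
      "insert into user3_online_no_use(date,uid,reason,sver) values (?,?,?,?)")
    var count = 0
    partition.foreach { record =>
      val fields = record.split("\t")
      ps.setString(1, fields(0))
      ps.setString(2, fields(1))
      ps.setString(3, fields(2))
      ps.setString(4, fields(3))
      ps.addBatch()                              // queue the row instead of executing immediately
      count += 1
      if (count % 500 == 0) ps.executeBatch()    // flush every 500 rows
    }
    ps.executeBatch()                            // flush the remaining rows
  } finally {
    if (ps != null) ps.close()
    if (conn != null) conn.close()
  }
}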
