Without foreachPartition, the operation runs with a parallelism of 1: all of your data is processed in a single partition, which means that no matter how many resources you allocate, only one task executes the job. The efficiency is predictably poor, and on even a moderately large table it will run out of memory in no time.
That is why foreachPartition is needed: it gives you multiple tasks, and each task processes its own partition in its own thread.
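For contrast, a driver-side version of this write (a minimal sketch, reusing the same blog table and connection settings as the example below) pulls every record onto one JVM and pushes it through a single connection:

import java.sql.DriverManager
import org.apache.spark.rdd.RDD

// Anti-pattern sketch: collect everything to the driver and write over one connection.
// One JVM and one connection do all the work, and collect() itself risks OOM on large data.
def writeOnDriver(rdd: RDD[(String, Int)]): Unit = {
  val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "root", "123456")
  val ps = conn.prepareStatement("insert into blog(name, count) values (?, ?)")
  try {
    rdd.collect().foreach { case (name, count) =>
      ps.setString(1, name)
      ps.setInt(2, count)
      ps.executeUpdate()
    }
  } finally {
    ps.close()
    conn.close()
  }
}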
package com.tv.sohu.spark.streaming.dm.webp2p

import java.sql.{DriverManager, PreparedStatement, Connection}
import org.apache.spark.{SparkContext, SparkConf}

object RDDtoMysql {

  case class Blog(name: String, count: Int)

  def myFun(iterator: Iterator[(String, Int)]): Unit = {
    var conn: Connection = null
    var ps: PreparedStatement = null
    val sql = "insert into blog(name, count) values (?, ?)"
    try {
      // One connection per partition, opened on the executor that runs this task
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "root", "123456")
      // Reuse a single PreparedStatement for all records in this partition
      ps = conn.prepareStatement(sql)
      iterator.foreach(data => {
        ps.setString(1, data._1)
        ps.setInt(2, data._2)
        ps.executeUpdate()
      })
    } catch {
      case e: Exception => println("Mysql Exception: " + e.getMessage)
    } finally {
      if (ps != null) {
        ps.close()
      }
      if (conn != null) {
        conn.close()
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RDDToMysql").setMaster("local")
    val sc = new SparkContext(conf)
    val data = sc.parallelize(List(("www", 10), ("iteblog", 20), ("com", 30)))
    // Each partition is handled by one task, which writes its own records
    data.foreachPartition(myFun)
    sc.stop()
  }
}
The code simply uses foreachPartition to iterate over each partition of the RDD and calls an ordinary Scala method to write to the database. Before running the program, make sure the blog table exists in the database; it can be created with a statement like the one below:
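A minimal schema that matches the insert statement above (the column types are an assumption):

CREATE TABLE blog (
  name  VARCHAR(255),
  count INT
);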
Then just run the code above. Once it finishes, you can query the results in the database:
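For example (the expected rows follow from the sample data in main; the exact output formatting depends on your client):

select name, count from blog;
-- www      10
-- iteblog  20
-- com      30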
Example 1:
package com.tv.sohu.spark

import java.sql.{Connection, DriverManager, PreparedStatement}
import net.sf.json.JSONObject
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object SparkInsertIntoMysql {

  def main(args: Array[String]): Unit = {
    // var idate = args(0)
    // println(idate)
    val conf = new SparkConf()
    conf.setAppName("onlineNoUse").setMaster("local")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("hdfs://user/vetl/rawlog/onlineNoUse.txt")
    // Parse each JSON line and keep the four fields as a tab-separated record
    val res: RDD[String] = lines.map(line => {
      val obj: JSONObject = JSONObject.fromObject(line)
      val date = obj.get("date")
      val uid = obj.get("uid")
      val reason = obj.get("reason")
      val sver = obj.get("sver")
      date + "\t" + uid + "\t" + reason + "\t" + sver
    })
    // One database connection per partition
    res.foreachPartition(partitionProcess)
  }

  def partitionProcess(partition: Iterator[String]): Unit = {
    var conn: Connection = null
    var ps: PreparedStatement = null
    val sql = "insert into user3_online_no_use(date, uid, reason, sver) values (?, ?, ?, ?)"
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "user", "password")
      // Reuse one PreparedStatement for every record in the partition
      ps = conn.prepareStatement(sql)
      partition.foreach(eachRecordProcess(_, ps))
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (ps != null) {
        ps.close()
      }
      if (conn != null) {
        conn.close()
      }
    }
  }

  def eachRecordProcess(record: String, ps: PreparedStatement): Int = {
    val fields = record.split("\t")
    ps.setString(1, fields(0)) // date
    ps.setString(2, fields(1)) // uid
    ps.setString(3, fields(2)) // reason
    ps.setString(4, fields(3)) // sver
    ps.executeUpdate()
  }
}
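Both examples call executeUpdate once per record. For larger partitions it is usually worth batching the JDBC writes instead; here is a sketch of partitionProcess rewritten with addBatch/executeBatch (same table and connection settings as above; the batch size of 1000 is an arbitrary choice):

// Uses java.sql.{DriverManager, PreparedStatement} as imported above.
def partitionProcessBatched(partition: Iterator[String]): Unit = {
  val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "user", "password")
  val ps = conn.prepareStatement(
    "insert into user3_online_no_use(date, uid, reason, sver) values (?, ?, ?, ?)")
  try {
    var pending = 0
    partition.foreach { record =>
      val fields = record.split("\t")
      ps.setString(1, fields(0)) // date
      ps.setString(2, fields(1)) // uid
      ps.setString(3, fields(2)) // reason
      ps.setString(4, fields(3)) // sver
      ps.addBatch()
      pending += 1
      if (pending >= 1000) {     // flush every 1000 rows
        ps.executeBatch()
        pending = 0
      }
    }
    if (pending > 0) ps.executeBatch() // flush the remainder
  } finally {
    ps.close()
    conn.close()
  }
}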