今天头儿临时起意让造一部分数据,其间有几个小问题,在此mark一下
package come.prince.spark.demo
import java.util.UUID
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.codehaus.jettison.json.JSONObject
import scala.io.Source
import scala.util.Random
/**
* 数组中是数字:random.nextInt(max-min+1)+min,例如,取5到10的随机数,random.nextInt(6)+5
* 数组中是字符串:random.nextInt(要取的位数)+起始位置
* Created by prince on 2018/5/29.
*/
case class Guo(name: String, age: Int, sex: String, level: Int, city: String, insurance: String, company: String)
object Cows {
Logger.getLogger("org").setLevel(Level.WARN)
private val sex = Array("男", "女")
private val insurance = Array("人寿险","医疗险","理财险","重疾险","车险","少儿险","意外险","健康险","工程险",
"船舶险","信用险","家财险","农险","货运险")
private val company = Array("人民人寿","太平洋保险","中国人寿","中国平安","新华保险","安邦保险","泰康人寿","中国太平",
"阳光保险", "大地保险","吉祥人寿","华夏人寿","信达产险","中英人寿","华安保险","蓝天保险","刚强保险","安民保险","个体户")
private val name = Source.fromFile("C:\\Users\\Administrator\\Desktop\\name.txt").getLines().toArray
private val city = Source.fromFile("C:\\Users\\Administrator\\Desktop\\city.txt").getLines().toArray
val random = new Random()
def getName: String = {
name(random.nextInt(750))
}
def getCity: String = {
city(random.nextInt(700))
}
def getSex: String = {
sex(random.nextInt(2))
}
def getAge: Int = {
random.nextInt(43)+18
}
def getLevel: Int = {
random.nextInt(20)+1
}
def getInsurance: String = {
insurance(random.nextInt(3)+0) + "," + insurance(random.nextInt(5)+4) + "," + insurance(random.nextInt(4)+10)
}
def getCompany: String = {
company(random.nextInt(19))
}
def main(args: Array[String]): Unit = {
while (true){
println(UUID.randomUUID() + "\t" + getName +"\t"+getAge+"\t"+getSex+"\t"+getLevel+"\t"+getCity+"\t"+getInsurance+"\t"+getCompany)
Thread.sleep(10)
}
// val spark = SparkSession.builder().master("local").getOrCreate()
// import spark.implicits._
//
// while (true){
// val df = Seq(Guo(getName, getAge, getSex, getLevel, getCity, getInsurance, getCompany)).toDF
//
// val prop = new java.util.Properties
// prop.setProperty("user", "root")
// prop.setProperty("password", "123456")
// df.write.mode(SaveMode.Append).jdbc("jdbc:mysql://192.168.1.97:3306/test?characterEncoding=utf8&useSSL=true", "Guo", prop)
//
// Thread.sleep(100)
// }
// spark.stop()
// while (true) {
// val event = new JSONObject()
// event
// .put("name", getName)
// .put("age", getAge)
// .put("sex", getSex)
// .put("level", getLevel)
// .put("city", getCity)
// .put("insurance", getInsurance)
// .put("company", getCompany)
//
// println(event.toString())
//
// Thread.sleep(100)
// }
}
}
刚开始,在写入mysql时遇到下面的问题:
java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:404)
at com.mysql.jdbc.Util.getInstance(Util.java:387)
at com.mysql.jdbc.SQLError.createBatchUpdateException(SQLError.java:1154)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1773)
at com.mysql.jdbc.PreparedStatement.executeBatchInternal(PreparedStatement.java:1257)
at com.mysql.jdbc.StatementImpl.executeBatch(StatementImpl.java:959)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:227)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.sql.SQLException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:957)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3878)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3814)
at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2478)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2625)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2551)
at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1861)
at com.mysql.jdbc.PreparedStatement.executeUpdateInternal(PreparedStatement.java:2073)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1751)
... 15 more
18/05/29 18:53:27 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:404)
at com.mysql.jdbc.Util.getInstance(Util.java:387)
at com.mysql.jdbc.SQLError.createBatchUpdateException(SQLError.java:1154)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1773)
at com.mysql.jdbc.PreparedStatement.executeBatchInternal(PreparedStatement.java:1257)
at com.mysql.jdbc.StatementImpl.executeBatch(StatementImpl.java:959)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:227)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.sql.SQLException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:957)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3878)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3814)
at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2478)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2625)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2551)
at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1861)
at com.mysql.jdbc.PreparedStatement.executeUpdateInternal(PreparedStatement.java:2073)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1751)
... 15 more
18/05/29 18:53:27 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:404)
at com.mysql.jdbc.Util.getInstance(Util.java:387)
at com.mysql.jdbc.SQLError.createBatchUpdateException(SQLError.java:1154)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1773)
at com.mysql.jdbc.PreparedStatement.executeBatchInternal(PreparedStatement.java:1257)
at com.mysql.jdbc.StatementImpl.executeBatch(StatementImpl.java:959)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:227)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.sql.SQLException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:957)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3878)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3814)
at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2478)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2625)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2551)
at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1861)
at com.mysql.jdbc.PreparedStatement.executeUpdateInternal(PreparedStatement.java:2073)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1751)
... 15 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:245)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1899)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1913)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:900)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:900)
at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply$mcV$sp(Dataset.scala:2127)
at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply(Dataset.scala:2127)
at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply(Dataset.scala:2127)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2546)
at org.apache.spark.sql.Dataset.foreachPartition(Dataset.scala:2126)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:299)
at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:441)
at come.prince.spark.demo.Cows$.main(Cows.scala:79)
at come.prince.spark.demo.Cows.main(Cows.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:404)
at com.mysql.jdbc.Util.getInstance(Util.java:387)
at com.mysql.jdbc.SQLError.createBatchUpdateException(SQLError.java:1154)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1773)
at com.mysql.jdbc.PreparedStatement.executeBatchInternal(PreparedStatement.java:1257)
at com.mysql.jdbc.StatementImpl.executeBatch(StatementImpl.java:959)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:227)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.sql.SQLException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:957)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3878)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3814)
at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2478)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2625)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2551)
at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1861)
at com.mysql.jdbc.PreparedStatement.executeUpdateInternal(PreparedStatement.java:2073)
at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1751)
... 15 more
原因是:MySQL 默认编码为 utf-8,占3个字节,一些表情或者非常见字符,比如该例子中 “\xE5\x85\xB3\xE8\x89\xAF” 占6个字节,插入失败。需要将MySQL 数据库中相对应的字段字符集格式改为 utf8mb4 即可。
另外 random 的用法也值得注意,后续会跟进相关的博客,具体分析用法。