Generating Test Data

Today my boss asked me, on short notice, to generate some test data. A few small issues came up along the way, so I'm noting them down here.

package come.prince.spark.demo

import java.util.UUID
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.codehaus.jettison.json.JSONObject
import scala.io.Source
import scala.util.Random
/**
  * Random number in a range: random.nextInt(max - min + 1) + min, e.g. a random number from 5 to 10 is random.nextInt(6) + 5.
  * Random pick from a slice of an array: array(random.nextInt(sliceLength) + startIndex).
  * Created by prince on 2018/5/29.
  */

case class Guo(name: String, age: Int, sex: String, level: Int, city: String, insurance: String, company: String)
object Cows {

  Logger.getLogger("org").setLevel(Level.WARN)

  private val sex = Array("男", "女")

  private val insurance = Array("人寿险","医疗险","理财险","重疾险","车险","少儿险","意外险","健康险","工程险",
    "船舶险","信用险","家财险","农险","货运险")

  private val company = Array("人民人寿","太平洋保险","中国人寿","中国平安","新华保险","安邦保险","泰康人寿","中国太平",
    "阳光保险", "大地保险","吉祥人寿","华夏人寿","信达产险","中英人寿","华安保险","蓝天保险","刚强保险","安民保险","个体户")

  // local dictionaries of names and cities, one entry per line
  private val name = Source.fromFile("C:\\Users\\Administrator\\Desktop\\name.txt").getLines().toArray
  private val city = Source.fromFile("C:\\Users\\Administrator\\Desktop\\city.txt").getLines().toArray

  val random = new Random()

  def getName: String = {
    // index by the actual number of entries rather than a hard-coded count
    name(random.nextInt(name.length))
  }

  def getCity: String = {
    city(random.nextInt(city.length))
  }

  def getSex: String = {
    sex(random.nextInt(sex.length))
  }

  def getAge: Int = {
    random.nextInt(43) + 18 // random age in [18, 60]
  }

  def getLevel: Int = {
    random.nextInt(20) + 1 // random level in [1, 20]
  }

  def getInsurance: String = {
    // pick one product from each of three slices of the insurance array
    insurance(random.nextInt(3)) + "," + insurance(random.nextInt(5) + 4) + "," + insurance(random.nextInt(4) + 10)
  }

  def getCompany: String = {
    company(random.nextInt(company.length))
  }

  def main(args: Array[String]): Unit = {

    // continuously print tab-separated records, one every 10 ms
    while (true) {
      println(UUID.randomUUID() + "\t" + getName + "\t" + getAge + "\t" + getSex + "\t" + getLevel + "\t" + getCity + "\t" + getInsurance + "\t" + getCompany)
      Thread.sleep(10)
    }

//    Alternative 1: write each record to MySQL through Spark (this is the write that hit the encoding error below)
//    val spark = SparkSession.builder().master("local").getOrCreate()
//    import spark.implicits._
//
//    while (true){
//      val df = Seq(Guo(getName, getAge, getSex, getLevel, getCity, getInsurance, getCompany)).toDF
//
//      val prop = new java.util.Properties
//      prop.setProperty("user", "root")
//      prop.setProperty("password", "123456")
//      df.write.mode(SaveMode.Append).jdbc("jdbc:mysql://192.168.1.97:3306/test?characterEncoding=utf8&useSSL=true", "Guo", prop)
//
//      Thread.sleep(100)
//    }
//    spark.stop()


//    Alternative 2: emit each record as a JSON string
//    while (true) {
//      val event = new JSONObject()
//      event
//        .put("name", getName)
//        .put("age", getAge)
//        .put("sex", getSex)
//        .put("level", getLevel)
//        .put("city", getCity)
//        .put("insurance", getInsurance)
//        .put("company", getCompany)
//
//      println(event.toString())
//
//      Thread.sleep(100)
//    }
  }
}

At first, the write to MySQL failed with the following error:

java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
    at com.mysql.jdbc.Util.handleNewInstance(Util.java:404)
    at com.mysql.jdbc.Util.getInstance(Util.java:387)
    at com.mysql.jdbc.SQLError.createBatchUpdateException(SQLError.java:1154)
    at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1773)
    at com.mysql.jdbc.PreparedStatement.executeBatchInternal(PreparedStatement.java:1257)
    at com.mysql.jdbc.StatementImpl.executeBatch(StatementImpl.java:959)
    at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:227)
    at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:300)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:299)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:902)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.sql.SQLException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
    at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:957)
    at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3878)
    at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3814)
    at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2478)
    at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2625)
    at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2551)
    at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:1861)
    at com.mysql.jdbc.PreparedStatement.executeUpdateInternal(PreparedStatement.java:2073)
    at com.mysql.jdbc.PreparedStatement.executeBatchSerially(PreparedStatement.java:1751)
    ... 15 more
18/05/29 18:53:27 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
    ... (same stack trace as above)

18/05/29 18:53:27 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
    ... (same stack trace as above)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at scala.Option.foreach(Option.scala:245)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1899)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1913)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:902)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:900)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
    at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:900)
    at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply$mcV$sp(Dataset.scala:2127)
	at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply(Dataset.scala:2127)
    at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply(Dataset.scala:2127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2546)
	at org.apache.spark.sql.Dataset.foreachPartition(Dataset.scala:2126)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:299)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:441)
	at come.prince.spark.demo.Cows$.main(Cows.scala:79)
	at come.prince.spark.demo.Cows.main(Cows.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.sql.BatchUpdateException: Incorrect string value: '\xE5\x85\xB3\xE8\x89\xAF' for column 'name' at row 1
    ... (same stack trace as above)

The cause: the character set of the target column could not store these bytes. '\xE5\x85\xB3\xE8\x89\xAF' is the UTF-8 encoding of two Chinese characters (3 bytes each), and MySQL reports "Incorrect string value" whenever the column's character set cannot represent the incoming value, e.g. latin1, or utf8 (utf8mb3, at most 3 bytes per character) when the value contains 4-byte characters such as emoji. Changing the character set of the corresponding table/columns in MySQL to utf8mb4, which covers the full 4-byte range of UTF-8, fixed the insert.
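For reference, here is a minimal sketch of applying that fix over JDBC. The host, database, table name (Guo) and credentials are simply copied from the commented-out Spark write above, and utf8mb4_unicode_ci is just one common collation choice, so treat them as assumptions and adjust to your environment; the same ALTER TABLE can also be run from any MySQL client.

import java.sql.DriverManager

object FixCharset {
  def main(args: Array[String]): Unit = {
    // connection details copied from the commented-out Spark JDBC write above; adjust to your environment
    val url = "jdbc:mysql://192.168.1.97:3306/test?characterEncoding=utf8&useSSL=true"
    val conn = DriverManager.getConnection(url, "root", "123456")
    try {
      val stmt = conn.createStatement()
      // convert the table and its existing string columns to utf8mb4
      stmt.execute("ALTER TABLE Guo CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
      stmt.close()
    } finally {
      conn.close()
    }
  }
}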

The usage of Random is also worth a closer look; I'll follow up with a separate post that analyzes it in detail.
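Until then, a quick self-contained sketch of the range formula from the header comment (plain scala.util.Random, nothing project-specific):

import scala.util.Random

object RandomRangeDemo {
  private val random = new Random()

  // random integer in [min, max]: nextInt(max - min + 1) + min
  def between(min: Int, max: Int): Int = random.nextInt(max - min + 1) + min

  def main(args: Array[String]): Unit = {
    // e.g. a random number from 5 to 10 is random.nextInt(6) + 5
    println(between(5, 10))
    // the age and level generators above follow the same pattern: [18, 60] and [1, 20]
    println(between(18, 60))
    println(between(1, 20))
  }
}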
