1. A more convenient deployment configuration
Local environment:
Spark: 2.4.3 (spark-2.4.3-bin-hadoop2.7)
Scala: 2.11.8
CarbonData: 2.0.1 (apache-carbondata-2.0.1-bin-spark2.4.5-hadoop2.7.2)
Hadoop: 2.7
Hive: 2.4.3
You only need to add a single conf; configuring CarbonData's store path is no longer required:
spark-shell --conf spark.sql.extensions=org.apache.spark.sql.CarbonExtensions --jars /Users/hulb/opt/third/spark-2.4.3-bin-hadoop2.7/jars/apache-carbondata-2.0.1-bin-spark2.4.5-hadoop2.7.2.jar
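If you prefer to set this in code rather than on the command line, a minimal sketch (assuming the carbondata jar is already on the classpath; the app name is illustrative) looks like this:

import org.apache.spark.sql.SparkSession

// Registering CarbonExtensions is the only Carbon-specific setting needed;
// no store path configuration is required anymore.
val spark = SparkSession.builder()
  .appName("carbon-extensions-demo")
  .config("spark.sql.extensions", "org.apache.spark.sql.CarbonExtensions")
  .getOrCreate()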
2. After startup, use the spark session directly; there is no need to create a separate CarbonSession
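For comparison, CarbonData 1.x required building a dedicated CarbonSession bound to an explicit store path, roughly like this sketch of the old API (the store path here is illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.CarbonSession._

// Old-style entry point: a separate CarbonSession with an explicit store path.
val carbon = SparkSession.builder()
  .appName("carbon-session-demo")
  .getOrCreateCarbonSession("hdfs://localhost:9000/user/carbon.store")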
3. Usage
scala> spark.sql("create database carbon201test")
res5: org.apache.spark.sql.DataFrame = []
scala> spark.sql("use carbon201test")
res6: org.apache.spark.sql.DataFrame = []
scala> spark.sql(
| s"""
| | CREATE TABLE IF NOT EXISTS carbon201_test_table(
| | id string,
| | name string,
| | city string,
| | age Int)
| | STORED AS carbondata
| """.stripMargin)
2020-06-04 14:04:25 AUDIT audit:74 - {"time":"2020年6月4日 下午02时04分25秒","username":"hulb","opName":"CREATE TABLE","opId":"82805921821073","opStatus":"START"}
2020-06-04 14:04:25 AUDIT audit:97 - {"time":"2020年6月4日 下午02时04分25秒","username":"hulb","opName":"CREATE TABLE","opId":"82805921821073","opStatus":"SUCCESS","opTime":"108 ms","table":"NA","extraInfo":{}}
res7: org.apache.spark.sql.DataFrame = []
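At this point you can confirm the table really is a Carbon table through the plain spark session; a hypothetical check, not part of the original session:

// "describe formatted" should report carbondata as the table provider.
spark.sql("describe formatted carbon201_test_table").show(50, false)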
scala> spark.sql("LOAD DATA INPATH '/user/hulb/test/sample.csv' INTO TABLE carbon201_test_table")
2020-06-04 14:04:37 AUDIT audit:74 - {"time":"2020年6月4日 下午02时04分37秒","username":"hulb","opName":"LOAD DATA","opId":"82817994090824","opStatus":"START"}
2020-06-04 14:04:38 ERROR AbstractDFSCarbonFile:453 - Exception occured: File hdfs://localhost:9000/user/hive/warehouse/carbon201test.db/_system does not exist.
java.io.FileNotFoundException: File hdfs://localhost:9000/user/hive/warehouse/carbon201test.db/_system does not exist.
at org.apache.hadoop.hdfs.DistributedFileSystem.listStatusInternal(DistributedFileSystem.java:795)
at org.apache.hadoop.hdfs.DistributedFileSystem.access$700(DistributedFileSystem.java:106)
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:853)
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:849)
at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at org.apache.hadoop.hdfs.DistributedFileSystem.listStatus(DistributedFileSystem.java:860)
at org.apache.carbondata.core.datastore.filesystem.AbstractDFSCarbonFile.listFiles(AbstractDFSCarbonFile.java:451)
at org.apache.carbondata.core.datastore.filesystem.AbstractDFSCarbonFile.listFiles(AbstractDFSCarbonFile.java:554)
at org.apache.carbondata.core.view.MVProvider$SchemaProvider.retrieveAllSchemasInternal(MVProvider.java:481)
at org.apache.carbondata.core.view.MVProvider$SchemaProvider.checkAndReloadSchemas(MVProvider.java:536)
at org.apache.carbondata.core.view.MVProvider$SchemaProvider.retrieveAllSchemas(MVProvider.java:472)
at org.apache.carbondata.core.view.MVProvider.getSchemas(MVProvider.java:127)
at org.apache.carbondata.core.view.MVManager.getSchemas(MVManager.java:126)
at org.apache.carbondata.core.view.MVManager.getSchemas(MVManager.java:117)
at org.apache.carbondata.core.view.MVManager.getSchemasOnTable(MVManager.java:85)
at org.apache.carbondata.spark.rdd.CarbonDataRDDFactory$.updateTableStatus(CarbonDataRDDFactory.scala:1038)
at org.apache.carbondata.spark.rdd.CarbonDataRDDFactory$.loadCarbonData(CarbonDataRDDFactory.scala:613)
at org.apache.spark.sql.execution.command.management.CarbonLoadDataCommand.loadData(CarbonLoadDataCommand.scala:207)
at org.apache.spark.sql.execution.command.management.CarbonLoadDataCommand.processData(CarbonLoadDataCommand.scala:168)
at org.apache.spark.sql.execution.command.AtomicRunnableCommand$$anonfun$run$3.apply(package.scala:148)
at org.apache.spark.sql.execution.command.AtomicRunnableCommand$$anonfun$run$3.apply(package.scala:145)
at org.apache.spark.sql.execution.command.Auditable$class.runWithAudit(package.scala:104)
at org.apache.spark.sql.execution.command.AtomicRunnableCommand.runWithAudit(package.scala:141)
at org.apache.spark.sql.execution.command.AtomicRunnableCommand.run(package.scala:145)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:79)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:194)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:194)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:194)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:79)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
at $line24.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:24)
at $line24.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:29)
at $line24.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:31)
at $line24.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:33)
at $line24.$read$$iw$$iw$$iw$$iw.<init>(<console>:35)
at $line24.$read$$iw$$iw$$iw.<init>(<console>:37)
at $line24.$read$$iw$$iw.<init>(<console>:39)
at $line24.$read$$iw.<init>(<console>:41)
at $line24.$read.<init>(<console>:43)
at $line24.$read$.<init>(<console>:47)
at $line24.$read$.<clinit>(<console>)
at $line24.$eval$.$print$lzycompute(<console>:7)
at $line24.$eval$.$print(<console>:6)
at $line24.$eval.$print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:793)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1054)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:645)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:644)
at scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
at scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:644)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:576)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:572)
at scala.tools.nsc.interpreter.ILoop.interpretStartingWith(ILoop.scala:819)
at scala.tools.nsc.interpreter.ILoop.command(ILoop.scala:691)
at scala.tools.nsc.interpreter.ILoop.processLine(ILoop.scala:404)
at scala.tools.nsc.interpreter.ILoop.loop(ILoop.scala:425)
at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply$mcZ$sp(SparkILoop.scala:285)
at org.apache.spark.repl.SparkILoop.runClosure(SparkILoop.scala:159)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:182)
at org.apache.spark.repl.Main$.doMain(Main.scala:78)
at org.apache.spark.repl.Main$.main(Main.scala:58)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:849)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:167)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:195)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:924)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:933)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
2020-06-04 14:04:38 AUDIT audit:97 - {"time":"2020年6月4日 下午02时04分38秒","username":"hulb","opName":"LOAD DATA","opId":"82817994090824","opStatus":"SUCCESS","opTime":"492 ms","table":"carbon201test.carbon201_test_table","extraInfo":{"SegmentId":"0","DataSize":"1.11KB","IndexSize":"663.0B"}}
res8: org.apache.spark.sql.DataFrame = []

Note that although an ERROR and a long stack trace are printed, the audit log still reports SUCCESS: the FileNotFoundException comes from the materialized-view schema check (MVProvider) looking for a _system directory that has not been created yet under the new database, and it does not affect the load itself.
scala> spark.sql("LOAD DATA INPATH '/user/hulb/test/sample.csv' INTO TABLE carbon201_test_table")
2020-06-04 14:04:58 AUDIT audit:74 - {"time":"2020年6月4日 下午02时04分58秒","username":"hulb","opName":"LOAD DATA","opId":"82838494676198","opStatus":"START"}
2020-06-04 14:04:59 AUDIT audit:97 - {"time":"2020年6月4日 下午02时04分59秒","username":"hulb","opName":"LOAD DATA","opId":"82838494676198","opStatus":"SUCCESS","opTime":"829 ms","table":"carbon201test.carbon201_test_table","extraInfo":{"SegmentId":"1","DataSize":"1.11KB","IndexSize":"663.0B"}}
res9: org.apache.spark.sql.DataFrame = []
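Both loads succeeded, producing segments 0 and 1, so a simple query through the plain spark session should now see both copies of sample.csv; a hypothetical verification:

// Each LOAD DATA created one segment; the count covers both.
spark.sql("select count(*) from carbon201_test_table").show()
spark.sql("select * from carbon201_test_table limit 5").show()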