Local development environment: Windows 10, IDEA 2019.3, Scala 2.11.12, Spark 2.4.0
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <repositories>
        <repository>
            <id>maven-ali</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
                <updatePolicy>always</updatePolicy>
                <checksumPolicy>fail</checksumPolicy>
            </snapshots>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>cloudera.public.repo</id>
            <url>https://repository.cloudera.com/artifactory/public</url>
        </repository>
    </repositories>
    <groupId>org.example</groupId>
    <artifactId>UploadFileScala</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <log4j.version>1.2.17</log4j.version>
        <slf4j.version>1.7.22</slf4j.version>
        <casbah.version>3.1.1</casbah.version>
        <redis.version>2.9.0</redis.version>
        <spark.version>2.4.0</spark.version>
        <jblas.version>1.2.1</jblas.version>
        <pg.version>42.2.5</pg.version>
        <scala.version>2.11.12</scala.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>ru.yandex.clickhouse</groupId>
            <artifactId>clickhouse-jdbc</artifactId>
            <version>0.2</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>jcl-over-slf4j</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>${log4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.postgresql</groupId>
            <artifactId>postgresql</artifactId>
            <version>${pg.version}</version>
        </dependency>
        <dependency>
            <groupId>com.databricks</groupId>
            <artifactId>spark-csv_2.10</artifactId>
            <version>1.5.0</version>
        </dependency>
        <dependency>
            <groupId>net.jpountz.lz4</groupId>
            <artifactId>lz4</artifactId>
            <version>1.3.0</version>
        </dependency>
    </dependencies>
    <build>
        <resources>
            <resource>
                <directory>src/main/scala</directory>
                <includes>
                    <include>**/*.properties</include>
                    <include>**/*.xml</include>
                </includes>
                <filtering>true</filtering>
            </resource>
            <resource>
                <directory>src/main/resources</directory>
            </resource>
            <resource>
                <directory>src/main/resources/lib</directory>
                <targetPath>BOOT-INF/lib/</targetPath>
                <includes>
                    <include>**/*.jar</include>
                </includes>
            </resource>
        </resources>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.8</arg>
                    </args>
                    <jvmArgs>
                        <jvmArg>-Xss4096k</jvmArg>
                    </jvmArgs>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
The CSV data we upload is temporary input for machine learning, so it is loaded into a temporary table:
CREATE DATABASE temp;
CREATE TABLE temp.user_Detail_20200416 (
user_id Int32
,sale_amount Float32
,trans_count Int32
,offline_count Int32
,online_count Int32
,shopping_count Int32
,tuihuo_count Int32
,tuihuo_lv Float32
,apru Float32
,create_day Int32
,is_gravida Int32
,is_dobule_source Int32
,baby_day Int32
,active_code Int32
) ENGINE = MergeTree() ORDER BY (user_id) SETTINGS index_granularity = 8192;
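Before running the Spark job, it can be worth confirming that this table is reachable through the clickhouse-jdbc driver declared in the pom. A minimal sketch, assuming ClickHouse listens on the same cdh2:8123 endpoint used later and the default user has an empty password:
import java.sql.DriverManager

object CheckClickHouseTable {
  def main(args: Array[String]): Unit = {
    // Register the ClickHouse JDBC driver from ru.yandex.clickhouse:clickhouse-jdbc
    Class.forName("ru.yandex.clickhouse.ClickHouseDriver")
    // cdh2:8123 and the empty password are assumptions matching the job below
    val conn = DriverManager.getConnection("jdbc:clickhouse://cdh2:8123/", "default", "")
    try {
      val rs = conn.createStatement().executeQuery("SELECT count() FROM temp.user_Detail_20200416")
      while (rs.next()) println(s"rows in temp.user_Detail_20200416: ${rs.getLong(1)}")
    } finally {
      conn.close()
    }
  }
}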
DBUtils is the main utility class for loading database connection settings:
import java.util.Properties

object DBUtils {
  val DEV = "dev"
  val PROD = "prod"
  var concurrentMode = ""

  private def getDBProperties(mode: String): Properties = {
    val dbProperties = new Properties()
    if (mode == null) {
      throw new IllegalArgumentException("A valid mode argument is required")
    }
    concurrentMode = mode
    // Load the environment-specific properties file from the classpath
    if (DEV.equals(mode)) {
      dbProperties.load(getClass.getClassLoader.getResourceAsStream("dev-db.properties"))
    }
    if (PROD.equals(mode)) {
      dbProperties.load(getClass.getClassLoader.getResourceAsStream("prod-db.properties"))
    }
    dbProperties
  }

  def getDBConfig(mode: String): DBConfig = {
    val properties = getDBProperties(mode)
    val url = properties.getProperty("bi.bigdata.spark.ml.gp.url")
    val driver = properties.getProperty("bi.bigdata.spark.ml.gp.driver")
    val username = properties.getProperty("bi.bigdata.spark.ml.gp.username")
    val password = properties.getProperty("bi.bigdata.spark.ml.gp.password")
    DBConfig(url, driver, username, password)
  }

  // Build the java.util.Properties that spark.read/write.jdbc expects
  def sparkDBProp(mode: String): Properties = {
    val config = getDBConfig(mode)
    val properties = new Properties()
    properties.setProperty("user", config.username)
    properties.setProperty("password", config.password)
    properties.setProperty("url", config.url)
    properties.setProperty("driver", config.driver)
    properties
  }
}
DBConfig is the configuration case class for the database connection utility:
case class DBConfig(url: String, driver: String, username: String, password: String)
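A quick usage sketch of the two helpers, assuming a dev-db.properties file with the bi.bigdata.spark.ml.gp.* keys is on the classpath:
object DBUtilsExample {
  def main(args: Array[String]): Unit = {
    // Reads dev-db.properties from the classpath (assumed to exist)
    val config = DBUtils.getDBConfig(DBUtils.DEV)
    println(s"url=${config.url}, driver=${config.driver}")

    // Properties ready to pass to DataFrameReader/Writer.jdbc(...)
    val props = DBUtils.sparkDBProp(DBUtils.DEV)
    println(props)
  }
}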
localhostToCsv is the main driver class:
import java.util.Properties

import org.apache.spark.SparkConf
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{SaveMode, SparkSession}

object localhostToCsv {
  def main(args: Array[String]): Unit = {
    // Spark configuration for the job
    val sparkConf = new SparkConf()
    sparkConf.set("spark.testing.memory", "2147480000")
    // Build the SparkSession
    val sess = SparkSession.builder().appName("SQLTest").master("local[*]").config(sparkConf).getOrCreate()
    // Path of the CSV file on HDFS
    val csvPath = "/leyou/temp/result1.csv"
    val csvDf = sess.read
      .option("delimiter", ",") // field delimiter, comma by default
      .option("header", "true") // treat the first line as a header, not data
      .option("quote", "'") // quote character, double quote by default
      .option("nullValue", "\\N") // string that represents a null value
      .option("inferSchema", "true") // infer column types automatically
      .schema(ScalaReflection.schemaFor[UserDetail].dataType.asInstanceOf[StructType]) // explicit schema for the CSV columns
      .csv(csvPath)
    val csvCols = csvDf.columns
    //csvDf.show()        // print the first 20 rows of the CSV
    //csvDf.printSchema() // print each column's name and type
    // Register a Spark temporary view
    csvDf.createTempView("csvView")
    // Query the temporary view with Spark SQL
    val frame = sess.sql("select user_id,sale_amount,trans_count,offline_count,online_count,shopping_count,tuihuo_count,tuihuo_lv,apru,create_day,is_gravida,is_dobule_source,baby_day,active_code from csvView")
    // JDBC connection properties for ClickHouse
    val connProperties = new Properties
    connProperties.setProperty("driver", "ru.yandex.clickhouse.ClickHouseDriver")
    connProperties.setProperty("user", "default")
    frame.write.mode(SaveMode.Append).option("batchsize", "100000")
      .jdbc("jdbc:clickhouse://cdh2:8123/", "temp.user_Detail_20200416", connProperties)
    // Stop the SparkSession
    sess.stop()
  }
}
UserDetail is the CSV field-mapping case class, used mainly to define the column types:
case class UserDetail(user_id: Int, sale_amount: Double, trans_count: Int, offline_count: Int, online_count: Int, shopping_count: Int, tuihuo_count: Int, tuihuo_lv: Double, apru: Double, create_day: Int, is_gravida: Int, is_dobule_source: Int, baby_day: Int, active_code: Int)
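To double-check what schema this case class maps to, a small sketch (Encoders.product is an alternative to the ScalaReflection call used in the reader above):
import org.apache.spark.sql.Encoders

object PrintUserDetailSchema {
  def main(args: Array[String]): Unit = {
    // Derive the Spark StructType from the case class and print each field with its type
    val schema = Encoders.product[UserDetail].schema
    schema.printTreeString()
  }
}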
Since the job runs on YARN, the CSV file first has to be uploaded to HDFS (for example with hdfs dfs -put); a sketch using the Hadoop FileSystem API is shown below.
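A minimal upload sketch; the local source path is an assumption, while the HDFS target matches the csvPath used in localhostToCsv:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object UploadCsvToHdfs {
  def main(args: Array[String]): Unit = {
    // Picks up core-site.xml / hdfs-site.xml from the classpath on the cluster
    val fs = FileSystem.get(new Configuration())
    // "/opt/apps/data/result1.csv" is a hypothetical local path for illustration
    fs.copyFromLocalFile(new Path("/opt/apps/data/result1.csv"), new Path("/leyou/temp/result1.csv"))
    fs.close()
  }
}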
spark-submit --class com.leyou.bi.uploadFile.localhostToCsv --master yarn --deploy-mode cluster --executor-memory 4G --num-executors 4 --driver-memory 4G --conf spark.default.parallelism=1000 --conf spark.memory.fraction=0.75 --conf spark.memory.storageFraction=0.5 --conf spark.network.timeout=10000000 /opt/apps/spark/jars/UploadFileCsvScala20200416-0.0.1.jar
After the command finishes, we can open the YARN web UI, find the submitted application, and check whether it ran successfully.