07. Splitting a file by month: 库存流水.csv (inventory flow)

1. Code:

package com.lifecycle.tools

import java.io.InputStream
import java.util.Properties

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}

object SplitFileStore {
    def main(args: Array[String]): Unit = {
        //  1. Spark session (entry point)
        val spark = SparkSession.builder()
            //  event logging (used by the history server) is disabled here
            .config("spark.eventLog.enabled", "false")
            //  driver process memory (only takes effect if set before the driver JVM starts, e.g. via spark-submit)
            .config("spark.driver.memory", "2g")
            //  number of shuffle partitions for Spark SQL
            .config("spark.sql.shuffle.partitions", "200")
            .appName("SparkDemoFromS3")
            .getOrCreate()
        //  2. Log level
        spark.sparkContext.setLogLevel("WARN")
        //  3. Read the resource file holding the S3 credentials
        val properties = new Properties()
        val stream: InputStream = SplitFileStore.getClass.getClassLoader.getResourceAsStream("s3.properties")
        properties.load(stream)
        //  4. Configure the S3 data source
        val sc: SparkContext = spark.sparkContext
        //  shuffle partition count (duplicates the builder config above)
        spark.sqlContext.setConf("spark.sql.shuffle.partitions", "200")
        sc.hadoopConfiguration.set("fs.s3a.access.key", properties.getProperty("fs.s3a.access.key"))
        sc.hadoopConfiguration.set("fs.s3a.secret.key", properties.getProperty("fs.s3a.secret.key"))
        sc.hadoopConfiguration.set("fs.s3a.endpoint", properties.getProperty("fs.s3a.endpoint"))
        //  5. Read the CSV file (no implicit conversions are needed for this job)
        //  Columns: region(区域)  store code(门店代码)  sales month(销售月份)  sales date(销售日期)  style no.(款号)  color(颜色)  stock change(库存变动量)  reason(变动原因)  store type(店铺类型)
        //  鲁豫区	2FJN1R1371	201904	20190418	21081191141	9000	-2	零售	加盟
        //  鲁豫区	2FJN1R1371	201905	20190515	21132191105	9000	-1	零售	加盟
        val dfYuan: DataFrame = spark.read.option("header","true").option("delimiter",",").csv("s3a://lifecyclebigdata/dataWareHouse/BALABALA/01history/2019c/df_库存流水19sum.csv")
        //  6. Rename the columns
        val dfTable: DataFrame = dfYuan.toDF("area","dianCode","month","saledate","typeid","colorid","storedefernum","reason","diantype")
        dfTable.createOrReplaceTempView("store")
        //  ym duplicates month as the partition column; partitionBy drops ym from the file contents while month stays in the data
        val dfRes: DataFrame = spark.sql("select *, month ym from store")
        //  7. Write the output, partitioned by month (ym)
        dfRes.write.partitionBy("ym").option("header","true").option("delimiter",",").csv("s3a://lifecyclebigdata/dataWareHouse/BALABALA/02pdw/store/res")
        //  8. Release resources
        spark.close()
    }
}
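
2. Notes:

The job expects an s3.properties file on the classpath providing fs.s3a.access.key, fs.s3a.secret.key and fs.s3a.endpoint, matching the keys read above. Because the result is written with partitionBy("ym"), each month lands under its own ym=<value>/ subdirectory of the output path, and the ym column is encoded in the directory name rather than in the CSV files themselves. The sketch below is an illustration, not part of the original job: run in an environment with a SparkSession named spark (e.g. spark-shell), it reads one month's partition back to spot-check the split; the basePath option restores ym to the schema, and the partition value 201904 is taken from the sample rows above.

//  read a single partition of the split output to verify it
val dfCheck = spark.read
    .option("header", "true")
    .option("basePath", "s3a://lifecyclebigdata/dataWareHouse/BALABALA/02pdw/store/res")
    .csv("s3a://lifecyclebigdata/dataWareHouse/BALABALA/02pdw/store/res/ym=201904")
dfCheck.show(5)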
