Continuing from the previous post, the requirement is the same; the only change is that the local input file path is replaced with an AWS S3 path. The implementation is as follows:
1. The pom file:
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
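Besides spark-core and spark-sql (the run log below shows Spark 2.4.3), the build needs hadoop-aws on the classpath, which provides org.apache.hadoop.fs.s3a.S3AFileSystem, and the MySQL JDBC driver used for the final write. A minimal sketch of that dependency block is shown here; apart from the Spark version, the coordinates and versions are assumptions and must match your own environment (in particular, hadoop-aws has to match the hadoop-common version on your classpath).
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>
        <!-- provides the s3a:// filesystem (S3AFileSystem); version is an assumption -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-aws</artifactId>
            <version>2.8.5</version>
        </dependency>
        <!-- JDBC driver for the final write to MySQL; 5.1.x matches com.mysql.jdbc.Driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>
    </dependencies>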
-------------------------
2. The code:
package org.example.JavaDemo;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Properties;
public class SparkSqlCsvToCsv {
public static void main(String[] args) {
/**
 * China-region S3 endpoints look like s3.cn-north-1.amazonaws.com.cn
 * (Ningxia: cn-northwest-1, Beijing: cn-north-1)
 */
System.out.println("=========0000=========");
String hdfsInAddress = "s3a://emr-demo-input/mydata/"; // earlier variants: "hdfs://192.168.209.129:9000/" (HDFS server), D:\DevTemp\AWS\ (local dir), s3://emr-demo-input/mydata/
String inputAddress = "";//"in/";
String csvFileName="emr-demo-data.csv";
System.out.println("======111============");
SparkConf conf = new SparkConf().setMaster("local").setAppName("TestSpark");
System.out.println("=========222=========");
/*
* Properties properties = new Properties(); InputStream inputStream =
* Object.class.getResourceAsStream("/s3.properties");
* properties.load(inputStream);
*/
System.out.println("=========333=========");
JavaSparkContext sc = new JavaSparkContext(conf); // JavaSparkContext is dated; SparkSession is the preferred entry point in Spark 2.x
// SparkContext sc = new SparkContext(conf);
System.out.println("=========444-1=========");
/*
* sc.hadoopConfiguration().set("fs.s3a.access.key",properties.getProperty(
* "fs.s3a.access.key"));
* sc.hadoopConfiguration().set("fs.s3a.secret.key",properties.getProperty(
* "fs.s3a.secret.key"));
* sc.hadoopConfiguration().set("fs.s3a.endpoint",properties.getProperty(
* "fs.s3a.endpoint"));//spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
*/
/* spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key=ACCESSKEY
spark.hadoop.fs.s3a.secret.key=SECRETKEY
*/
//sc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
sc.hadoopConfiguration().set("fs.s3a.access.key","AKIA2CIDQ6XXXXXXXXX");
sc.hadoopConfiguration().set("fs.s3a.secret.key","VR1spXe+Jb5pK4m1gKcBFXXXXXXXXXXXX");
sc.hadoopConfiguration().set("fs.s3a.endpoint","s3.cn-northwest-1.amazonaws.com.cn"); // using the Ningxia (cn-northwest-1) endpoint here
System.out.println("=========444=========");
SQLContext sqlContext = new SQLContext(sc);
System.out.println("=========555=========");
HashMap<String, String> options = new HashMap<String, String>();
options.put("header", "true"); // treat the first row as the header
options.put("inferSchema", "true"); // let Spark infer the column types
//options.put("path", hdfsInAddress + inputAddress + filePath);
options.put("path", hdfsInAddress + inputAddress + csvFileName);
options.put("dateFormat", "yyyy-MM-dd"); // note lowercase yyyy-MM-dd; YYYY/DD mean week-year/day-of-year
System.out.println("Input file path on S3: " + hdfsInAddress + inputAddress + csvFileName);
System.out.println("=========666=========");
/**** declare the schema (field types) ****/
StructField structFields[] = new StructField[9];
structFields[0] = DataTypes.createStructField("Tier", DataTypes.StringType,true);
structFields[1] = DataTypes.createStructField("SellerCode",DataTypes.StringType,true);
structFields[2] = DataTypes.createStructField("SellerName",DataTypes.StringType,true);
structFields[3] = DataTypes.createStructField("DataSource",DataTypes.StringType,true);
structFields[4] = DataTypes.createStructField("SellerProvince",DataTypes.StringType,true);
structFields[5] = DataTypes.createStructField("_201901",DataTypes.DoubleType,true);
structFields[6] = DataTypes.createStructField("_201902",DataTypes.DoubleType,true);
structFields[7] = DataTypes.createStructField("_201903",DataTypes.DoubleType,true);
structFields[8] = DataTypes.createStructField("flag",DataTypes.StringType,true);
StructType structType = new StructType(structFields);
System.out.println("=========777=========");
Dataset dataFrame = sqlContext.load("com.databricks.spark.csv", structType, options);
System.out.println("=========8888=========");
// DataFrame cars = (new CsvParser()).withUseHeader(true).csvFile(sqlContext, "cars.csv"); // alternative: read the CSV through the CsvParser helper
dataFrame.registerTempTable("result"); // deprecated in Spark 2.x; createOrReplaceTempView("result") is the current equivalent
System.out.println("=========9999=========");
StringBuffer sparkSql = new StringBuffer("select ");
sparkSql.append("Tier");
sparkSql.append(", SellerCode");
sparkSql.append(", SellerName");
sparkSql.append(", DataSource");
sparkSql.append(", SellerProvince");
sparkSql.append(", _201901");
sparkSql.append(", _201902");
sparkSql.append(", _201903");
sparkSql.append(", if(_201903>_201902,'up','down') as flag");
sparkSql.append(" from result");
Dataset resultFrame=sqlContext.sql(sparkSql.toString() );
//resultFrame.createOrReplaceTempView("resultView"); // create a temp view
//System.out.println("*************** print peopleScore via Dataset ********" + resultFrame.limit(10).showString(20, 0, false));
System.out.println("******print schema *******");
resultFrame.printSchema();
System.out.println("*************");
//resultFrame.select("SellerName").show();
System.out.println("*************");
//Tier SellerCode SellerName DataSource SellerProvince _201901 _201902 _201903
Dataset df = resultFrame.select(
resultFrame.col("Tier"),
resultFrame.col("SellerCode"),
resultFrame.col("SellerName"),
resultFrame.col("DataSource"),
resultFrame.col("SellerProvince"),
resultFrame.col("_201901"),
resultFrame.col("_201902"),
resultFrame.col("_201903"),
resultFrame.col("flag")
);
df = df.filter(df.col("Tier").contains("T")); // where condition; equalTo() etc. also available
//df = df.filter((df.col("_201902").cast(DataTypes.FloatType)).gt(df.col("_201901").cast(DataTypes.FloatType))); // gt = greater than
//df = df.orderBy(df.col("_201902").cast(DataTypes.FloatType).asc_nulls_first()); // cast the type and sort ascending
//df.groupBy("age").count(); // group by
System.out.println("******df.show() print schema *******");
df.show();
System.out.println("******df.show() print schema *******");
/************* write the result to a MySQL database ******************/
// database connection settings
String url = "jdbc:mysql://127.0.0.1:3306/hive?useUnicode=true&characterEncoding=utf-8";
Properties connectionProperties = new Properties();
connectionProperties.put("user","root");
connectionProperties.put("password","123456");
connectionProperties.put("driver","com.mysql.jdbc.Driver");
/** insert into the database table **/
df.write().mode(SaveMode.Overwrite).jdbc(url, "t_result", connectionProperties); // Overwrite replaces both the data and the table schema
sc.stop();
}
}
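The access key, secret key and endpoint are hard-coded above only for demonstration; the commented-out block in main() already hints at loading them from an s3.properties file instead. A minimal sketch of that variant is shown here (the helper class name is illustrative, and a /s3.properties resource with fs.s3a.access.key, fs.s3a.secret.key and fs.s3a.endpoint entries is assumed to sit under src/main/resources):
package org.example.JavaDemo;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
import org.apache.spark.api.java.JavaSparkContext;
// Sketch: configure s3a from a classpath properties file instead of hard-coding credentials.
public class S3ConfigHelper {
    public static void applyS3Config(JavaSparkContext sc) {
        Properties props = new Properties();
        try (InputStream in = S3ConfigHelper.class.getResourceAsStream("/s3.properties")) {
            if (in == null) {
                throw new IllegalStateException("/s3.properties not found on the classpath");
            }
            props.load(in); // read the fs.s3a.* entries from the file
        } catch (IOException e) {
            throw new RuntimeException("Cannot read /s3.properties", e);
        }
        // copy the three settings into the Hadoop configuration used by Spark
        sc.hadoopConfiguration().set("fs.s3a.access.key", props.getProperty("fs.s3a.access.key"));
        sc.hadoopConfiguration().set("fs.s3a.secret.key", props.getProperty("fs.s3a.secret.key"));
        sc.hadoopConfiguration().set("fs.s3a.endpoint", props.getProperty("fs.s3a.endpoint"));
    }
}
Calling S3ConfigHelper.applyS3Config(sc) right after the JavaSparkContext is created would then replace the three hard-coded set(...) calls.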
------------------
Run output:
=========0000=========
======111============
=========222=========
=========333=========
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
20/03/18 08:58:18 INFO SparkContext: Running Spark version 2.4.3
20/03/18 08:58:18 WARN Shell: Did not find winutils.exe: {}
java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:528)
at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:549)
at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:572)
at org.apache.hadoop.util.Shell.<clinit>
at org.apache.hadoop.util.StringUtils.<clinit>
at org.apache.hadoop.conf.Configuration.getBoolean(Configuration.java:1555)
at org.apache.hadoop.security.SecurityUtil.getLogSlowLookupsEnabled(SecurityUtil.java:497)
at org.apache.hadoop.security.SecurityUtil.<clinit>
at org.apache.hadoop.security.UserGroupInformation.initialize(UserGroupInformation.java:293)
at org.apache.hadoop.security.UserGroupInformation.ensureInitialized(UserGroupInformation.java:281)
at org.apache.hadoop.security.UserGroupInformation.loginUserFromSubject(UserGroupInformation.java:837)
at org.apache.hadoop.security.UserGroupInformation.getLoginUser(UserGroupInformation.java:807)
at org.apache.hadoop.security.UserGroupInformation.getCurrentUser(UserGroupInformation.java:680)
at org.apache.spark.util.Utils$$anonfun$getCurrentUserName$1.apply(Utils.scala:2422)
at org.apache.spark.util.Utils$$anonfun$getCurrentUserName$1.apply(Utils.scala:2422)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.util.Utils$.getCurrentUserName(Utils.scala:2422)
at org.apache.spark.SparkContext.<init>
at org.apache.spark.api.java.JavaSparkContext.<init>
at org.example.JavaDemo.SparkSqlCsvToCsv.main(SparkSqlCsvToCsv.java:40)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:448)
at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:419)
at org.apache.hadoop.util.Shell.<clinit>
... 16 more
20/03/18 08:58:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/03/18 08:58:19 INFO SparkContext: Submitted application: TestSpark
20/03/18 08:58:19 INFO SecurityManager: Changing view acls to: Ace
20/03/18 08:58:19 INFO SecurityManager: Changing modify acls to: Ace
20/03/18 08:58:19 INFO SecurityManager: Changing view acls groups to:
20/03/18 08:58:19 INFO SecurityManager: Changing modify acls groups to:
20/03/18 08:58:19 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(Ace); groups with view permissions: Set(); users with modify permissions: Set(Ace); groups with modify permissions: Set()
20/03/18 08:58:20 INFO Utils: Successfully started service 'sparkDriver' on port 53335.
20/03/18 08:58:20 INFO SparkEnv: Registering MapOutputTracker
20/03/18 08:58:20 INFO SparkEnv: Registering BlockManagerMaster
20/03/18 08:58:20 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
20/03/18 08:58:20 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
20/03/18 08:58:20 INFO DiskBlockManager: Created local directory at C:\Users\Lenovo\AppData\Local\Temp\blockmgr-49830c1c-118a-4540-a84b-6ba29b2c3bd8
20/03/18 08:58:21 INFO MemoryStore: MemoryStore started with capacity 1984.5 MB
20/03/18 08:58:21 INFO SparkEnv: Registering OutputCommitCoordinator
20/03/18 08:58:21 INFO Utils: Successfully started service 'SparkUI' on port 4040.
20/03/18 08:58:21 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://Ace-Sun:4040
20/03/18 08:58:21 INFO Executor: Starting executor ID driver on host localhost
20/03/18 08:58:21 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 53348.
20/03/18 08:58:21 INFO NettyBlockTransferService: Server created on Ace-Sun:53348
20/03/18 08:58:21 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
20/03/18 08:58:21 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, Ace-Sun, 53348, None)
20/03/18 08:58:21 INFO BlockManagerMasterEndpoint: Registering block manager Ace-Sun:53348 with 1984.5 MB RAM, BlockManagerId(driver, Ace-Sun, 53348, None)
20/03/18 08:58:21 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, Ace-Sun, 53348, None)
20/03/18 08:58:21 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, Ace-Sun, 53348, None)
=========444-1=========
=========444=========
=========555=========
Input file path on S3: s3a://emr-demo-input/mydata/emr-demo-data.csv
=========666=========
=========777=========
20/03/18 08:58:21 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/D:/DevWorkspase/eclipse-workspace/JavaDemo/spark-warehouse/').
20/03/18 08:58:21 INFO SharedState: Warehouse path is 'file:/D:/DevWorkspase/eclipse-workspace/JavaDemo/spark-warehouse/'.
20/03/18 08:58:22 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
=========8888=========
=========9999=========
******print schema *******
root
|-- Tier: string (nullable = true)
|-- SellerCode: string (nullable = true)
|-- SellerName: string (nullable = true)
|-- DataSource: string (nullable = true)
|-- SellerProvince: string (nullable = true)
|-- _201901: double (nullable = true)
|-- _201902: double (nullable = true)
|-- _201903: double (nullable = true)
|-- flag: string (nullable = false)
*************
*************
******df.show() print schema *******
20/03/18 08:58:28 INFO FileSourceStrategy: Pruning directories with:
20/03/18 08:58:28 INFO FileSourceStrategy: Post-Scan Filters: isnotnull(Tier#0),Contains(Tier#0, T)
20/03/18 08:58:28 INFO FileSourceStrategy: Output Data Schema: struct
20/03/18 08:58:28 INFO FileSourceScanExec: Pushed Filters: IsNotNull(Tier),StringContains(Tier,T)
20/03/18 08:58:28 INFO CodeGenerator: Code generated in 271.4181 ms
20/03/18 08:58:28 INFO CodeGenerator: Code generated in 35.9997 ms
20/03/18 08:58:28 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 244.1 KB, free 1984.3 MB)
20/03/18 08:58:29 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 20.0 KB, free 1984.2 MB)
20/03/18 08:58:29 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on Ace-Sun:53348 (size: 20.0 KB, free: 1984.5 MB)
20/03/18 08:58:29 INFO SparkContext: Created broadcast 0 from show at SparkSqlCsvToCsv.java:126
20/03/18 08:58:29 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4199352 bytes, open cost is considered as scanning 4194304 bytes.
20/03/18 08:58:29 INFO SparkContext: Starting job: show at SparkSqlCsvToCsv.java:126
20/03/18 08:58:29 INFO DAGScheduler: Got job 0 (show at SparkSqlCsvToCsv.java:126) with 1 output partitions
20/03/18 08:58:29 INFO DAGScheduler: Final stage: ResultStage 0 (show at SparkSqlCsvToCsv.java:126)
20/03/18 08:58:29 INFO DAGScheduler: Parents of final stage: List()
20/03/18 08:58:29 INFO DAGScheduler: Missing parents: List()
20/03/18 08:58:29 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[3] at show at SparkSqlCsvToCsv.java:126), which has no missing parents
20/03/18 08:58:29 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 14.9 KB, free 1984.2 MB)
20/03/18 08:58:29 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 7.2 KB, free 1984.2 MB)
20/03/18 08:58:29 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on Ace-Sun:53348 (size: 7.2 KB, free: 1984.5 MB)
20/03/18 08:58:29 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1161
20/03/18 08:58:29 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[3] at show at SparkSqlCsvToCsv.java:126) (first 15 tasks are for partitions Vector(0))
20/03/18 08:58:29 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
20/03/18 08:58:29 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 8323 bytes)
20/03/18 08:58:29 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
20/03/18 08:58:29 INFO FileScanRDD: Reading File path: s3a://emr-demo-input/mydata/emr-demo-data.csv, range: 0-5048, partition values: [empty row]
20/03/18 08:58:29 INFO CodeGenerator: Code generated in 22.3421 ms
20/03/18 08:58:30 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 3241 bytes result sent to driver
20/03/18 08:58:30 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 596 ms on localhost (executor driver) (1/1)
20/03/18 08:58:30 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
20/03/18 08:58:30 INFO DAGScheduler: ResultStage 0 (show at SparkSqlCsvToCsv.java:126) finished in 0.700 s
20/03/18 08:58:30 INFO DAGScheduler: Job 0 finished: show at SparkSqlCsvToCsv.java:126, took 0.737439 s
+----+----------+----------------------------+----------+--------------+--------------+-------------+-------------+----+
|Tier|SellerCode| SellerName|DataSource|SellerProvince| _201901| _201902| _201903|flag|
+----+----------+----------------------------+----------+--------------+--------------+-------------+-------------+----+
| T1| HE003| 医药有限公司| DDI| 河北|1.0559443903E8|3.312345429E7|5.428380069E7| up|
| T1| HE0009S|河公司田医药站| DDI| 河北| 249239.76| 168139.14| 260403.56| up|
| T1| HE006S| 河药集团有限公司| DDI| 河北| 3856199.08| 1384355.4| 4070853.03| up|
| T1| HEA1S| 邢医药药材有限公司| DDI| 河北| 405327.83| 63712.79| 89365.28| up|
| T1| H865S| 衡水医药有限公司| DDI| 河北| 648096.6| 188102.8| 239028.8| up|
| T1| HEA3S| 保定医药有限公司| DDI| 河北| 794278.6| 143358.86| 280220.74| up|
| T1| HB001|唐公司(新分公司)| DDI| 河北| 2844517.25| 1066305.9| 1154788.35| up|
| T1| T18S| 华医药有限公司| DDI| 河北| 1.986586353E7| 3419255.58| 9636006.07| up|
| T1| T34S| 国药控有限公司| DDI| 河北| 2073843.21| 698878.7| 799672.08| up|
| T2| H135S| 国药医药有限公司| DDI| 河北| 161440.74| 111466.11| 111115.2|down|
| T2| HE3S| 国药堂医药有限公司| DDI| 河北| 6660979.13| 1417602.22| 2650979.14| up|
| T2| HE9S| 国药堂公司| DDI| 河北| 4707805.76| 1884585.75| 2670068.27| up|
| T2| H17S| 国药岛医药有限公司| DDI| 河北| 2889987.07| 997135.23| 1670409.38| up|
| T2| H0368S| 国药堂坊医药有限公司| DDI| 河北| 2563005.46| 810446.44| 1546372.15| up|
| T2| H0593S| 国药堂药有限公司| DDI| 河北| 5412119.26| 1241300.64| 1654506.05| up|
| T2| 1006S| 河贸易有限公司| DDI| 河北| 31847.58| 6605.72| 9101.52| up|
| T2| 1206S| 承盛限责任公司| DDI| 河北| 372629.21| 81697.68| 157634.23| up|
| T2| H227S| 国药庄医药有限公司| DDI| 河北| 1382932.07| 243595.74| 892387.49| up|
| T2| H1S| 有限公司| DDI| 河北| 1581317.58| 1268579.3| 1270598.91| up|
| T2| 330S| 华有限公司| DDI| 河北| 2133488.87| 685468.02| 1198794.77| up|
+----+----------+----------------------------+----------+--------------+--------------+-------------+-------------+----+
only showing top 20 rows
******df.show() print schema *******
20/03/18 08:58:30 INFO SparkUI: Stopped Spark web UI at http://Ace-Sun:4040
20/03/18 08:58:30 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
20/03/18 08:58:30 INFO MemoryStore: MemoryStore cleared
20/03/18 08:58:30 INFO BlockManager: BlockManager stopped
20/03/18 08:58:30 INFO BlockManagerMaster: BlockManagerMaster stopped
20/03/18 08:58:30 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
20/03/18 08:58:30 INFO SparkContext: Successfully stopped SparkContext
20/03/18 08:58:30 INFO ShutdownHookManager: Shutdown hook called
20/03/18 08:58:30 INFO ShutdownHookManager: Deleting directory C:\Users\Lenovo\AppData\Local\Temp\spark-eb60c4d7-fe60-4af6-9eb6-3a2e934d1405
---------------------------
The exception above is thrown because there is no local Hadoop environment; it does not affect the run.
It can be resolved by importing the Hadoop libraries (including winutils.exe on Windows) into the local development environment and setting HADOOP_HOME / hadoop.home.dir, or by installing a full Hadoop environment locally.
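A lightweight workaround on Windows, without installing a full Hadoop distribution, is to download winutils.exe into a folder such as D:\hadoop\bin (the path is just an example) and point hadoop.home.dir at that folder at the very top of main(), before the SparkConf is built:
// Workaround sketch for the winutils warning on Windows; "D:\\hadoop" is an example path
// whose bin\ subfolder must contain winutils.exe. Setting the HADOOP_HOME environment
// variable to the same folder achieves the same effect.
System.setProperty("hadoop.home.dir", "D:\\hadoop");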
The code above is my personal learning and research code; there is some redundancy, so feel free to optimize it yourself.
For discussion or suggestions, you can add me on WeChat: spsace