import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
import static java.lang.System.*;
/**
 * Author  : Jackson
 * Date    : 2020/1/20
 * Version : 1.0
 */
public class WordCountJava {
    public static void main(String[] args) {
        // Build the Spark configuration and context (local mode)
        SparkConf conf = new SparkConf().setAppName("Java").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Read the input file
        JavaRDD<String> javardd = jsc.textFile("D:\\input\\test\\李庆\\wordcount.txt");
        // Split each line into words
        JavaRDD<String> word = javardd.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                String[] s1 = s.split(" ");
                return Arrays.asList(s1).iterator();
            }
        });
        // Map each word to a (word, 1) pair
        JavaPairRDD<String, Integer> pair = word.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // Sum the counts for each word
        JavaPairRDD<String, Integer> result = pair.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // Print each (word, count) pair to the console
        result.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> tuple) throws Exception {
                out.println(tuple);
            }
        });
        // Release the SparkContext
        jsc.stop();
    }
}
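For comparison, the same pipeline can be written far more compactly with Java 8 lambdas. This is only a sketch under the same assumptions as above (local master, same input path), not a separately tested program:

        // Lambda-style WordCount: same flatMap -> mapToPair -> reduceByKey -> foreach chain
        JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("Java").setMaster("local"));
        jsc.textFile("D:\\input\\test\\李庆\\wordcount.txt")
           .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
           .mapToPair(w -> new Tuple2<>(w, 1))
           .reduceByKey((a, b) -> a + b)
           .foreach(t -> System.out.println(t));
        jsc.stop();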
Result:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
20/01/20 19:19:57 INFO SparkContext: Running Spark version 2.1.0
20/01/20 19:19:58 INFO SecurityManager: Changing view acls to: Jackson,李庆
20/01/20 19:19:58 INFO SecurityManager: Changing modify acls to: Jackson,李庆
20/01/20 19:19:58 INFO SecurityManager: Changing view acls groups to:
20/01/20 19:19:58 INFO SecurityManager: Changing modify acls groups to:
20/01/20 19:19:58 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(Jackson, 李庆); groups with view permissions: Set(); users with modify permissions: Set(Jackson, 李庆); groups with modify permissions: Set()
20/01/20 19:20:01 INFO Utils: Successfully started service 'sparkDriver' on port 56834.
20/01/20 19:20:01 INFO SparkEnv: Registering MapOutputTracker
20/01/20 19:20:01 INFO SparkEnv: Registering BlockManagerMaster
20/01/20 19:20:01 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
20/01/20 19:20:01 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
20/01/20 19:20:01 INFO DiskBlockManager: Created local directory at C:\Users\李庆\AppData\Local\Temp\blockmgr-ec822fe3-38c5-4047-9017-b8ad42757d63
20/01/20 19:20:01 INFO MemoryStore: MemoryStore started with capacity 1989.6 MB
20/01/20 19:20:01 INFO SparkEnv: Registering OutputCommitCoordinator
20/01/20 19:20:01 INFO Utils: Successfully started service 'SparkUI' on port 4040.
20/01/20 19:20:01 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://192.168.2.1:4040
20/01/20 19:20:01 INFO Executor: Starting executor ID driver on host localhost
20/01/20 19:20:02 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 56848.
20/01/20 19:20:02 INFO NettyBlockTransferService: Server created on 192.168.2.1:56848
20/01/20 19:20:02 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
20/01/20 19:20:02 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 192.168.2.1, 56848, None)
20/01/20 19:20:02 INFO BlockManagerMasterEndpoint: Registering block manager 192.168.2.1:56848 with 1989.6 MB RAM, BlockManagerId(driver, 192.168.2.1, 56848, None)
20/01/20 19:20:02 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 192.168.2.1, 56848, None)
20/01/20 19:20:02 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 192.168.2.1, 56848, None)
20/01/20 19:20:02 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 127.1 KB, free 1989.5 MB)
20/01/20 19:20:02 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 14.3 KB, free 1989.5 MB)
20/01/20 19:20:02 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.2.1:56848 (size: 14.3 KB, free: 1989.6 MB)
20/01/20 19:20:02 INFO SparkContext: Created broadcast 0 from textFile at WordCountJava.java:31
20/01/20 19:20:03 INFO FileInputFormat: Total input paths to process : 1
20/01/20 19:20:03 INFO SparkContext: Starting job: foreach at WordCountJava.java:59
20/01/20 19:20:03 INFO DAGScheduler: Registering RDD 3 (mapToPair at WordCountJava.java:43)
20/01/20 19:20:03 INFO DAGScheduler: Got job 0 (foreach at WordCountJava.java:59) with 1 output partitions
20/01/20 19:20:03 INFO DAGScheduler: Final stage: ResultStage 1 (foreach at WordCountJava.java:59)
20/01/20 19:20:03 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 0)
20/01/20 19:20:03 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 0)
20/01/20 19:20:03 INFO DAGScheduler: Submitting ShuffleMapStage 0 (MapPartitionsRDD[3] at mapToPair at WordCountJava.java:43), which has no missing parents
20/01/20 19:20:03 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 5.0 KB, free 1989.5 MB)
20/01/20 19:20:03 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 2.8 KB, free 1989.5 MB)
20/01/20 19:20:03 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.2.1:56848 (size: 2.8 KB, free: 1989.6 MB)
20/01/20 19:20:03 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:996
20/01/20 19:20:03 INFO DAGScheduler: Submitting 1 missing tasks from ShuffleMapStage 0 (MapPartitionsRDD[3] at mapToPair at WordCountJava.java:43)
20/01/20 19:20:03 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
20/01/20 19:20:03 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 5977 bytes)
20/01/20 19:20:03 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
20/01/20 19:20:03 INFO HadoopRDD: Input split: file:/D:/input/test/李庆/wordcount.txt:0+96
20/01/20 19:20:03 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
20/01/20 19:20:03 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
20/01/20 19:20:03 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
20/01/20 19:20:03 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
20/01/20 19:20:03 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
20/01/20 19:20:03 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 1816 bytes result sent to driver
20/01/20 19:20:03 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 210 ms on localhost (executor driver) (1/1)
20/01/20 19:20:03 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
20/01/20 19:20:03 INFO DAGScheduler: ShuffleMapStage 0 (mapToPair at WordCountJava.java:43) finished in 0.227 s
20/01/20 19:20:03 INFO DAGScheduler: looking for newly runnable stages
20/01/20 19:20:03 INFO DAGScheduler: running: Set()
20/01/20 19:20:03 INFO DAGScheduler: waiting: Set(ResultStage 1)
20/01/20 19:20:03 INFO DAGScheduler: failed: Set()
20/01/20 19:20:03 INFO DAGScheduler: Submitting ResultStage 1 (ShuffledRDD[4] at reduceByKey at WordCountJava.java:52), which has no missing parents
20/01/20 19:20:03 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 3.1 KB, free 1989.5 MB)
20/01/20 19:20:03 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 1870.0 B, free 1989.4 MB)
20/01/20 19:20:03 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on 192.168.2.1:56848 (size: 1870.0 B, free: 1989.6 MB)
20/01/20 19:20:03 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:996
20/01/20 19:20:03 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (ShuffledRDD[4] at reduceByKey at WordCountJava.java:52)
20/01/20 19:20:03 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
20/01/20 19:20:03 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, localhost, executor driver, partition 0, ANY, 5750 bytes)
20/01/20 19:20:03 INFO Executor: Running task 0.0 in stage 1.0 (TID 1)
20/01/20 19:20:03 INFO ShuffleBlockFetcherIterator: Getting 1 non-empty blocks out of 1 blocks
20/01/20 19:20:03 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 4 ms
(aa,2)
(gg,2)
(spark,2)
(hadoop,2)
(dd,2)
(hadopp,2)
(linux,2)
(ff,2)
(bb,2)
(cc,2)
20/01/20 19:20:03 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 1632 bytes result sent to driver
20/01/20 19:20:03 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 38 ms on localhost (executor driver) (1/1)
20/01/20 19:20:03 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
20/01/20 19:20:03 INFO DAGScheduler: ResultStage 1 (foreach at WordCountJava.java:59) finished in 0.038 s
20/01/20 19:20:03 INFO DAGScheduler: Job 0 finished: foreach at WordCountJava.java:59, took 0.403764 s
20/01/20 19:20:03 INFO SparkContext: Invoking stop() from shutdown hook
20/01/20 19:20:03 INFO SparkUI: Stopped Spark web UI at http://192.168.2.1:4040
20/01/20 19:20:03 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
20/01/20 19:20:03 INFO MemoryStore: MemoryStore cleared
20/01/20 19:20:03 INFO BlockManager: BlockManager stopped
20/01/20 19:20:03 INFO BlockManagerMaster: BlockManagerMaster stopped
20/01/20 19:20:03 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
20/01/20 19:20:03 INFO SparkContext: Successfully stopped SparkContext
20/01/20 19:20:03 INFO ShutdownHookManager: Shutdown hook called
20/01/20 19:20:03 INFO ShutdownHookManager: Deleting directory C:\Users\李庆\AppData\Local\Temp\spark-d674268f-2495-4339-a2ee-543d2285308f
Process finished with exit code 0
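If the counts should be persisted instead of only printed, the foreach step can be replaced with a call to saveAsTextFile, which writes one part file per partition. A minimal sketch; the output directory below is only illustrative and must not exist before the job runs:

        // Write each (word, count) tuple as a text line instead of printing it
        result.saveAsTextFile("D:\\output\\wordcount");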
————Stay hungry, keep learning
Jackson_MVP