spark-1.6.3-bin-hadoop2.6, hadoop-2.6.4, jdk1.7.0_67, IDEA 14.1.5.
The Hadoop cluster is a pseudo-distributed installation, and only HDFS is started for these runs; Spark starts a single Worker. Hadoop and Spark run inside a virtual machine, while IDEA is installed directly on Win10. The virtual machine's IP is 192.168.128.128; the local machine's IP is 192.168.0.183.
To connect to a Spark cluster from Java through YARN, see the earlier post "Java Web submits jobs to Spark". The reason for this post is that, in practice, driving the Spark cluster through YARN proved too slow, so I tried connecting from Java directly to a Spark Standalone cluster. Note that only a single node is used here; with multiple nodes the behavior may differ.
A total of five experiments were run, ending with a setup that can both connect to the Spark Standalone cluster and monitor the submitted job. All of the code can be downloaded from https://github.com/fansy1990/JavaConnectSaprk01 .
package demo

import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by fansy on 2017/7/5.
 */
object WordCount {
  def main(args: Array[String]) {
    val input = "hdfs://192.168.128.128:8020/user/root/magic"
    val output = ""
    val appName = "word count"
    val master = "spark://192.168.128.128:7077"
    val conf = new SparkConf().setAppName(appName).setMaster(master)
    val sc = new SparkContext(conf)
    val line = sc.textFile(input)
    line.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).collect().foreach(println)
    sc.stop()
  }
}
Here the Spark master address is set directly in the code, the job is run against the cluster, and sc.stop() is called at the end to shut down the SparkContext. Running it, however, produces an error: the application jar is never shipped to the cluster, so the executors cannot load the job's classes. The next attempt therefore tells Spark which jar to distribute via setJars:
val jars = Array("C:\\Users\\fansy\\workspace_idea_tmp\\JavaConnectSaprk01\\out\\artifacts\\wordcount\\wordcount.jar")
val conf = new SparkConf().setAppName(appName).setMaster(master).setJars(jars)

With the jar listed, Spark ships it to the executors. A further attempt additionally enables event logging and tries to pin the driver onto the cluster node:

val conf = new SparkConf().setAppName(appName).setMaster(master).setJars(jars)
  .set("spark.eventLog.enabled", "true")
  .set("spark.eventLog.dir", "hdfs://node10:8020/eventLog")
  .set("spark.driver.host", "192.168.128.128")
  .set("spark.driver.port", "8993")
val sc = new SparkContext(conf)
Running this version, the job cannot even be submitted anymore. I have not found the exact cause yet, but forcing the Driver onto another node this way appears to be problematic (at least for Standalone mode).
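For completeness, the combination that the final demo ends up using keeps the jar shipping and the event-log settings but leaves spark.driver.host / spark.driver.port alone, so the driver simply stays on the machine that submits the job. In Java (the language of the later demos) it would look roughly as follows; appName, master and the jar path are the same values as in the snippets above:

SparkConf conf = new SparkConf()
        .setAppName(appName)
        .setMaster(master)
        .setJars(jars)  // ship the packaged application jar to the executors
        .set("spark.eventLog.enabled", "true")
        .set("spark.eventLog.dir", "hdfs://node10:8020/eventLog");
// no spark.driver.host / spark.driver.port override: the driver stays on the submitting (Win10) machine
SparkContext sc = new SparkContext(conf);

To get a success/failure result back and keep the door open for monitoring, the next step wraps the Spark job in a thread (package demo03):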
package demo03;

import java.util.concurrent.Callable;

/**
 * Thread task that wraps the Spark job
 * Created by fansy on 2017/7/5.
 */
public class RunTool implements Callable<Boolean> {
    private String input;
    private String output;
    private String appName;
    private String master;
    private String jars;
    private String logEnabled;
    private String logDir;

    public RunTool(){}

    public RunTool(String[] args){
        this.input = args[0];
        this.output = args[1];
        this.appName = args[2];
        this.master = args[3];
        this.jars = args[4];
        this.logEnabled = args[5];
        this.logDir = args[6];
    }

    @Override
    public Boolean call() throws Exception {
        return WordCount.run(new String[]{input, output, appName, master, jars, logEnabled, logDir});
    }
}
The thread class implements the Callable interface so that it has a return value; the main class then decides success or failure based on that value.
package demo03;

import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;

/**
 * Created by fansy on 2017/7/5.
 */
public class Driver {
    public static void main(String[] args) {
        //
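        // The remainder of this main method is a sketch (not necessarily the original code):
        // run RunTool in a FutureTask on its own thread and check the Boolean it returns.
        // The argument values below are assumed to match the ones used in demo04.
        String[] arg = new String[]{
                "hdfs://node10:8020/user/root/magic",          // input
                "",                                            // output
                "wordcount" + System.currentTimeMillis(),      // appName
                "spark://node10:7077",                         // master
                "C:\\Users\\fansy\\workspace_idea_tmp\\JavaConnectSaprk01\\out\\artifacts\\wordcount\\wordcount.jar", // jars
                "true",                                        // spark.eventLog.enabled
                "hdfs://node10:8020/eventLog"                  // spark.eventLog.dir
        };
        FutureTask<Boolean> future = new FutureTask<>(new RunTool(arg));
        new Thread(future).start();
        try {
            // block until the Spark job finishes, then report its result
            System.out.println(future.get() ? "Job done with success state" : "Job failed!");
        } catch (InterruptedException | ExecutionException e) {
            e.printStackTrace();
        }
    }
}

This demo03 version only waits for the Callable's return value. The demo04 version below goes one step further: while the SparkContext is still alive it also polls a SparkStatusTracker, so the status of each running job can be printed.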
package demo04;

import org.apache.spark.SparkContext;
import org.apache.spark.SparkJobInfo;
import org.apache.spark.SparkStatusTracker;

import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;

/**
 * Created by fansy on 2017/7/5.
 */
public class Driver {
    public static void main(String[] args) throws InterruptedException {
        String master = "spark://node10:7077";
        String appName = "wordcount" + System.currentTimeMillis();
        String[] jars = "C:\\Users\\fansy\\workspace_idea_tmp\\JavaConnectSaprk01\\out\\artifacts\\wordcount\\wordcount.jar".split(",");
        String logEnabled = "true";
        String logDir = "hdfs://node10:8020/eventLog";
        String[] arg = new String[]{
                "hdfs://node10:8020/user/root/magic",
                ""
        };
        // 1. get the SparkContext
        SparkContext sc = Utils.getSc(master, appName, jars, logEnabled, logDir);
        // 2. submit the job on a separate thread
        FutureTask<Boolean> future = new FutureTask<>(new WordCount(sc, arg));
        new Thread(future).start();
        // 3. monitor
        String appId = sc.applicationId();
        System.out.println("AppId:" + appId);
        SparkStatusTracker sparkStatusTracker = null;
        int[] jobIds;
        SparkJobInfo jobInfo;
        while (!sc.isStopped()) { // keep monitoring as long as the SparkContext has not been stopped
            Thread.sleep(2000);
            // fetch all jobs
            sparkStatusTracker = sc.statusTracker();
            jobIds = sparkStatusTracker.getJobIdsForGroup(null);
            for (int jobId : jobIds) {
                scala.Option<SparkJobInfo> info = sparkStatusTracker.getJobInfo(jobId);
                if (info.isEmpty()) {
                    System.out.println("JobId:" + jobId + ", no job info available!");
                } else {
                    jobInfo = info.get();
                    System.out.println("JobId:" + jobId + ", status:" + jobInfo.status().name());
                }
            }
        }
        // 4. check whether the thread task returned true
        boolean flag = true;
        while (flag) {
            try {
                Thread.sleep(200);
                System.out.println("Job closing ...");
                if (future.isDone()) {
                    flag = false;
                    if (future.get().booleanValue()) {
                        System.out.println("Job " + appId + " done with success state");
                    } else {
                        System.out.println("Job " + appId + " failed!");
                    }
                }
            } catch (InterruptedException | ExecutionException e) {
                e.printStackTrace();
            }
        }
    }
}
package demo04

import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by fansy on 2017/7/6.
 */
object Utils {
  /**
   * Build a SparkContext
   * @param master
   * @param appName
   * @param jars
   * @param logEnabled
   * @param logDir
   * @return
   */
  def getSc(master: String, appName: String, jars: Array[String], logEnabled: String, logDir: String): SparkContext = {
    val conf = new SparkConf().setMaster(master).setAppName(appName).setJars(jars)
      .set("spark.eventLog.enabled", logEnabled)
      .set("spark.eventLog.dir", logDir)
    new SparkContext(conf)
  }
}
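The demo04 Driver above also uses a WordCount task class from the same package, constructed with the shared SparkContext and the argument array; it is not listed above. A minimal sketch of such a class in Java follows; the class name and constructor signature come from the Driver's new WordCount(sc, arg) call, while the body is an assumption modeled on the Scala word count at the top of this post. The important detail is that it stops the SparkContext in a finally block, which is what makes the Driver's monitoring loop exit.

package demo04;

import java.util.Arrays;
import java.util.concurrent.Callable;

import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class WordCount implements Callable<Boolean> {
    private final SparkContext sc;
    private final String[] args;   // args[0] = input path, args[1] = output path (unused in this sketch)

    public WordCount(SparkContext sc, String[] args) {
        this.sc = sc;
        this.args = args;
    }

    @Override
    public Boolean call() {
        try {
            // wrap the shared Scala SparkContext so the Java RDD API can be used
            JavaSparkContext jsc = new JavaSparkContext(sc);
            runWordCount(jsc, args[0]);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            // stopping the SparkContext ends the while (!sc.isStopped()) loop in Driver
            sc.stop();
        }
    }

    // static, so the anonymous function classes below do not capture the
    // (non-serializable) enclosing WordCount instance
    private static void runWordCount(JavaSparkContext jsc, String input) {
        JavaRDD<String> lines = jsc.textFile(input);
        JavaPairRDD<String, Integer> counts = lines
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public Iterable<String> call(String line) {
                        return Arrays.asList(line.split(" "));
                    }
                })
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String word) {
                        return new Tuple2<>(word, 1);
                    }
                })
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer a, Integer b) {
                        return a + b;
                    }
                });
        for (Tuple2<String, Integer> t : counts.collect()) {
            System.out.println(t._1() + ": " + t._2());
        }
    }
}

With Utils, WordCount and Driver in place, running Driver from IDEA submits the word count to the Standalone cluster, prints each job's status every two seconds until the SparkContext is stopped, and finally reports success or failure from the Callable's return value.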