Hive on Spark Source Code Analysis (1): SparkTask
Hive on Spark Source Code Analysis (2): SparkSession and HiveSparkClient
Hive on Spark Source Code Analysis (3): SparkClient and SparkClientImpl (Part 1)
Hive on Spark Source Code Analysis (4): SparkClient and SparkClientImpl (Part 2)
Hive on Spark Source Code Analysis (5): RemoteDriver
Hive on Spark Source Code Analysis (6): RemoteSparkJobMonitor and JobHandle
The SparkClient interface defines the API of the remote Spark client:
// Submit a job for asynchronous execution; returns a JobHandle that can be used to monitor the job
<T extends Serializable> JobHandle<T> submit(Job<T> job);
// Ask the remote context to run a job immediately. This method bypasses the job queue,
// so it is recommended only for tasks that finish quickly.
// Returns a Future that can be used to monitor the job's result
<T extends Serializable> Future<T> run(Job<T> job);
/**
* Stops the remote context.
*
* Any pending jobs will be cancelled, and the remote context will be torn down.
*/
void stop();
/**
* Adds a jar file to the running remote context.
*
* Note that the URL should be reachable by the Spark driver process. If running the driver
* in cluster mode, it may reside on a different host, meaning "file:" URLs have to exist
* on that node (and not on the client machine).
*
* @param uri The location of the jar file.
* @return A future that can be used to monitor the operation.
*/
Future<?> addJar(URI uri);
/**
* Adds a file to the running remote context.
*
* Note that the URL should be reachable by the Spark driver process. If running the driver
* in cluster mode, it may reside on a different host, meaning "file:" URLs have to exist
* on that node (and not on the client machine).
*
* @param uri The location of the file.
* @return A future that can be used to monitor the operation.
*/
Future<?> addFile(URI uri);
/**
* Get the count of executors.
*/
Future<Integer> getExecutorCount();
/**
* Get default parallelism. For standalone mode, this can be used to get total number of cores.
*/
Future<Integer> getDefaultParallelism();
/**
* Check if remote context is still active.
*/
boolean isActive();
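Before diving into the implementation, here is a hedged sketch of how a caller might drive this API. The class name and the job body below are illustrative assumptions, not Hive code (in Hive the actual caller is RemoteHiveSparkClient):
import java.util.Arrays;
import org.apache.hive.spark.client.Job;
import org.apache.hive.spark.client.JobContext;
import org.apache.hive.spark.client.JobHandle;
import org.apache.hive.spark.client.SparkClient;

public class SubmitExample {
  // Submits a trivial count job and blocks for its result.
  public static Long countNumbers(SparkClient client) throws Exception {
    // A Job is a Serializable callback that runs inside the RemoteDriver process.
    Job<Long> job = new Job<Long>() {
      @Override
      public Long call(JobContext jc) throws Exception {
        // jc.sc() exposes the JavaSparkContext living in the remote driver
        return jc.sc().parallelize(Arrays.asList(1, 2, 3, 4)).count();
      }
    };
    JobHandle<Long> handle = client.submit(job);
    return handle.get(); // JobHandle<T> extends Future<T>, so we can block on it
  }
}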
SparkClientImpl is the concrete implementation of the SparkClient interface. Let's look at it in detail.
First, the constructor. It is fairly long and starts by assigning the fields:
this.conf = conf;
this.hiveConf = hiveConf;
this.childIdGenerator = new AtomicInteger();
this.jobs = Maps.newConcurrentMap();
String clientId = UUID.randomUUID().toString();
// Generate the secret used to authenticate the RemoteDriver when it connects back
String secret = rpcServer.createSecret();
// startDriver launches the RemoteDriver in a new process and returns a thread that
// waits for that process's result
this.driverThread = startDriver(rpcServer, clientId, secret);
// Create the ClientProtocol used for RPC: sending messages (submitting jobs) and
// responding to messages from the remote end
this.protocol = new ClientProtocol();
try {
// The RPC server will take care of timeouts (of the RPC client connection) here.
// Register this RPC client with the remote RpcServer
this.driverRpc = rpcServer.registerClient(clientId, secret, protocol).get();
} catch (Throwable e) {
if (e.getCause() instanceof TimeoutException) {
LOG.error("Timed out waiting for client to connect.\\nPossible reasons include network " +
"issues, errors in remote driver or the cluster has no available resources, etc." +
"\\nPlease check YARN or Spark driver\'s logs for further information.\\nReason2 from SparkClientImpl", e);
} else {
LOG.error("Error while waiting for client to connect.", e);
}
// Interrupt the driver thread
driverThread.interrupt();
try {
// Wait for the driver thread to die
driverThread.join();
} catch (InterruptedException ie) {
// Give up.
LOG.debug("Interrupted before driver thread was finished.");
}
throw Throwables.propagate(e);
}
rpcServer.registerClient registers the client's id and secret with the RpcServer so the incoming connection can be authenticated, and returns a Future for the eventual Rpc:
@VisibleForTesting
Future<Rpc> registerClient(final String clientId, String secret,
RpcDispatcher serverDispatcher, long clientTimeoutMs) {
final Promise<Rpc> promise = group.next().newPromise();
Runnable timeout = new Runnable() {
@Override
public void run() {
promise.setFailure(new TimeoutException("Timed out waiting for client connection."));
}
};
// Schedule `timeout` to run once, clientTimeoutMs milliseconds from now.
// Given timeout's run method, this means: if the promise has not completed by then,
// fail it via promise.setFailure
ScheduledFuture<?> timeoutFuture = group.schedule(timeout,
clientTimeoutMs,
TimeUnit.MILLISECONDS);
final ClientInfo client = new ClientInfo(clientId, promise, secret, serverDispatcher,
timeoutFuture);
// Refuse to register the same client twice
if (pendingClients.putIfAbsent(clientId, client) != null) {
throw new IllegalStateException(
String.format("Client '%s' already registered.", clientId));
}
promise.addListener(new GenericFutureListener<Promise<Rpc>>() {
@Override
public void operationComplete(Promise<Rpc> p) {
if (!p.isSuccess()) {
pendingClients.remove(clientId);
}
}
});
return promise;
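The promise-plus-scheduled-timeout construction above is a standard Netty idiom. Below is a self-contained sketch of the same pattern outside Hive; the class name and the 100 ms value are made up for illustration:
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.util.concurrent.Promise;
import io.netty.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class PromiseTimeoutDemo {
  public static void main(String[] args) throws Exception {
    NioEventLoopGroup group = new NioEventLoopGroup(1);
    Promise<String> promise = group.next().newPromise();

    // Fail the promise if nobody completes it within 100 ms
    ScheduledFuture<?> timeout = group.schedule(
        () -> promise.tryFailure(new TimeoutException("Timed out waiting for client connection.")),
        100, TimeUnit.MILLISECONDS);

    // Once the promise completes (either way), the timeout task is no longer needed
    promise.addListener(future -> timeout.cancel(false));

    // In registerClient, the connecting client would complete the promise with the Rpc;
    // here nobody does, so get() surfaces the TimeoutException after 100 ms
    try {
      promise.get();
    } catch (Exception e) {
      System.out.println("Promise failed: " + e.getCause());
    } finally {
      group.shutdownGracefully();
    }
  }
}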
Back in the constructor: once the registration future resolves and driverRpc is in hand, a listener is attached so the client notices when the RPC channel closes:
driverRpc.addListener(new Rpc.Listener() {
@Override
public void rpcClosed(Rpc rpc) {
// If the RPC channel closes while this SparkClient is still alive, log a warning
if (isAlive) {
LOG.warn("Client RPC channel closed unexpectedly.");
isAlive = false;
}
}
});
// Construction succeeded; mark the client as alive
isAlive = true;
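That completes the constructor. For context, here is a hedged sketch of how such a client is typically created and torn down; in Hive the real entry point is RemoteHiveSparkClient going through SparkClientFactory, and the configuration values below are illustrative:
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.spark.client.SparkClient;
import org.apache.hive.spark.client.SparkClientFactory;

public class ClientLifecycle {
  public static void main(String[] args) throws Exception {
    Map<String, String> sparkConf = new HashMap<>();
    sparkConf.put("spark.master", "local");             // illustrative value
    sparkConf.put("spark.app.name", "spark-client-demo");
    HiveConf hiveConf = new HiveConf();

    SparkClientFactory.initialize(sparkConf);           // starts the shared RpcServer
    SparkClient client = SparkClientFactory.createClient(sparkConf, hiveConf);
    try {
      System.out.println("Default parallelism: " + client.getDefaultParallelism().get());
    } finally {
      client.stop();                                    // tears down the remote context
      SparkClientFactory.stop();
    }
  }
}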
Now let's look at the implementation of startDriver. It first obtains the RpcServer's host and port, which are later handed to the RemoteDriver:
private Thread startDriver(final RpcServer rpcServer, final String clientId, final String secret)
throws IOException {
Runnable runnable;
final String serverAddress = rpcServer.getAddress();
final String serverPort = String.valueOf(rpcServer.getPort());
LOG.warn("!!!! Running remote driver in-process. !!!!");
runnable = new Runnable() {
@Override
public void run() {
List<String> args = Lists.newArrayList();
args.add("--remote-host");
args.add(serverAddress);
args.add("--remote-port");
args.add(serverPort);
args.add("--client-id");
args.add(clientId);
args.add("--secret");
args.add(secret);
for (Map.Entry<String, String> e : conf.entrySet()) {
args.add("--conf");
args.add(String.format("%s=%s", e.getKey(), conf.get(e.getKey())));
}
try {
RemoteDriver.main(args.toArray(new String[args.size()]));
} catch (Exception e) {
LOG.error("Error running driver.", e);
}
}
};
}
Otherwise the driver is launched in a separate process through spark-submit. First, the Spark installation and log directories are resolved:
String sparkHome = conf.get(SPARK_HOME_KEY);
if (sparkHome == null) {
sparkHome = System.getenv(SPARK_HOME_ENV);
}
if (sparkHome == null) {
sparkHome = System.getProperty(SPARK_HOME_KEY);
}
String sparkLogDir = conf.get("hive.spark.log.dir");
if (sparkLogDir == null) {
if (sparkHome == null) {
sparkLogDir = "./target/";
} else {
sparkLogDir = sparkHome + "/logs/";
}
}
File properties = File.createTempFile("spark-submit.", ".properties");
if (!properties.setReadable(false) || !properties.setReadable(true, true)) {
throw new IOException("Cannot change permissions of job properties file.");
}
properties.deleteOnExit();
// Accumulates all of the configuration to be written out
Properties allProps = new Properties();
try {
URL sparkDefaultsUrl = Thread.currentThread().getContextClassLoader().getResource("spark-defaults.conf");
if (sparkDefaultsUrl != null) {
LOG.info("Loading spark defaults: " + sparkDefaultsUrl);
allProps.load(new ByteArrayInputStream(Resources.toByteArray(sparkDefaultsUrl)));
}
} catch (Exception e) {
String msg = "Exception trying to load spark-defaults.conf: " + e;
throw new IOException(msg, e);
}
for (Map.Entry<String, String> e : conf.entrySet()) {
allProps.put(e.getKey(), conf.get(e.getKey()));
}
allProps.put(SparkClientFactory.CONF_CLIENT_ID, clientId);
allProps.put(SparkClientFactory.CONF_KEY_SECRET, secret);
allProps.put(DRIVER_OPTS_KEY, driverJavaOpts);
allProps.put(EXECUTOR_OPTS_KEY, executorJavaOpts);
... ... ...
Writer writer = new OutputStreamWriter(new FileOutputStream(properties), Charsets.UTF_8);
try {
allProps.store(writer, "Spark Context configuration");
} finally {
writer.close();
}
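Two details above are worth calling out: the properties file will contain the client secret, so its read bit is first cleared for everyone and then re-granted to the owner only, and the file is removed when the JVM exits. The same pattern in isolation (a standalone sketch, not Hive code):
import java.io.File;
import java.io.IOException;

public class OwnerOnlyTempFile {
  public static void main(String[] args) throws IOException {
    File f = File.createTempFile("demo.", ".properties");
    // Clear the read bit for everyone, then re-grant it to the owner only,
    // so other local users cannot read secrets stored in the file
    if (!f.setReadable(false) || !f.setReadable(true, true)) {
      throw new IOException("Cannot change permissions of " + f);
    }
    f.deleteOnExit();
    System.out.println("Created " + f + ", readable by the owner only.");
  }
}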
Next the command line is assembled. If HiveServer2 authenticates with Kerberos, a kinit invocation is prepended; note the ";" appended to the keytab argument: the whole argv is later joined with spaces and executed via "sh -c", so the semicolon makes the shell run kinit before spark-submit:
List<String> argv = Lists.newArrayList();
if (hiveConf.getVar(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION).equalsIgnoreCase("kerberos")) {
argv.add("kinit");
String principal = SecurityUtil.getServerPrincipal(hiveConf.getVar(ConfVars.HIVE_SERVER2_KERBEROS_PRINCIPAL),
"0.0.0.0");
String keyTabFile = hiveConf.getVar(ConfVars.HIVE_SERVER2_KERBEROS_KEYTAB);
argv.add(principal);
argv.add("-k");
argv.add("-t");
argv.add(keyTabFile + ";");
}
String master = conf.get("spark.master");
Preconditions.checkArgument(master != null, "spark.master is not defined.");
if (sparkHome != null) {
argv.add(new File(sparkHome, "bin/spark-submit").getAbsolutePath());
} else {
LOG.info("No spark.home provided, calling SparkSubmit directly.");
argv.add(new File(System.getProperty("java.home"), "bin/java").getAbsolutePath());
// If running in local or client mode (in effect, any mode other than yarn-cluster),
// the driver's JVM options have to go on the java command line directly
if (master.startsWith("local") || master.startsWith("mesos") || master.endsWith("-client") || master.startsWith("spark")) {
String mem = conf.get("spark.driver.memory");
if (mem != null) {
argv.add("-Xms" + mem);
argv.add("-Xmx" + mem);
}
// Configure the classpath
String cp = conf.get("spark.driver.extraClassPath");
if (cp != null) {
argv.add("-classpath");
argv.add(cp);
}
String libPath = conf.get("spark.driver.extraLibPath");
if (libPath != null) {
argv.add("-Djava.library.path=" + libPath);
}
String extra = conf.get(DRIVER_OPTS_KEY);
if (extra != null) {
for (String opt : extra.split("[ ]")) {
if (!opt.trim().isEmpty()) {
argv.add(opt.trim());
}
}
}
}
argv.add("org.apache.spark.deploy.SparkSubmit")
if (master.equals("yarn-cluster")) {
String executorCores = conf.get("spark.executor.cores");
if (executorCores != null) {
argv.add("--executor-cores");
argv.add(executorCores);
}
String executorMemory = conf.get("spark.executor.memory");
if (executorMemory != null) {
argv.add("--executor-memory");
argv.add(executorMemory);
}
String numOfExecutors = conf.get("spark.executor.instances");
if (numOfExecutors != null) {
argv.add("--num-executors");
argv.add(numOfExecutors);
}
}
if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS)) {
try {
String currentUser = Utils.getUGI().getShortUserName();
// do not do impersonation in CLI mode
if (!currentUser.equals(System.getProperty("user.name"))) {
LOG.info("Attempting impersonation of " + currentUser);
argv.add("--proxy-user");
argv.add(currentUser);
}
} catch (Exception e) {
String msg = "Cannot obtain username: " + e;
throw new IllegalStateException(msg, e);
}
}
argv.add("--properties-file");
argv.add(properties.getAbsolutePath());
argv.add("--class");
argv.add(RemoteDriver.class.getName());
String jar = "spark-internal";
if (SparkContext.jarOfClass(this.getClass()).isDefined()) {
jar = SparkContext.jarOfClass(this.getClass()).get();
}
argv.add(jar);
argv.add("--remote-host");
argv.add(serverAddress);
argv.add("--remote-port");
argv.add(serverPort);
for (String hiveSparkConfKey : RpcConfiguration.HIVE_SPARK_RSC_CONFIGS) {
String value = RpcConfiguration.getValue(hiveConf, hiveSparkConfKey);
argv.add("--conf");
argv.add(String.format("%s=%s", hiveSparkConfKey, value));
}
String cmd = Joiner.on(" ").join(argv);
LOG.info("Running client driver with argv: {}", cmd);
ProcessBuilder pb = new ProcessBuilder("sh", "-c", cmd);
// Hide Hive's configuration from Spark so the two do not interfere with each other
pb.environment().remove("HIVE_HOME");
pb.environment().remove("HIVE_CONF_DIR");
if (isTesting != null) {
pb.environment().put("SPARK_TESTING", isTesting);
}
final Process child = pb.start();
int childId = childIdGenerator.incrementAndGet();
final List<String> childErrorLog = new ArrayList<String>();
// Redirect the child process's stdout and stderr
redirect("stdout-redir-" + childId, new Redirector(child.getInputStream()));
redirect("stderr-redir-" + childId, new Redirector(child.getErrorStream(), childErrorLog));
The runnable for this branch then simply waits for the child process to exit, cancelling the client registration if the child dies before connecting back:
runnable = new Runnable() {
@Override
public void run() {
try {
// Block until the child process exits
int exitCode = child.waitFor();
if (exitCode != 0) {
StringBuilder errStr = new StringBuilder();
for (String s : childErrorLog) {
errStr.append(s);
errStr.append('\n');
}
rpcServer.cancelClient(clientId,
"Child process exited before connecting back with error log " + errStr.toString());
LOG.warn("Child process exited with code {}", exitCode);
}
} catch (InterruptedException ie) {
LOG.warn("Waiting thread interrupted, killing child process.");
Thread.interrupted();
child.destroy();
} catch (Exception e) {
LOG.warn("Exception while waiting for child process.", e);
}
}
};
}
Thread thread = new Thread(runnable);
thread.setDaemon(true);
thread.setName("Driver");
thread.start();
return thread;
}
Whichever branch built the runnable, it is wrapped in a daemon thread named "Driver", started, and returned to the constructor, which keeps it as driverThread.