In this article we analyze how a job is started. We walk through the submission path of a job on the server side.
JobRequestHandler
JobRequestHandler is responsible for handling job-related request messages. When it receives a request to start a job, it executes the startJob method, which works as follows:
- Extract the job identifier and related information from the request
- Perform an authorization check
- Invoke the start method of JobManager
private JsonBean startJob(RequestContext ctx) {
  Repository repository = RepositoryManager.getInstance().getRepository();

  String[] elements = ctx.getUrlElements();
  String jobIdentifier = elements[elements.length - 2];
  long jobId = HandlerUtils.getJobIdFromIdentifier(jobIdentifier, repository);

  // Authorization check
  AuthorizationEngine.startJob(String.valueOf(jobId));

  AuditLoggerManager.getInstance().logAuditEvent(ctx.getUserName(),
      ctx.getRequest().getRemoteAddr(), "submit", "job", String.valueOf(jobId));

  // TODO(SQOOP-1638): This should be outsourced somewhere more suitable than here
  if (JobManager.getInstance().getNotificationBaseUrl() == null) {
    String url = ctx.getRequest().getRequestURL().toString();
    JobManager.getInstance().setNotificationBaseUrl(
        url.split("v1")[0] + "/v1/job/status/notification/");
  }

  MSubmission submission = JobManager.getInstance()
      .start(jobId, prepareRequestEventContext(ctx));
  return new SubmissionBean(submission);
}
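To make the URL handling above concrete, here is a small standalone sketch of how the job identifier and the notification base URL are derived from the request URL. The example URL is only an assumed illustration, and ctx.getUrlElements() is approximated here by splitting on '/'.

public class UrlParsingSketch {
  public static void main(String[] args) {
    // Assumed example request URL for starting job 7 (illustration only).
    String requestUrl = "http://sqoop-server:12000/sqoop/v1/job/7/start";

    // ctx.getUrlElements() roughly corresponds to splitting the URL path on '/'.
    String[] elements = requestUrl.split("/");
    // The last element is the action ("start"); the one before it is the job identifier.
    String jobIdentifier = elements[elements.length - 2];  // "7"
    String action = elements[elements.length - 1];          // "start"

    // Notification base URL, built the same way as in startJob():
    // everything before "v1" plus the fixed notification path.
    String notificationBase = requestUrl.split("v1")[0] + "/v1/job/status/notification/";

    System.out.println(jobIdentifier + " " + action);
    System.out.println(notificationBase);
  }
}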
JobManager
The main task of JobManager's start method is to populate a JobRequest (an abstract class). MRJobRequest is one of its subclasses and carries the various configuration parameters needed to run the map/reduce tasks.
The start method proceeds as follows:
- Create an MSubmission that represents the execution state
- Create a JobRequest that represents the job's configuration
- Call executionEngine.prepareJob to set map/reduce parameters such as the mapper class (a sketch follows the code below)
- Check whether the job is already running
- Call submissionEngine.submit to submit the job
- Persist the execution state (MSubmission) in the Repository
public MSubmission start(long jobId, HttpEventContext ctx) {
  MSubmission mSubmission = createJobSubmission(ctx, jobId);
  JobRequest jobRequest = createJobRequest(jobId, mSubmission);
  // Bootstrap job to execute in the configured execution engine
  prepareJob(jobRequest);
  // Make sure that this job id is not currently running and submit the job
  // only if it's not.
  synchronized (getClass()) {
    MSubmission lastSubmission = RepositoryManager.getInstance().getRepository()
        .findLastSubmissionForJob(jobId);
    if (lastSubmission != null && lastSubmission.getStatus().isRunning()) {
      throw new SqoopException(DriverError.DRIVER_0002, "Job with id " + jobId);
    }
    // NOTE: the following is a blocking call
    boolean success = submissionEngine.submit(jobRequest);
    if (!success) {
      invokeDestroyerOnJobFailure(jobRequest);
      mSubmission.setStatus(SubmissionStatus.FAILURE_ON_SUBMIT);
    }
    // Persist the submission record to the repository: on failure we persist the
    // FAILURE status, on success the SUCCESS status (which is the default one).
    RepositoryManager.getInstance().getRepository().createSubmission(mSubmission);
  }
  return mSubmission;
}
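The helper methods createJobSubmission, createJobRequest and prepareJob are not quoted here. prepareJob delegates to the configured execution engine; the sketch below gives a rough idea of what the MapReduce execution engine's prepareJob does. The class names are the ones the MapReduce engine is believed to use, but treat the exact wiring as an approximation rather than the actual source (the real method also registers dependency jars and the extractor/loader/partitioner class names in the driver context).

// Rough sketch of MapreduceExecutionEngine.prepareJob() (approximate, not the actual source).
public void prepareJob(JobRequest jobRequest) {
  MRJobRequest request = (MRJobRequest) jobRequest;

  // Wire the generic Sqoop MapReduce classes into the request; submit() later copies
  // these onto the Hadoop Job via job.setMapperClass(), job.setInputFormatClass(), etc.
  request.setInputFormatClass(SqoopInputFormat.class);
  request.setMapperClass(SqoopMapper.class);
  request.setMapOutputKeyClass(SqoopWritable.class);
  request.setMapOutputValueClass(NullWritable.class);
  request.setOutputFormatClass(SqoopNullOutputFormat.class);
  request.setOutputKeyClass(SqoopWritable.class);
  request.setOutputValueClass(NullWritable.class);
}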
submissionEngine
The submit method of MapreduceSubmissionEngine uses the information in MRJobRequest to configure the MapReduce Job. Once configured, it submits the job to the Hadoop cluster.
The source code is as follows:
public boolean submit(JobRequest mrJobRequest) {
  // We're supporting only map reduce jobs
  MRJobRequest request = (MRJobRequest) mrJobRequest;

  // Clone global configuration
  Configuration configuration = new Configuration(globalConfiguration);

  // Serialize driver context into job configuration
  for (Map.Entry<String, String> entry : request.getDriverContext()) {
    if (entry.getValue() == null) {
      LOG.warn("Ignoring null driver context value for key " + entry.getKey());
      continue;
    }
    configuration.set(entry.getKey(), entry.getValue());
  }

  // Serialize connector context as a sub namespace
  for (Map.Entry<String, String> entry : request.getConnectorContext(Direction.FROM)) {
    if (entry.getValue() == null) {
      LOG.warn("Ignoring null connector context value for key " + entry.getKey());
      continue;
    }
    configuration.set(
        MRJobConstants.PREFIX_CONNECTOR_FROM_CONTEXT + entry.getKey(),
        entry.getValue());
  }

  for (Map.Entry<String, String> entry : request.getConnectorContext(Direction.TO)) {
    if (entry.getValue() == null) {
      LOG.warn("Ignoring null connector context value for key " + entry.getKey());
      continue;
    }
    configuration.set(
        MRJobConstants.PREFIX_CONNECTOR_TO_CONTEXT + entry.getKey(),
        entry.getValue());
  }

  // Set up notification URL if it's available
  if (request.getNotificationUrl() != null) {
    configuration.set("job.end.notification.url", request.getNotificationUrl());
  }

  // Turn off speculative execution
  configuration.setBoolean("mapred.map.tasks.speculative.execution", false);
  configuration.setBoolean("mapred.reduce.tasks.speculative.execution", false);

  // Promote all required jars to the job
  configuration.set("tmpjars", StringUtils.join(request.getJars(), ","));

  try {
    Job job = new Job(configuration);

    // link configs
    MRConfigurationUtils.setConnectorLinkConfig(Direction.FROM, job, request.getConnectorLinkConfig(Direction.FROM));
    MRConfigurationUtils.setConnectorLinkConfig(Direction.TO, job, request.getConnectorLinkConfig(Direction.TO));

    // from and to configs
    MRConfigurationUtils.setConnectorJobConfig(Direction.FROM, job, request.getJobConfig(Direction.FROM));
    MRConfigurationUtils.setConnectorJobConfig(Direction.TO, job, request.getJobConfig(Direction.TO));

    MRConfigurationUtils.setDriverConfig(job, request.getDriverConfig());
    MRConfigurationUtils.setConnectorSchema(Direction.FROM, job, request.getJobSubmission().getFromSchema());
    MRConfigurationUtils.setConnectorSchema(Direction.TO, job, request.getJobSubmission().getToSchema());

    if (request.getJobName() != null) {
      job.setJobName("Sqoop: " + request.getJobName());
    } else {
      job.setJobName("Sqoop job with id: " + request.getJobId());
    }

    job.setInputFormatClass(request.getInputFormatClass());
    job.setMapperClass(request.getMapperClass());
    job.setMapOutputKeyClass(request.getMapOutputKeyClass());
    job.setMapOutputValueClass(request.getMapOutputValueClass());

    // Set number of reducers as number of configured loaders or suppress
    // reduce phase entirely if loaders are not set at all.
    if (request.getLoaders() != null) {
      job.setNumReduceTasks(request.getLoaders());
    } else {
      job.setNumReduceTasks(0);
    }

    job.setOutputFormatClass(request.getOutputFormatClass());
    job.setOutputKeyClass(request.getOutputKeyClass());
    job.setOutputValueClass(request.getOutputValueClass());

    // If we're in local mode then wait on completion. The local job runner does not
    // seem to expose an API to get a previously submitted job, which makes other
    // methods of the submission engine quite useless.
    // NOTE: The minicluster mode is not local. It runs similar to a real MR cluster,
    // only within the same JVM.
    if (isLocal()) {
      submitToLocalRunner(request, job);
    } else {
      submitToCluster(request, job);
    }
    LOG.debug("Executed new map-reduce job with id " + job.getJobID().toString());
  } catch (Exception e) {
    SubmissionError error = new SubmissionError();
    error.setErrorSummary(e.toString());
    StringWriter writer = new StringWriter();
    e.printStackTrace(new PrintWriter(writer));
    writer.flush();
    error.setErrorDetails(writer.toString());
    request.getJobSubmission().setError(error);
    LOG.error("Error in submitting job", e);
    return false;
  }
  return true;
}
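submitToCluster and submitToLocalRunner are not quoted in this article. Based on the comment in the code above, the cluster path submits asynchronously while the local path blocks until completion; the following is only a sketch under that assumption (the real methods presumably also record the external job id and status on request.getJobSubmission(), which is omitted here).

// Sketch only, based on the comment above; not the actual Sqoop code.
private void submitToCluster(MRJobRequest request, Job job) throws Exception {
  // Asynchronous path: hand the job to the cluster and return. Progress is tracked
  // later through the cluster's job client.
  job.submit();
}

private void submitToLocalRunner(MRJobRequest request, Job job) throws Exception {
  // Blocking path: the local job runner cannot be queried for a previously
  // submitted job, so wait here until it finishes.
  job.waitForCompletion(true);
}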
SqoopMapper
SqoopMapper is the Mapper class that carries out the map side of the MapReduce job. Its main task is to use an Extractor to extract data from the data source (while, on the output side, the data is written toward the destination). A rough sketch of its run method is shown below.
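SqoopMapper's source is not quoted here; the outline below is only a rough sketch of the shape of its run method. The configuration key and the ClassUtils helper are assumptions about the implementation, not verified signatures.

// Rough sketch of SqoopMapper.run() (approximate; key and helper names are assumptions).
public void run(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();

  // 1. Instantiate the connector's Extractor implementation configured for this job.
  Extractor extractor = (Extractor) ClassUtils.instantiate(
      conf.get(MRJobConstants.JOB_ETL_EXTRACTOR));

  // 2. Rebuild the FROM link config, FROM job config and the partition assigned to this
  //    map task (all serialized into the Configuration by submit() above).
  // 3. Build an ExtractorContext whose DataWriter forwards every record written by the
  //    connector into the MapReduce pipeline (context.write()).
  // 4. Delegate the actual reading to the connector:
  //      extractor.extract(extractorContext, linkConfig, jobConfig, partition);
  // 5. Update the rows-read counter from extractor.getRowsRead().
}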
Extractor
The source code describes Extractor as follows:
/**
 * This allows connector to extract data from a source system
 * based on each partition.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public abstract class Extractor<LinkConfiguration, FromJobConfiguration, SqoopPartition extends Partition> {

  /**
   * Extract data from source and pass them into the Sqoop.
   *
   * @param context Extractor context object
   * @param linkConfiguration link configuration object
   * @param jobConfiguration FROM job configuration object
   * @param partition Partition that this extracter should work on
   */
  public abstract void extract(ExtractorContext context,
                               LinkConfiguration linkConfiguration,
                               FromJobConfiguration jobConfiguration,
                               SqoopPartition partition);

  /**
   * Return the number of rows read by the last call to
   * {@linkplain Extractor#extract(org.apache.sqoop.job.etl.ExtractorContext, java.lang.Object, java.lang.Object, Partition) }
   * method. This method returns only the number of rows read in the last call,
   * and not a cumulative total of the number of rows read by this Extractor
   * since its creation. If no calls were made to the run method, this method's
   * behavior is undefined.
   *
   * @return the number of rows read by the last call to
   * {@linkplain Extractor#extract(org.apache.sqoop.job.etl.ExtractorContext, java.lang.Object, java.lang.Object, Partition) }
   */
  public abstract long getRowsRead();
}
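As a usage illustration of this contract, here is a hypothetical minimal connector-side Extractor that emits a fixed number of rows. DemoLinkConfig, DemoFromJobConfig and DemoPartition are made up for the example (the Partition base class is assumed to require readFields/write/toString overrides); the point is just the two responsibilities a connector has: push each record through context.getDataWriter() and report rowsRead.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.sqoop.job.etl.Extractor;
import org.apache.sqoop.job.etl.ExtractorContext;
import org.apache.sqoop.job.etl.Partition;

// Hypothetical config classes; a real connector supplies its own config classes.
class DemoLinkConfig { }
class DemoFromJobConfig { }

// Hypothetical partition describing the slice this extractor should read
// (overrides assumed to be required by the Partition base class).
class DemoPartition extends Partition {
  private long rowCount = 3;
  public long getRowCount() { return rowCount; }
  @Override public void readFields(DataInput in) throws IOException { rowCount = in.readLong(); }
  @Override public void write(DataOutput out) throws IOException { out.writeLong(rowCount); }
  @Override public String toString() { return "DemoPartition[" + rowCount + "]"; }
}

// Minimal demo extractor (not part of Sqoop): emits a fixed number of rows.
public class DemoExtractor extends Extractor<DemoLinkConfig, DemoFromJobConfig, DemoPartition> {

  private long rowsRead = 0;

  @Override
  public void extract(ExtractorContext context, DemoLinkConfig linkConfig,
      DemoFromJobConfig jobConfig, DemoPartition partition) {
    rowsRead = 0;
    for (long i = 0; i < partition.getRowCount(); i++) {
      // Hand one record, as an Object[] matching the job's schema, to Sqoop.
      context.getDataWriter().writeArrayRecord(new Object[]{ i, "row-" + i });
      rowsRead++;
    }
  }

  @Override
  public long getRowsRead() {
    // Per the contract above: only the count from the most recent extract() call.
    return rowsRead;
  }
}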
GenericJdbcExtractor
GenericJdbcExtractor extends Extractor and reads data from JDBC data sources.
Its source code is as follows:
@Override
public void extract(ExtractorContext context, LinkConfiguration linkConfig,
    FromJobConfiguration fromJobConfig, GenericJdbcPartition partition) {
  GenericJdbcExecutor executor = new GenericJdbcExecutor(linkConfig.linkConfig);

  String query = context.getString(GenericJdbcConnectorConstants.CONNECTOR_JDBC_FROM_DATA_SQL);
  String conditions = partition.getConditions();
  query = query.replace(GenericJdbcConnectorConstants.SQL_CONDITIONS_TOKEN, conditions);
  LOG.info("Using query: " + query);

  rowsRead = 0;
  ResultSet resultSet = executor.executeQuery(query);

  Schema schema = context.getSchema();
  Column[] schemaColumns = schema.getColumnsArray();
  try {
    ResultSetMetaData metaData = resultSet.getMetaData();
    int columnCount = metaData.getColumnCount();
    if (schemaColumns.length != columnCount) {
      throw new SqoopException(GenericJdbcConnectorError.GENERIC_JDBC_CONNECTOR_0021,
          schemaColumns.length + ":" + columnCount);
    }
    while (resultSet.next()) {
      Object[] array = new Object[columnCount];
      for (int i = 0; i < columnCount; i++) {
        if (resultSet.getObject(i + 1) == null) {
          array[i] = null;
          continue;
        }
        // check type of the column
        Column schemaColumn = schemaColumns[i];
        switch (schemaColumn.getType()) {
        case DATE:
          // convert the sql date to JODA time as prescribed by the Sqoop IDF spec
          array[i] = LocalDate.fromDateFields((java.sql.Date) resultSet.getObject(i + 1));
          break;
        case DATE_TIME:
          // convert the sql date time to JODA time as prescribed by the Sqoop IDF spec
          array[i] = LocalDateTime.fromDateFields((java.sql.Timestamp) resultSet.getObject(i + 1));
          break;
        case TIME:
          // convert the sql time to JODA time as prescribed by the Sqoop IDF spec
          array[i] = LocalTime.fromDateFields((java.sql.Time) resultSet.getObject(i + 1));
          break;
        default:
          // for anything else
          array[i] = resultSet.getObject(i + 1);
        }
      }
      context.getDataWriter().writeArrayRecord(array);
      rowsRead++;
    }
  } catch (SQLException e) {
    throw new SqoopException(
        GenericJdbcConnectorError.GENERIC_JDBC_CONNECTOR_0004, e);
  } finally {
    executor.close();
  }
}
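The DATE, DATE_TIME and TIME branches above convert JDBC temporal values into Joda-Time objects for the Sqoop intermediate data format. A small standalone example of the same conversion calls (fromDateFields accepts any java.util.Date subclass):

import java.sql.Date;
import java.sql.Time;
import java.sql.Timestamp;
import org.joda.time.LocalDate;
import org.joda.time.LocalDateTime;
import org.joda.time.LocalTime;

public class JodaConversionExample {
  public static void main(String[] args) {
    // The same conversions used in GenericJdbcExtractor, on hand-made JDBC values.
    Date sqlDate = Date.valueOf("2015-06-01");
    Timestamp sqlTimestamp = Timestamp.valueOf("2015-06-01 10:30:00");
    Time sqlTime = Time.valueOf("10:30:00");

    LocalDate date = LocalDate.fromDateFields(sqlDate);
    LocalDateTime dateTime = LocalDateTime.fromDateFields(sqlTimestamp);
    LocalTime time = LocalTime.fromDateFields(sqlTime);

    System.out.println(date + " / " + dateTime + " / " + time);
  }
}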