Kafka-connect-hdfs源码解析

写数据流程分析

启动task类方法，HdfsSinkTask.java类中start

@Override
public void start(Map props) {
  Set assignment = context.assignment();
  try {
    HdfsSinkConnectorConfig connectorConfig = new HdfsSinkConnectorConfig(props);
    boolean hiveIntegration = connectorConfig.getBoolean(HiveConfig.HIVE_INTEGRATION_CONFIG);
    if (hiveIntegration) {
      StorageSchemaCompatibility compatibility = StorageSchemaCompatibility.getCompatibility(
          connectorConfig.getString(StorageSinkConnectorConfig.SCHEMA_COMPATIBILITY_CONFIG)
      );
      if (compatibility == StorageSchemaCompatibility.NONE) {
        throw new ConfigException(
            "Hive Integration requires schema compatibility to be BACKWARD, FORWARD or FULL"
        );
      }
    }

    //check that timezone it setup correctly in case of scheduled rotation
    if (connectorConfig.getLong(HdfsSinkConnectorConfig.ROTATE_SCHEDULE_INTERVAL_MS_CONFIG) > 0) {
      String timeZoneString = connectorConfig.getString(PartitionerConfig.TIMEZONE_CONFIG);
      if (timeZoneString.equals("")) {
        throw new ConfigException(PartitionerConfig.TIMEZONE_CONFIG,
            timeZoneString, "Timezone cannot be empty when using scheduled file rotation."
        );
      }
      DateTimeZone.forID(timeZoneString);
    }

    int schemaCacheSize = connectorConfig.getInt(
        HdfsSinkConnectorConfig.SCHEMA_CACHE_SIZE_CONFIG
    );
    avroData = new AvroData(schemaCacheSize);
    hdfsWriter = new DataWriter(connectorConfig, context, avroData);   //初始化DataWriter
    recover(assignment);
    if (hiveIntegration) {
      syncWithHive();
    }
  } catch (ConfigException e) {
    throw new ConnectException("Couldn't start HdfsSinkConnector due to configuration error.", e);
  } catch (ConnectException e) {
    // Log at info level to help explain reason, but Connect logs the actual exception at ERROR
    log.info("Couldn't start HdfsSinkConnector:", e);
    log.info("Shutting down HdfsSinkConnector.");
    if (hdfsWriter != null) {
      try {
        try {
          log.debug("Closing data writer due to task start failure.");
          hdfsWriter.close();
        } finally {
          log.debug("Stopping data writer due to task start failure.");
          hdfsWriter.stop();
        }
      } catch (Throwable t) {
        log.debug("Error closing and stopping data writer: {}", t.getMessage(), t);
      }
    }
    // Always throw the original exception that prevent us from starting
    throw e;
  }

  log.info("The connector relies on offsets in HDFS filenames, but does commit these offsets to "
      + "Connect to enable monitoring progress of the HDFS connector. Upon startup, the HDFS "
      + "Connector restores offsets from filenames in HDFS. In the absence of files in HDFS, "
      + "the connector will attempt to find offsets for its consumer group in the "
      + "'__consumer_offsets' topic. If offsets are not found, the consumer will "
      + "rely on the reset policy specified in the 'consumer.auto.offset.reset' property to "
      + "start exporting data to HDFS.");
}

初始化DataWriter，DataWriter.java

@SuppressWarnings("unchecked")
public DataWriter(
    HdfsSinkConnectorConfig connectorConfig,
    SinkTaskContext context,
    AvroData avroData,
    Time time
) {
  this.time = time;
  try {
    String hadoopHome = connectorConfig.getString(HdfsSinkConnectorConfig.HADOOP_HOME_CONFIG);
    System.setProperty("hadoop.home.dir", hadoopHome);

    this.connectorConfig = connectorConfig;
    this.avroData = avroData;
    this.context = context;

    String hadoopConfDir = connectorConfig.getString(
        HdfsSinkConnectorConfig.HADOOP_CONF_DIR_CONFIG
    );
    log.info("Hadoop configuration directory {}", hadoopConfDir);
    Configuration conf = connectorConfig.getHadoopConfiguration();
    if (!hadoopConfDir.equals("")) {
      conf.addResource(new Path(hadoopConfDir + "/core-site.xml"));
      conf.addResource(new Path(hadoopConfDir + "/hdfs-site.xml"));
    }

    boolean secureHadoop = connectorConfig.getBoolean(
        HdfsSinkConnectorConfig.HDFS_AUTHENTICATION_KERBEROS_CONFIG
    );
    if (secureHadoop) {
      SecurityUtil.setAuthenticationMethod(
          UserGroupInformation.AuthenticationMethod.KERBEROS,
          conf
      );
      String principalConfig = connectorConfig.getString(
          HdfsSinkConnectorConfig.CONNECT_HDFS_PRINCIPAL_CONFIG
      );
      String keytab = connectorConfig.getString(
          HdfsSinkConnectorConfig.CONNECT_HDFS_KEYTAB_CONFIG
      );

      if (principalConfig == null || keytab == null) {
        throw new ConfigException(
            "Hadoop is using Kerberos for authentication, you need to provide both a connect "
                + "principal and the path to the keytab of the principal.");
      }

      conf.set("hadoop.security.authentication", "kerberos");
      conf.set("hadoop.security.authorization", "true");
      String hostname = InetAddress.getLocalHost().getCanonicalHostName();
      String namenodePrincipalConfig = connectorConfig.getString(
          HdfsSinkConnectorConfig.HDFS_NAMENODE_PRINCIPAL_CONFIG
      );

      String namenodePrincipal = SecurityUtil.getServerPrincipal(
          namenodePrincipalConfig,
          hostname
      );
      // namenode principal is needed for multi-node hadoop cluster
      if (conf.get("dfs.namenode.kerberos.principal") == null) {
        conf.set("dfs.namenode.kerberos.principal", namenodePrincipal);
      }
      log.info("Hadoop namenode principal: " + conf.get("dfs.namenode.kerberos.principal"));

      UserGroupInformation.setConfiguration(conf);
      // replace the _HOST specified in the principal config to the actual host
      String principal = SecurityUtil.getServerPrincipal(principalConfig, hostname);
      UserGroupInformation.loginUserFromKeytab(principal, keytab);
      final UserGroupInformation ugi = UserGroupInformation.getLoginUser();
      log.info("Login as: " + ugi.getUserName());

      final long renewPeriod = connectorConfig.getLong(
          HdfsSinkConnectorConfig.KERBEROS_TICKET_RENEW_PERIOD_MS_CONFIG
      );

      isRunning = true;
      ticketRenewThread = new Thread(new Runnable() {
        @Override
        public void run() {
          synchronized (DataWriter.this) {
            while (isRunning) {
              try {
                DataWriter.this.wait(renewPeriod);
                if (isRunning) {
                  ugi.reloginFromKeytab();
                }
              } catch (IOException e) {
                // We ignore this exception during relogin as each successful relogin gives
                // additional 24 hours of authentication in the default config. In normal
                // situations, the probability of failing relogin 24 times is low and if
                // that happens, the task will fail eventually.
                log.error("Error renewing the ticket", e);
              } catch (InterruptedException e) {
                // ignored
              }
            }
          }
        }
      });
      log.info("Starting the Kerberos ticket renew thread with period {}ms.", renewPeriod);
      ticketRenewThread.start();
    }

    url = connectorConfig.getUrl();
    topicsDir = connectorConfig.getString(StorageCommonConfig.TOPICS_DIR_CONFIG);

    @SuppressWarnings("unchecked")
    Class storageClass = (Class) connectorConfig
        .getClass(StorageCommonConfig.STORAGE_CLASS_CONFIG);
    storage = io.confluent.connect.storage.StorageFactory.createStorage(
        storageClass,
        HdfsSinkConnectorConfig.class,
        connectorConfig,
        url
    );

    createDir(topicsDir);
    createDir(topicsDir + HdfsSinkConnectorConstants.TEMPFILE_DIRECTORY);
    String logsDir = connectorConfig.getString(HdfsSinkConnectorConfig.LOGS_DIR_CONFIG);
    createDir(logsDir);

    // Try to instantiate as a new-style storage-common type class, then fall back to old-style
    // with no parameters
    try {
      Class formatClass =
          (Class)
              connectorConfig.getClass(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG);
      newFormat = formatClass.getConstructor(HdfsStorage.class).newInstance(storage);
      newWriterProvider = newFormat.getRecordWriterProvider();
      schemaFileReader = newFormat.getSchemaFileReader();
    } catch (NoSuchMethodException e) {
      Class formatClass =
          (Class) connectorConfig.getClass(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG);
      format = formatClass.getConstructor().newInstance();
      writerProvider = format.getRecordWriterProvider();
      final io.confluent.connect.hdfs.SchemaFileReader oldReader
          = format.getSchemaFileReader(avroData);
      schemaFileReader = new SchemaFileReader() {
        @Override
        public Schema getSchema(HdfsSinkConnectorConfig hdfsSinkConnectorConfig, Path path) {
          try {
            return oldReader.getSchema(hdfsSinkConnectorConfig.getHadoopConfiguration(), path);
          } catch (IOException e) {
            throw new ConnectException("Failed to get schema", e);
          }
        }

        @Override
        public Iterator

Kafka-connect-hdfs源码解析

写数据流程分析

你可能感兴趣的:(Kafka-connect-hdfs源码解析)