/**
 * The entry point through which the TaskScheduler submits Tasks.
 */
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// For each TaskSet, create a TaskSetManager (it monitors and manages the execution of the tasks in its TaskSet)
val manager = createTaskSetManager(taskSet, maxTaskFailures)
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
// Put it into the in-memory cache
stageTaskSets(taskSet.stageAttemptId) = manager
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
// When SparkContext was initialized and the TaskScheduler was created, the key step was creating the
// SchedulerBackend; the backend here is the one created back then. It is responsible for creating the
// AppClient and registering the Application with the Master.
backend.reviveOffers()
// Under the hood this calls CoarseGrainedSchedulerBackend.reviveOffers { driverEndpoint.send(ReviveOffers) }
}
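The starvation check above is just a recurring java.util.Timer task that keeps warning until the scheduler launches something and then cancels itself. The following self-contained sketch reproduces that pattern with illustrative names and timings (hasLaunchedTask, the one-second period); it is not Spark code, only the shape of it.

import java.util.{Timer, TimerTask}
import java.util.concurrent.atomic.AtomicBoolean

object StarvationTimerSketch {
  private val hasLaunchedTask = new AtomicBoolean(false)
  private val starvationTimer = new Timer("starvation-timer", true) // daemon timer

  def main(args: Array[String]): Unit = {
    val timeoutMs = 1000L // illustrative; Spark uses STARVATION_TIMEOUT_MS
    starvationTimer.scheduleAtFixedRate(new TimerTask {
      override def run(): Unit = {
        if (!hasLaunchedTask.get()) {
          println("Initial job has not accepted any resources yet")
        } else {
          // Once a task has launched, the warning is no longer needed: cancel this TimerTask
          this.cancel()
        }
      }
    }, timeoutMs, timeoutMs)

    Thread.sleep(2500)          // expect two warnings
    hasLaunchedTask.set(true)   // simulate the first task being launched
    Thread.sleep(1500)          // the next tick cancels itself silently
  }
}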
/**
* Schedules the tasks within a single TaskSet in the TaskSchedulerImpl. This class keeps track of
* each task, retries tasks if they fail (up to a limited number of times), and
* handles locality-aware scheduling for this TaskSet via delay scheduling. The main interfaces
* to it are resourceOffer, which asks the TaskSet whether it wants to run a task on one node,
* and statusUpdate, which tells it that one of its tasks changed state (e.g. finished).
*
* THREADING: This class is designed to only be called from code with a lock on the
* TaskScheduler (e.g. its event handlers). It should not be called from other threads.
*
* @param sched the TaskSchedulerImpl associated with the TaskSetManager
* @param taskSet the TaskSet to manage scheduling for
* @param maxTaskFailures if any particular task fails this number of times, the entire
* task set will be aborted
*/
private[spark] class TaskSetManager(
sched: TaskSchedulerImpl,
val taskSet: TaskSet,
val maxTaskFailures: Int,
blacklistTracker: Option[BlacklistTracker] = None,
clock: Clock = new SystemClock()) extends Schedulable with Logging
//CoarseGrainedSchedulerBackend.reviveOffers() -> makeOffers()
// Make fake resource offers on all executors
private def makeOffers() {
// Make sure no executor is killed while some task is launching on it
val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
// Filter out executors under killing
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
val workOffers = activeExecutors.map {
case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toIndexedSeq
// Call TaskSchedulerImpl.resourceOffers() to run the task-assignment algorithm and assign each Task to an executor.
// The argument is every executor available to this Application, wrapped as WorkerOffers; each WorkerOffer
// describes the number of CPU cores available on one Executor.
scheduler.resourceOffers(workOffers)
}
if (!taskDescs.isEmpty) {
// Once Tasks have been assigned to Executors, launchTasks() sends a LaunchTask message for each assigned Task
// to the corresponding Executor, which then starts and runs it.
launchTasks(taskDescs)
}
}
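As a rough illustration of what makeOffers() assembles, the sketch below builds offer objects only from executors that are still alive; ExecutorInfo, Offer, and the sample data are made up for the example.

// Illustrative stand-ins for Spark's internal executor bookkeeping and WorkerOffer
case class ExecutorInfo(host: String, freeCores: Int, alive: Boolean)
case class Offer(executorId: String, host: String, cores: Int)

object MakeOffersSketch {
  def makeOffers(executors: Map[String, ExecutorInfo]): IndexedSeq[Offer] =
    executors
      .filter { case (_, info) => info.alive }                            // drop executors being killed
      .map { case (id, info) => Offer(id, info.host, info.freeCores) }    // one offer per live executor
      .toIndexedSeq

  def main(args: Array[String]): Unit = {
    val executors = Map(
      "exec-1" -> ExecutorInfo("host-a", 4, alive = true),
      "exec-2" -> ExecutorInfo("host-b", 2, alive = false))
    println(makeOffers(executors)) // only exec-1 is offered
  }
}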
// Launch the Tasks on their Executors according to the assignment
// Launch tasks returned by a set of resource offers
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// Serialize the Task information destined for each Executor
val serializedTask = TaskDescription.encode(task)
if (serializedTask.limit() >= maxRpcMessageSize) {
scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.rpc.message.maxSize (%d bytes). Consider increasing " +
"spark.rpc.message.maxSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
taskSetMgr.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
} else {
// Find the corresponding Executor
val executorData = executorDataMap(task.executorId)
// Subtract the CPU cores the Task will use
executorData.freeCores -= scheduler.CPUS_PER_TASK
logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
s"${executorData.executorHost}.")
// Send a LaunchTask message to the Executor to start the Task there
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
}
}
}
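The size guard above is worth internalizing: a task description that serializes larger than spark.rpc.message.maxSize is never sent, and the whole task set is aborted with a hint to raise the limit or use broadcast variables. Here is a minimal sketch of that check, using plain Java serialization only to obtain a byte count; the limit value and payload are illustrative, not Spark's.

import java.io.{ByteArrayOutputStream, ObjectOutputStream}

object RpcSizeCheckSketch {
  // Illustrative limit; Spark reads the real one from spark.rpc.message.maxSize
  val maxRpcMessageSize: Int = 128 * 1024 * 1024

  /** Serialize with plain Java serialization just to measure the payload size. */
  def serializedSize(payload: Serializable): Int = {
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(payload)
    out.close()
    bytes.size()
  }

  def main(args: Array[String]): Unit = {
    val payload: Serializable = Array.fill(1024)(0.0)
    val size = serializedSize(payload)
    if (size >= maxRpcMessageSize) {
      // In Spark this is where the TaskSetManager would be aborted with a descriptive message
      println(s"Task payload of $size bytes exceeds the $maxRpcMessageSize byte limit; abort the task set")
    } else {
      println(s"Task payload of $size bytes fits; send LaunchTask to the executor")
    }
  }
}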
/**
* Called by cluster manager to offer resources on slaves. We respond by asking our active task
* sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
* that tasks are balanced across the cluster.
*/
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {
if (!hostToExecutors.contains(o.host)) {
hostToExecutors(o.host) = new HashSet[String]()
}
if (!executorIdToRunningTaskIds.contains(o.executorId)) {
hostToExecutors(o.host) += o.executorId
executorAdded(o.executorId, o.host)
executorIdToHost(o.executorId) = o.host
executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
// this here to avoid a separate thread and added synchronization overhead, and also because
// updating the blacklist is only relevant when task offers are being made.
blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())
val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
offers.filter { offer =>
!blacklistTracker.isNodeBlacklisted(offer.host) &&
!blacklistTracker.isExecutorBlacklisted(offer.executorId)
}
}.getOrElse(offers)
// Shuffle the available Executors to spread load as evenly as possible
val shuffledOffers = shuffleOffers(filteredOffers)
// Build a list of tasks to assign to each worker.
// tasks is effectively a two-dimensional structure; each inner ArrayBuffer is sized to the number of
// tasks that Executor can run, i.e. its available CPU cores divided by CPUS_PER_TASK
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
// Take the sorted TaskSets out of rootPool. The rootPool scheduling pool is created by initialize(),
// which runs once the TaskSchedulerImpl and SchedulerBackend have been created.
// Every submitted TaskSet is first placed into this pool; when the task-assignment algorithm runs,
// the queued TaskSets are taken back out of it in sorted order.
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Core of the assignment algorithm: a double for loop that walks each TaskSet from its best
// locality level down to its worst.
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
// PROCESS_LOCAL: process-local; the RDD partition and the Task are in the same Executor process, which is fastest
// NODE_LOCAL: node-local; the RDD partition and the Task are in different Executor processes but on the same Worker node
// NO_PREF: no locality preference
// RACK_LOCAL: rack-local; the RDD partition and the Task are merely on the same rack
// ANY: any locality level
for (taskSet <- sortedTaskSets) {
var launchedAnyTask = false
var launchedTaskAtCurrentMaxLocality = false
for (currentMaxLocality <- taskSet.myLocalityLevels) {
do {
// For the Tasks of the current TaskSet, try to launch them on Executors at the best (smallest) locality level first.
// If nothing can be launched, break out of this do-while loop and move on to the next, wider locality level,
// and so on, until the TaskSet's Tasks have all been launched on Executors at some locality level.
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(
taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
}
if (!launchedAnyTask) {
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
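The double loop above is the heart of delay scheduling: stay at the best locality level for as long as it keeps producing launches, and only then relax to the next level. The toy scheduler below mimics that shape with made-up PendingTask and Offer types and only three levels; it ignores the wait-time logic inside TaskSetManager.resourceOffer.

import scala.collection.mutable

object DelaySchedulingSketch {
  // Locality levels from best to worst, mirroring the ordering noted above
  sealed trait Level
  case object NodeLocal extends Level
  case object RackLocal extends Level
  case object AnyHost extends Level

  case class PendingTask(id: Int, preferredHost: String, preferredRack: String)
  case class Offer(host: String, rack: String, var freeCores: Int)

  // A task may launch on an offer at a given level only if the offer satisfies that level
  def allows(task: PendingTask, offer: Offer, level: Level): Boolean = level match {
    case NodeLocal => offer.host == task.preferredHost
    case RackLocal => offer.rack == task.preferredRack
    case AnyHost   => true
  }

  def schedule(pending: mutable.Buffer[PendingTask], offers: Seq[Offer]): Seq[(Int, String)] = {
    val launched = mutable.Buffer.empty[(Int, String)]
    for (level <- Seq(NodeLocal, RackLocal, AnyHost)) {
      var launchedAtLevel = true
      // Stay at the current (best) level for as long as it keeps producing launches,
      // then relax to the next level: the same shape as the do-while loop above.
      while (launchedAtLevel) {
        launchedAtLevel = false
        for (offer <- offers if offer.freeCores > 0) {
          pending.find(allows(_, offer, level)).foreach { task =>
            pending -= task
            offer.freeCores -= 1
            launched += ((task.id, offer.host))
            launchedAtLevel = true
          }
        }
      }
    }
    launched
  }

  def main(args: Array[String]): Unit = {
    val pending = mutable.Buffer(
      PendingTask(1, "host-a", "rack-1"),
      PendingTask(2, "host-b", "rack-1"),
      PendingTask(3, "host-z", "rack-9"))
    val offers = Seq(Offer("host-a", "rack-1", 1), Offer("host-c", "rack-1", 2))
    println(schedule(pending, offers))
    // task 1 launches node-local on host-a, task 2 rack-local on host-c, task 3 falls back to AnyHost
  }
}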
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: IndexedSeq[ArrayBuffer[TaskDescription]]): Boolean = {
var launchedTask = false
// nodes and executors that are blacklisted for the entire application have already been
// filtered out by this point
// Iterate over all Executors
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
// Only if this Executor still has at least as many free CPU cores as one Task needs (1 by default)
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
// Call TaskSetManager.resourceOffer to find the Tasks that can be launched on this Executor at this locality level.
// Roughly, TaskSetManager.resourceOffer checks how long the Executor has been waiting below this locality level;
// if that wait time is within the configured bound, a Task is allowed to launch on the Executor at this level.
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
// Add the Task to be launched for this Executor into the two-dimensional tasks structure
tasks(i) += task
// Record the assignment in the in-memory caches
val tid = task.taskId
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorIdToRunningTaskIds(execId).add(tid)
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
override def onStart() {
logInfo("Connecting to driver: " + driverUrl)
rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref =>
// This is a very fast action so we can use "ThreadUtils.sameThread"
driver = Some(ref)
// Once CoarseGrainedExecutorBackend has started, it sends a RegisterExecutor message straight to the Driver
ref.ask[Boolean](RegisterExecutor(executorId, self, hostname, cores, extractLogUrls))
}(ThreadUtils.sameThread).onComplete {
// This is a very fast action so we can use "ThreadUtils.sameThread"
case Success(msg) =>
// Always receive `true`. Just ignore it
case Failure(e) =>
exitExecutor(1, s"Cannot register with driver: $driverUrl", e, notifyDriver = false)
}(ThreadUtils.sameThread)
}
override def receive: PartialFunction[Any, Unit] = {
// After the Driver successfully registers the Executor, it replies with a RegisteredExecutor message, and
// this CoarseGrainedExecutorBackend then creates the Executor object (which implements most of the functionality)
case RegisteredExecutor =>
logInfo("Successfully registered with driver")
try {
executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)
} catch {
case NonFatal(e) =>
exitExecutor(1, "Unable to create executor due to " + e.getMessage, e)
}
// ... ...
}
[2] Task launch mechanism
// TaskSchedulerImpl sends a LaunchTask message to the Executor to start the Tasks assigned to it
case LaunchTask(data) =>
if (executor == null) {
exitExecutor(1, "Received LaunchTask command but executor was null")
} else {
// Deserialize the TaskDescription
val taskDesc = TaskDescription.decode(data.value)
logInfo("Got assigned task " + taskDesc.taskId)
// Launch a Task
executor.launchTask(this, taskDesc)
}
def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = {
// TaskRunner extends Runnable
// Create one TaskRunner for each Task
val tr = new TaskRunner(context, taskDescription)
// Put the TaskRunner into the in-memory cache, a ConcurrentHashMap
runningTasks.put(taskDescription.taskId, tr)
// threadPool comes from Executors.newCachedThreadPool; the TaskRunner is handed straight to the pool for (queued) execution
threadPool.execute(tr)
}
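The Executor side is a plain producer/consumer setup: each task becomes a Runnable, is cached in a ConcurrentHashMap, and is handed to a cached thread pool. Below is a self-contained sketch of that pattern; MiniTaskRunner and the map's key type are illustrative rather than Spark's.

import java.util.concurrent.{ConcurrentHashMap, Executors}

object TaskRunnerPoolSketch {
  private val threadPool = Executors.newCachedThreadPool()
  private val runningTasks = new ConcurrentHashMap[Long, Runnable]()

  class MiniTaskRunner(taskId: Long) extends Runnable {
    override def run(): Unit = {
      try {
        println(s"running task $taskId on ${Thread.currentThread().getName}")
      } finally {
        // Whatever happens, drop the task from the running-task cache when it finishes
        runningTasks.remove(taskId)
      }
    }
  }

  def launchTask(taskId: Long): Unit = {
    val runner = new MiniTaskRunner(taskId)
    runningTasks.put(taskId, runner)   // cache first, then hand over to the pool
    threadPool.execute(runner)
  }

  def main(args: Array[String]): Unit = {
    (1L to 4L).foreach(launchTask)
    threadPool.shutdown()
  }
}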
class TaskRunner(
execBackend: ExecutorBackend,
private val taskDescription: TaskDescription)
extends Runnable {
//... ...
override def run(): Unit = {
threadId = Thread.currentThread.getId
Thread.currentThread.setName(threadName)
val threadMXBean = ManagementFactory.getThreadMXBean
val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId)
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
Thread.currentThread.setContextClassLoader(replClassLoader)
val ser = env.closureSerializer.newInstance()
logInfo(s"Running $taskName (TID $taskId)")
execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER)
var taskStart: Long = 0
var taskStartCpu: Long = 0
startGCTime = computeTotalGcTime()
try {
// Must be set before updateDependencies() is called, in case fetching dependencies
// requires access to properties contained within (e.g. for access control).
// Deserialize the serialized Task data
Executor.taskDeserializationProps.set(taskDescription.properties)
// Fetch the required resources (files, JARs, etc.) over the network
updateDependencies(taskDescription.addedFiles, taskDescription.addedJars)
// Perform the actual deserialization, turning the whole Task payload back into a Task object
task = ser.deserialize[Task[Any]](
taskDescription.serializedTask, Thread.currentThread.getContextClassLoader)
task.localProperties = taskDescription.properties
task.setTaskMemoryManager(taskMemoryManager)
// If this task has been killed before we deserialized it, let's quit now. Otherwise,
// continue executing the task.
val killReason = reasonIfKilled
if (killReason.isDefined) {
// Throw an exception rather than returning, because returning within a try{} block
// causes a NonLocalReturnControl exception to be thrown. The NonLocalReturnControl
// exception will be caught by the catch block, leading to an incorrect ExceptionFailure
// for the task.
throw new TaskKilledException(killReason.get)
}
// The purpose of updating the epoch here is to invalidate executor map output status cache
// in case FetchFailures have occurred. In local mode `env.mapOutputTracker` will be
// MapOutputTrackerMaster and its cache invalidation is not based on epoch numbers so
// we don't need to make any special calls here.
if (!isLocal) {
logDebug("Task " + taskId + "'s epoch is " + task.epoch)
env.mapOutputTracker.asInstanceOf[MapOutputTrackerWorker].updateEpoch(task.epoch)
}
// Run the actual task and measure its runtime.
taskStart = System.currentTimeMillis()
taskStartCpu = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
var threwException = true
// KEY!!! The crucial part: invoke Task.run.
// For a ShuffleMapTask the value is in fact a MapStatus, which wraps the location of the ShuffleMapTask's
// computed output data. If the next stage is again a ShuffleMapTask, it will contact MapOutputTracker for the
// previous ShuffleMapTask's output location and pull the data over the network. A ResultTask works the same way.
val value = try {
val res = task.run(
taskAttemptId = taskId,
attemptNumber = taskDescription.attemptNumber,
metricsSystem = env.metricsSystem)
threwException = false
res
} finally {
val releasedLocks = env.blockManager.releaseAllLocksForTask(taskId)
val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
if (freedMemory > 0 && !threwException) {
val errMsg = s"Managed memory leak detected; size = $freedMemory bytes, TID = $taskId"
if (conf.getBoolean("spark.unsafe.exceptionOnMemoryLeak", false)) {
throw new SparkException(errMsg)
} else {
logWarning(errMsg)
}
}
if (releasedLocks.nonEmpty && !threwException) {
val errMsg =
s"${releasedLocks.size} block locks were not released by TID = $taskId:\n" +
releasedLocks.mkString("[", ", ", "]")
if (conf.getBoolean("spark.storage.exceptionOnPinLeak", false)) {
throw new SparkException(errMsg)
} else {
logInfo(errMsg)
}
}
}
task.context.fetchFailed.foreach { fetchFailure =>
// uh-oh. it appears the user code has caught the fetch-failure without throwing any
// other exceptions. It's *possible* this is what the user meant to do (though highly
// unlikely). So we will log an error and keep going.
logError(s"TID ${taskId} completed successfully though internally it encountered " +
s"unrecoverable fetch failures! Most likely this means user code is incorrectly " +
s"swallowing Spark's internal ${classOf[FetchFailedException]}", fetchFailure)
}
val taskFinish = System.currentTimeMillis()
val taskFinishCpu = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
// If the task has been killed, let's fail it.
task.context.killTaskIfInterrupted()
// Serialize and wrap the result (the MapStatus) so it can be sent to the Driver over the network
val resultSer = env.serializer.newInstance()
val beforeSerialization = System.currentTimeMillis()
val valueBytes = resultSer.serialize(value)
val afterSerialization = System.currentTimeMillis()
// Deserialization happens in two parts: first, we deserialize a Task object, which
// includes the Partition. Second, Task.run() deserializes the RDD and function to be run.
// Compute the Task-related metrics (shown in the Spark UI on port 4040): ExecutorDeserializeTime / ExecutorDeserializeCpuTime / ExecutorRunTime / ExecutorCpuTime / JvmGCTime / ResultSerializationTime
task.metrics.setExecutorDeserializeTime(
(taskStart - deserializeStartTime) + task.executorDeserializeTime)
task.metrics.setExecutorDeserializeCpuTime(
(taskStartCpu - deserializeStartCpuTime) + task.executorDeserializeCpuTime)
// We need to subtract Task.run()'s deserialization time to avoid double-counting
task.metrics.setExecutorRunTime((taskFinish - taskStart) - task.executorDeserializeTime)
task.metrics.setExecutorCpuTime(
(taskFinishCpu - taskStartCpu) - task.executorDeserializeCpuTime)
task.metrics.setJvmGCTime(computeTotalGcTime() - startGCTime)
task.metrics.setResultSerializationTime(afterSerialization - beforeSerialization)
// Expose task metrics using the Dropwizard metrics system.
// Update task metrics counters
executorSource.METRIC_CPU_TIME.inc(task.metrics.executorCpuTime)
executorSource.METRIC_RUN_TIME.inc(task.metrics.executorRunTime)
executorSource.METRIC_JVM_GC_TIME.inc(task.metrics.jvmGCTime)
executorSource.METRIC_DESERIALIZE_TIME.inc(task.metrics.executorDeserializeTime)
executorSource.METRIC_DESERIALIZE_CPU_TIME.inc(task.metrics.executorDeserializeCpuTime)
executorSource.METRIC_RESULT_SERIALIZE_TIME.inc(task.metrics.resultSerializationTime)
executorSource.METRIC_SHUFFLE_FETCH_WAIT_TIME
.inc(task.metrics.shuffleReadMetrics.fetchWaitTime)
executorSource.METRIC_SHUFFLE_WRITE_TIME.inc(task.metrics.shuffleWriteMetrics.writeTime)
executorSource.METRIC_SHUFFLE_TOTAL_BYTES_READ
.inc(task.metrics.shuffleReadMetrics.totalBytesRead)
executorSource.METRIC_SHUFFLE_REMOTE_BYTES_READ
.inc(task.metrics.shuffleReadMetrics.remoteBytesRead)
executorSource.METRIC_SHUFFLE_REMOTE_BYTES_READ_TO_DISK
.inc(task.metrics.shuffleReadMetrics.remoteBytesReadToDisk)
executorSource.METRIC_SHUFFLE_LOCAL_BYTES_READ
.inc(task.metrics.shuffleReadMetrics.localBytesRead)
executorSource.METRIC_SHUFFLE_RECORDS_READ
.inc(task.metrics.shuffleReadMetrics.recordsRead)
executorSource.METRIC_SHUFFLE_REMOTE_BLOCKS_FETCHED
.inc(task.metrics.shuffleReadMetrics.remoteBlocksFetched)
executorSource.METRIC_SHUFFLE_LOCAL_BLOCKS_FETCHED
.inc(task.metrics.shuffleReadMetrics.localBlocksFetched)
executorSource.METRIC_SHUFFLE_BYTES_WRITTEN
.inc(task.metrics.shuffleWriteMetrics.bytesWritten)
executorSource.METRIC_SHUFFLE_RECORDS_WRITTEN
.inc(task.metrics.shuffleWriteMetrics.recordsWritten)
executorSource.METRIC_INPUT_BYTES_READ
.inc(task.metrics.inputMetrics.bytesRead)
executorSource.METRIC_INPUT_RECORDS_READ
.inc(task.metrics.inputMetrics.recordsRead)
executorSource.METRIC_OUTPUT_BYTES_WRITTEN
.inc(task.metrics.outputMetrics.bytesWritten)
executorSource.METRIC_OUTPUT_RECORDS_WRITTEN
.inc(task.metrics.outputMetrics.recordsWritten)
executorSource.METRIC_RESULT_SIZE.inc(task.metrics.resultSize)
executorSource.METRIC_DISK_BYTES_SPILLED.inc(task.metrics.diskBytesSpilled)
executorSource.METRIC_MEMORY_BYTES_SPILLED.inc(task.metrics.memoryBytesSpilled)
// Note: accumulator updates must be collected after TaskMetrics is updated
val accumUpdates = task.collectAccumulatorUpdates()
// TODO: do not serialize value twice
val directResult = new DirectTaskResult(valueBytes, accumUpdates)
val serializedDirectResult = ser.serialize(directResult)
val resultSize = serializedDirectResult.limit()
// directSend = sending directly back to the driver
val serializedResult: ByteBuffer = {
if (maxResultSize > 0 && resultSize > maxResultSize) {
logWarning(s"Finished $taskName (TID $taskId). Result is larger than maxResultSize " +
s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " +
s"dropping it.")
ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize))
} else if (resultSize > maxDirectResultSize) {
val blockId = TaskResultBlockId(taskId)
env.blockManager.putBytes(
blockId,
new ChunkedByteBuffer(serializedDirectResult.duplicate()),
StorageLevel.MEMORY_AND_DISK_SER)
logInfo(
s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")
ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))
} else {
logInfo(s"Finished $taskName (TID $taskId). $resultSize bytes result sent to driver")
serializedDirectResult
}
}
setTaskFinishedAndClearInterruptStatus()
// KEY: call statusUpdate on the Executor's CoarseGrainedExecutorBackend to send the MapStatus back
execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)
} catch {
case t: Throwable if hasFetchFailure && !Utils.isFatalError(t) =>
val reason = task.context.fetchFailed.get.toTaskFailedReason
if (!t.isInstanceOf[FetchFailedException]) {
// there was a fetch failure in the task, but some user code wrapped that exception
// and threw something else. Regardless, we treat it as a fetch failure.
val fetchFailedCls = classOf[FetchFailedException].getName
logWarning(s"TID ${taskId} encountered a ${fetchFailedCls} and " +
s"failed, but the ${fetchFailedCls} was hidden by another " +
s"exception. Spark is handling this like a fetch failure and ignoring the " +
s"other exception: $t")
}
setTaskFinishedAndClearInterruptStatus()
execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))
case t: TaskKilledException =>
logInfo(s"Executor killed $taskName (TID $taskId), reason: ${t.reason}")
setTaskFinishedAndClearInterruptStatus()
execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled(t.reason)))
case _: InterruptedException | NonFatal(_) if
task != null && task.reasonIfKilled.isDefined =>
val killReason = task.reasonIfKilled.getOrElse("unknown reason")
logInfo(s"Executor interrupted and killed $taskName (TID $taskId), reason: $killReason")
setTaskFinishedAndClearInterruptStatus()
execBackend.statusUpdate(
taskId, TaskState.KILLED, ser.serialize(TaskKilled(killReason)))
case CausedBy(cDE: CommitDeniedException) =>
val reason = cDE.toTaskCommitDeniedReason
setTaskFinishedAndClearInterruptStatus()
execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(reason))
case t: Throwable =>
// Attempt to exit cleanly by informing the driver of our failure.
// If anything goes wrong (or this was a fatal exception), we will delegate to
// the default uncaught exception handler, which will terminate the Executor.
logError(s"Exception in $taskName (TID $taskId)", t)
// SPARK-20904: Do not report failure to driver if it happened during shut down. Because
// libraries may set up shutdown hooks that race with running tasks during shutdown,
// spurious failures may occur and can result in improper accounting in the driver (e.g.
// the task failure would not be ignored if the shutdown happened because of preemption,
// instead of an app issue).
if (!ShutdownHookManager.inShutdown()) {
// Collect latest accumulator values to report back to the driver
val accums: Seq[AccumulatorV2[_, _]] =
if (task != null) {
task.metrics.setExecutorRunTime(System.currentTimeMillis() - taskStart)
task.metrics.setJvmGCTime(computeTotalGcTime() - startGCTime)
task.collectAccumulatorUpdates(taskFailed = true)
} else {
Seq.empty
}
val accUpdates = accums.map(acc => acc.toInfo(Some(acc.value), None))
val serializedTaskEndReason = {
try {
ser.serialize(new ExceptionFailure(t, accUpdates).withAccums(accums))
} catch {
case _: NotSerializableException =>
// t is not serializable so just send the stacktrace
ser.serialize(new ExceptionFailure(t, accUpdates, false).withAccums(accums))
}
}
setTaskFinishedAndClearInterruptStatus()
execBackend.statusUpdate(taskId, TaskState.FAILED, serializedTaskEndReason)
} else {
logInfo("Not reporting error to driver during JVM shutdown.")
}
// Don't forcibly exit unless the exception was inherently fatal, to avoid
// stopping other tasks unnecessarily.
if (!t.isInstanceOf[SparkOutOfMemoryError] && Utils.isFatalError(t)) {
uncaughtExceptionHandler.uncaughtException(Thread.currentThread(), t)
}
} finally {
runningTasks.remove(taskId)
}
}
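The result-sending branch above makes a three-way decision based on the serialized result size. The sketch below isolates just that decision; the thresholds are illustrative, whereas Spark derives them from spark.driver.maxResultSize and the RPC message size.

object TaskResultRoutingSketch {
  sealed trait ResultRoute
  case object DroppedTooLarge extends ResultRoute   // only the size is reported back
  case object ViaBlockManager extends ResultRoute   // an indirect result pointing at a block
  case object DirectToDriver extends ResultRoute    // the serialized bytes travel in the status update

  def route(resultSize: Long, maxResultSize: Long, maxDirectResultSize: Long): ResultRoute =
    if (maxResultSize > 0 && resultSize > maxResultSize) DroppedTooLarge
    else if (resultSize > maxDirectResultSize) ViaBlockManager
    else DirectToDriver

  def main(args: Array[String]): Unit = {
    val maxResult = 1024L * 1024 * 1024   // 1 GiB, illustrative
    val maxDirect = 1024L * 1024          // 1 MiB, illustrative
    println(route(512L, maxResult, maxDirect))                        // DirectToDriver
    println(route(10L * 1024 * 1024, maxResult, maxDirect))           // ViaBlockManager
    println(route(2L * 1024 * 1024 * 1024, maxResult, maxDirect))     // DroppedTooLarge
  }
}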
/**
* Download any missing dependencies if we receive a new set of files and JARs from the
* SparkContext. Also adds any new JARs we fetched to the class loader.
*/
private def updateDependencies(newFiles: Map[String, Long], newJars: Map[String, Long]) {
// Get the Hadoop configuration
lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
// A synchronized block guards shared state such as currentFiles for thread safety:
// Tasks actually run concurrently as Java threads inside a single CoarseGrainedExecutorBackend process
synchronized {
// Fetch missing dependencies
// Iterate over the files that need to be fetched
for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
logInfo("Fetching " + name + " with timestamp " + timestamp)
// Fetch file with useCache mode, close cache for local mode.
// Pull the file from the remote side over the network
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf,
env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
currentFiles(name) = timestamp
}
// Iterate over the JARs that need to be fetched
for ((name, timestamp) <- newJars) {
val localName = new URI(name).getPath.split("/").last
val currentTimeStamp = currentJars.get(name)
.orElse(currentJars.get(localName))
.getOrElse(-1L)
if (currentTimeStamp < timestamp) {
logInfo("Fetching " + name + " with timestamp " + timestamp)
// Fetch file with useCache mode, close cache for local mode.
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf,
env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
currentJars(name) = timestamp
// Add it to our class loader
val url = new File(SparkFiles.getRootDirectory(), localName).toURI.toURL
if (!urlClassLoader.getURLs().contains(url)) {
logInfo("Adding " + url + " to class loader")
urlClassLoader.addURL(url)
}
}
}
}
}
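updateDependencies() boils down to two ideas: fetch a dependency only if the driver's timestamp is newer than what is cached locally, and make newly fetched jars visible to the class loader. The sketch below models both with made-up names; AppendableClassLoader mirrors the spirit of Spark's MutableURLClassLoader but is not it, and nothing is actually downloaded.

import java.io.File
import java.net.{URL, URLClassLoader}
import scala.collection.mutable

// A URLClassLoader subclass that makes addURL public, similar in spirit to Spark's MutableURLClassLoader
class AppendableClassLoader(parent: ClassLoader)
  extends URLClassLoader(Array.empty[URL], parent) {
  override def addURL(url: URL): Unit = super.addURL(url)
}

object DependencyRefreshSketch {
  // name -> timestamp of the version we already have locally
  private val currentJars = mutable.Map.empty[String, Long]
  private val loader = new AppendableClassLoader(getClass.getClassLoader)

  /** Returns the jars that are missing locally or older than the driver's version. */
  def jarsToFetch(newJars: Map[String, Long]): Map[String, Long] =
    newJars.filter { case (name, ts) => currentJars.getOrElse(name, -1L) < ts }

  /** Pretend we fetched the jar, then record its timestamp and expose it to the class loader. */
  def markFetched(name: String, timestamp: Long, localFile: File): Unit = {
    currentJars(name) = timestamp
    val url = localFile.toURI.toURL
    if (!loader.getURLs.contains(url)) loader.addURL(url)
  }

  def main(args: Array[String]): Unit = {
    currentJars("lib-a.jar") = 100L
    val fromDriver = Map("lib-a.jar" -> 100L, "lib-b.jar" -> 200L)
    println(jarsToFetch(fromDriver)) // only lib-b.jar needs fetching
  }
}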
//... ...
}
/**
* Called by [[org.apache.spark.executor.Executor]] to run this task.
*
* @param taskAttemptId an identifier for this task attempt that is unique within a SparkContext.
* @param attemptNumber how many times this task has been attempted (0 for the first attempt)
* @return the result of the task along with updates of Accumulators.
*/
final def run(
taskAttemptId: Long,
attemptNumber: Int,
metricsSystem: MetricsSystem): T = {
SparkEnv.get.blockManager.registerTask(taskAttemptId)
// Create the execution context, a TaskContext
context = new TaskContextImpl(
stageId,
stageAttemptId, // stageAttemptId and stageAttemptNumber are semantically equal
partitionId,
taskAttemptId,
attemptNumber,
taskMemoryManager,
localProperties,
metricsSystem,
metrics)
TaskContext.setTaskContext(context)
taskThread = Thread.currentThread()
if (_reasonIfKilled != null) {
kill(interruptThread = false, _reasonIfKilled)
}
new CallerContext(
"TASK",
SparkEnv.get.conf.get(APP_CALLER_CONTEXT),
appId,
appAttemptId,
jobId,
Option(stageId),
Option(stageAttemptId),
Option(taskAttemptId),
Option(attemptNumber)).setCurrentContext()
try {
// Key!!! Call the abstract method runTask, implemented by the subclasses ShuffleMapTask / ResultTask
runTask(context)
} catch {
case e: Throwable =>
// Catch all errors; run task failure callbacks, and rethrow the exception.
try {
context.markTaskFailed(e)
} catch {
case t: Throwable =>
e.addSuppressed(t)
}
context.markTaskCompleted(Some(e))
throw e
} finally {
try {
// Call the task completion callbacks. If "markTaskCompleted" is called twice, the second
// one is a no-op.
context.markTaskCompleted(None)
} finally {
try {
Utils.tryLogNonFatalError {
// Release memory used by this thread for unrolling blocks
SparkEnv.get.blockManager.memoryStore.releaseUnrollMemoryForThisTask(MemoryMode.ON_HEAP)
SparkEnv.get.blockManager.memoryStore.releaseUnrollMemoryForThisTask(
MemoryMode.OFF_HEAP)
// Notify any tasks waiting for execution memory to be freed to wake up and try to
// acquire memory again. This makes impossible the scenario where a task sleeps forever
// because there are no other tasks left to notify it. Since this is safe to do but may
// not be strictly necessary, we should revisit whether we can remove this in the
// future.
val memoryManager = SparkEnv.get.memoryManager
memoryManager.synchronized { memoryManager.notifyAll() }
}
} finally {
// Though we unset the ThreadLocal here, the context member variable itself is still
// queried directly in the TaskRunner to check for FetchFailedExceptions.
TaskContext.unset()
}
}
}
}
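The try/catch/finally structure of Task.run() is a general callback discipline: failure callbacks run first and must not mask the original error (hence addSuppressed), completion callbacks always run, and calling them twice is harmless. Here is a minimal stand-alone model of that discipline, with a made-up MiniTaskContext standing in for Spark's TaskContext.

import scala.collection.mutable

object TaskCallbackSketch {
  class MiniTaskContext {
    private val onComplete = mutable.Buffer.empty[() => Unit]
    private var completed = false

    def addTaskCompletionListener(f: () => Unit): Unit = onComplete += f

    // Calling this a second time is a no-op, like the comment above describes
    def markTaskCompleted(): Unit = if (!completed) { completed = true; onComplete.foreach(_.apply()) }
  }

  def runTask(ctx: MiniTaskContext)(body: => Unit): Unit = {
    try {
      body
    } catch {
      case e: Throwable =>
        try {
          println(s"failure callback for: ${e.getMessage}")   // stands in for markTaskFailed
        } catch {
          case t: Throwable => e.addSuppressed(t)             // a failing callback must not mask the task's error
        }
        throw e
    } finally {
      ctx.markTaskCompleted()   // completion callbacks run whether the task succeeded or failed
    }
  }

  def main(args: Array[String]): Unit = {
    val ctx = new MiniTaskContext
    ctx.addTaskCompletionListener(() => println("releasing per-task resources"))
    try runTask(ctx) { throw new RuntimeException("boom") } catch { case _: RuntimeException => () }
  }
}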
/**
* A ShuffleMapTask divides the elements of an RDD into multiple buckets (based on a partitioner
* specified in the ShuffleDependency. Default is HashPartitioner).
*
* See [[org.apache.spark.scheduler.Task]] for more information.
*
* @param stageId id of the stage this task belongs to
* @param stageAttemptId attempt id of the stage this task belongs to
* @param taskBinary broadcast version of the RDD and the ShuffleDependency. Once deserialized,
* the type should be (RDD[_], ShuffleDependency[_, _, _]).
* @param partition partition of the RDD this task is associated with
* @param locs preferred task execution locations for locality scheduling
* @param localProperties copy of thread-local properties set by the user on the driver side.
* @param serializedTaskMetrics a `TaskMetrics` that is created and serialized on the driver side
* and sent to executor side.
*
* The parameters below are optional:
* @param jobId id of the job this task belongs to
* @param appId id of the app this task belongs to
* @param appAttemptId attempt id of the app this task belongs to
*/
private[spark] class ShuffleMapTask(
stageId: Int,
stageAttemptId: Int,
taskBinary: Broadcast[Array[Byte]],
partition: Partition,
@transient private var locs: Seq[TaskLocation],
localProperties: Properties,
serializedTaskMetrics: Array[Byte],
jobId: Option[Int] = None,
appId: Option[String] = None,
appAttemptId: Option[String] = None)
extends Task[MapStatus](stageId, stageAttemptId, partition.index, localProperties,
serializedTaskMetrics, jobId, appId, appAttemptId)
with Logging {
//......
override def runTask(context: TaskContext): MapStatus = {
// Deserialize the RDD using the broadcast variable.
// Many Tasks run in parallel/concurrently across multiple Executors, possibly on different nodes,
// but every Task of a Stage works against the same RDD. Each Task therefore reads the RDD
// (and the part it should process) directly from the broadcast variable.
val threadMXBean = ManagementFactory.getThreadMXBean
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
val ser = SparkEnv.get.closureSerializer.newInstance()
val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
_executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
} else 0L
var writer: ShuffleWriter[Any, Any] = null
try {
// Get a ShuffleWriter from the ShuffleManager
val manager = SparkEnv.get.shuffleManager
writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
// Key: pass in the partition this Task should process. The core logic lives in RDD.iterator(),
// which runs the user-defined operators against that RDD partition.
// The resulting records go through the ShuffleWriter and, after being partitioned
// (by a HashPartitioner, for example), are written into the corresponding bucket.
writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
// Finally, return a MapStatus, which wraps the information about where the ShuffleMapTask's
// computed data is stored in the BlockManager.
writer.stop(success = true).get
} catch {
case e: Exception =>
try {
if (writer != null) {
writer.stop(success = false)
}
} catch {
case e: Exception =>
log.debug("Could not stop writer", e)
}
throw e
}
}
//......
}
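Conceptually, the write path of a ShuffleMapTask sends each output record to the bucket chosen by the partitioner, by default the key's non-negative hash modulo the number of reduce partitions. The toy code below shows only that bucketing rule on an in-memory iterator; it leaves out the ShuffleWriter, spilling, and file layout entirely.

object HashPartitionSketch {
  /** The same modular rule a HashPartitioner applies: non-negative key hash modulo the partition count. */
  def partitionFor(key: Any, numPartitions: Int): Int = {
    val mod = key.hashCode % numPartitions
    if (mod < 0) mod + numPartitions else mod
  }

  /** Group a map task's output records into the reduce-side buckets they would be written to. */
  def bucketize[K, V](records: Iterator[(K, V)], numPartitions: Int): Map[Int, Seq[(K, V)]] =
    records.toSeq.groupBy { case (k, _) => partitionFor(k, numPartitions) }

  def main(args: Array[String]): Unit = {
    val records = Iterator("a" -> 1, "b" -> 2, "c" -> 3, "a" -> 4)
    bucketize(records, numPartitions = 2).toSeq.sortBy(_._1).foreach { case (bucket, rows) =>
      println(s"bucket $bucket -> $rows")
    }
  }
}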
// ResultTask
override def runTask(context: TaskContext): U = {
// Deserialize the RDD and the func using the broadcast variables.
val threadMXBean = ManagementFactory.getThreadMXBean
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
// Basic deserialization
val ser = SparkEnv.get.closureSerializer.newInstance()
val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
_executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
} else 0L
// Run the user-defined operator via the RDD's iterator
func(context, rdd.iterator(partition, context))
}
//RDD
/**
* Internal method to this RDD; will read from cache if applicable, or otherwise compute it.
* This should ''not'' be called by users directly, but is available for implementors of custom
* subclasses of RDD.
*/
final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
if (storageLevel != StorageLevel.NONE) {
getOrCompute(split, context)
} else {
computeOrReadCheckpoint(split, context)
}
}
/**
* Compute an RDD partition or read it from a checkpoint if the RDD is checkpointing.
*/
private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] =
{
if (isCheckpointedAndMaterialized) {
firstParent[T].iterator(split, context)
} else {
// Again an abstract method -> MapPartitionsRDD
compute(split, context)
}
}
//MapPartitionsRDD
/**
 * f can be thought of as the user-defined operator function, although Spark's internal wrapping adds further logic.
 * By the time the call reaches here, it simply runs the computation against an RDD partition and
 * returns the partition iterator of the new RDD.
 */
override def compute(split: Partition, context: TaskContext): Iterator[U] =
f(context, split.index, firstParent[T].iterator(split, context))
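MapPartitionsRDD.compute is just lazy function composition over the parent's iterator: nothing runs until the final iterator is consumed. Here is a stripped-down model of that chaining; MiniRDD and MiniMappedRDD are invented for the example and are not Spark classes.

object IteratorChainSketch {
  // A stripped-down MapPartitionsRDD-style compute: apply the user function to the parent's iterator lazily
  class MiniRDD[T](val data: Seq[Seq[T]]) {
    def iterator(split: Int): Iterator[T] = data(split).iterator
    def mapPartitionsWithIndex[U](f: (Int, Iterator[T]) => Iterator[U]): MiniMappedRDD[T, U] =
      new MiniMappedRDD(this, f)
  }

  class MiniMappedRDD[T, U](parent: MiniRDD[T], f: (Int, Iterator[T]) => Iterator[U]) {
    // Nothing is computed until the returned iterator is consumed
    def compute(split: Int): Iterator[U] = f(split, parent.iterator(split))
  }

  def main(args: Array[String]): Unit = {
    val base = new MiniRDD(Seq(Seq(1, 2, 3), Seq(10, 20)))
    val doubled = base.mapPartitionsWithIndex((idx, it) => it.map(v => s"partition $idx: ${v * 2}"))
    doubled.compute(1).foreach(println)   // only partition 1 is evaluated
  }
}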
// CoarseGrainedExecutorBackend
override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) {
val msg = StatusUpdate(executorId, taskId, state, data)
driver match {
case Some(driverRef) => driverRef.send(msg)
case None => logWarning(s"Drop $msg because has not yet connected to driver")
}
}
// CoarseGrainedSchedulerBackend
case StatusUpdate(executorId, taskId, state, data) =>
scheduler.statusUpdate(taskId, state, data.value)
if (TaskState.isFinished(state)) {
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.freeCores += scheduler.CPUS_PER_TASK
makeOffers(executorId)
case None =>
// Ignoring the update since we don't know about the executor.
logWarning(s"Ignored task status update ($taskId state $state) " +
s"from unknown executor with ID $executorId")
}
}
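The status-update round trip can be summarized as: the executor backend forwards a StatusUpdate message to the driver, and for any terminal state the driver returns the task's cores to that executor and re-offers resources. The sketch below replays that bookkeeping with invented types and values; it is not the Spark message protocol itself.

import scala.collection.mutable

object StatusUpdateSketch {
  sealed trait TaskState
  case object Running extends TaskState
  case object Finished extends TaskState
  case object Failed extends TaskState

  case class StatusUpdate(executorId: String, taskId: Long, state: TaskState)

  // Driver-side bookkeeping: free cores per executor, with illustrative values
  private val freeCores = mutable.Map("exec-1" -> 0, "exec-2" -> 1)
  private val cpusPerTask = 1

  def handle(msg: StatusUpdate): Unit = msg match {
    case StatusUpdate(executorId, taskId, state) if state != Running =>
      freeCores.get(executorId) match {
        case Some(cores) =>
          freeCores(executorId) = cores + cpusPerTask   // the finished task gives its cores back
          println(s"task $taskId $state; re-offering resources on $executorId")
        case None =>
          println(s"ignoring update for task $taskId from unknown executor $executorId")
      }
    case _ => () // RUNNING updates carry no resources to reclaim
  }

  def main(args: Array[String]): Unit = {
    handle(StatusUpdate("exec-1", 42L, Finished))
    handle(StatusUpdate("exec-9", 43L, Failed))
    println(freeCores)
  }
}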
// TaskScheduler
def statusUpdate(tid: Long, state: TaskState, serializedData: ByteBuffer) {
var failedExecutor: Option[String] = None
var reason: Option[ExecutorLossReason] = None
synchronized {
try {
taskIdToTaskSetManager.get(tid) match {
case Some(taskSet) =>
// When writing real Spark applications you may often see "Task lost": tasks fail for all kinds of reasons
if (state == TaskState.LOST) {
// TaskState.LOST is only used by the deprecated Mesos fine-grained scheduling mode,
// where each executor corresponds to a single task, so mark the executor as failed.
val execId = taskIdToExecutorId.getOrElse(tid, throw new IllegalStateException(
"taskIdToTaskSetManager.contains(tid) <=> taskIdToExecutorId.contains(tid)"))
if (executorIdToRunningTaskIds.contains(execId)) {
reason = Some(
SlaveLost(s"Task $tid was lost, so marking the executor as lost as well."))
removeExecutor(execId, reason.get)
failedExecutor = Some(execId)
}
}
// If the Task has finished, remove it from the in-memory caches
if (TaskState.isFinished(state)) {
cleanupTaskState(tid)
taskSet.removeRunningTask(tid)
// Handle the result accordingly
if (state == TaskState.FINISHED) {
taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData)
} else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) {
taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData)
}
}
case None =>
logError(
("Ignoring update with state %s for TID %s because its task set is gone (this is " +
"likely the result of receiving duplicate task finished status updates) or its " +
"executor has been marked as failed.")
.format(state, tid))
}
} catch {
case e: Exception => logError("Exception in statusUpdate", e)
}
}
// Update the DAGScheduler without holding a lock on this, since that can deadlock
if (failedExecutor.isDefined) {
assert(reason.isDefined)
dagScheduler.executorLost(failedExecutor.get, reason.get)
backend.reviveOffers()
}
}