public class DAGScheduler extends Object implements Logging
In addition to coming up with a DAG of stages, this class also determines the preferred locations to run each task on, based on the current cache status, and passes these to the low-level TaskScheduler. Furthermore, it handles failures due to shuffle output files being lost, in which case old stages may need to be resubmitted. Failures *within* a stage that are not caused by shuffle file loss are handled by the TaskScheduler, which will retry each task a small number of times before cancelling the whole stage.
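As a hedged illustration of this flow: user code never constructs a DAGScheduler directly. Calling an action on an RDD goes through SparkContext, which forwards the job to this class; each shuffle dependency becomes a stage boundary in the DAG. A minimal sketch (app name, master, and input path are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object DagSchedulerFlow {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("dag-demo").setMaster("local[2]"))

    // reduceByKey introduces a shuffle dependency, so the DAGScheduler
    // splits this job into two stages: a shuffle map stage for the map
    // side and a result stage that computes the final count.
    val counts = sc.textFile("input.txt")
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)   // shuffle boundary: a new stage starts here

    // count() is an action: SparkContext.runJob hands the job to the
    // DAGScheduler, which submits each stage as a TaskSet to the
    // underlying TaskScheduler.
    println(counts.count())
    sc.stop()
  }
}
```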
| Constructor and Description |
|---|
| DAGScheduler(SparkContext sc) |
| DAGScheduler(SparkContext sc, TaskScheduler taskScheduler) |
| DAGScheduler(SparkContext sc, TaskScheduler taskScheduler, LiveListenerBus listenerBus, MapOutputTrackerMaster mapOutputTracker, BlockManagerMaster blockManagerMaster, SparkEnv env, Clock clock) |
| Modifier and Type | Method and Description |
|---|---|
| void | abortStage(Stage failedStage, String reason): Aborts all jobs depending on a particular Stage. |
| scala.collection.mutable.HashSet<ActiveJob> | activeJobs() |
| void | cancelAllJobs(): Cancel all jobs that are running or waiting in the queue. |
| void | cancelJob(int jobId): Cancel a job that is running or waiting in the queue. |
| void | cancelJobGroup(String groupId) |
| void | cancelStage(int stageId): Cancel all jobs associated with a running or scheduled stage. |
| void | cleanUpAfterSchedulerStop() |
| void | doCancelAllJobs() |
| akka.actor.ActorRef | eventProcessActor() |
| void | executorAdded(String execId, String host) |
| boolean | executorHeartbeatReceived(String execId, scala.Tuple4<Object,Object,Object,org.apache.spark.executor.TaskMetrics>[] taskMetrics, BlockManagerId blockManagerId): Update metrics for in-progress tasks and let the master know that the BlockManager is still alive. |
| void | executorLost(String execId) |
| scala.collection.mutable.HashSet<Stage> | failedStages() |
| scala.collection.Seq<TaskLocation> | getPreferredLocs(RDD<?> rdd, int partition): Synchronized method that might be called from other threads. |
| void | handleBeginEvent(Task<?> task, TaskInfo taskInfo) |
| void | handleExecutorAdded(String execId, String host) |
| void | handleExecutorLost(String execId, boolean fetchFailed, scala.Option<Object> maybeEpoch): Responds to an executor being lost. |
| void | handleGetTaskResult(TaskInfo taskInfo) |
| void | handleJobCancellation(int jobId, String reason) |
| void | handleJobGroupCancelled(String groupId) |
| void | handleJobSubmitted(int jobId, RDD<?> finalRDD, scala.Function2<TaskContext,scala.collection.Iterator<Object>,?> func, int[] partitions, boolean allowLocal, CallSite callSite, JobListener listener, java.util.Properties properties) |
| void | handleStageCancellation(int stageId) |
| void | handleTaskCompletion(CompletionEvent event): Responds to a task finishing. |
| void | handleTaskSetFailed(TaskSet taskSet, String reason) |
| scala.collection.mutable.HashMap<Object,ActiveJob> | jobIdToActiveJob() |
| scala.collection.mutable.HashMap<Object,scala.collection.mutable.HashSet<Object>> | jobIdToStageIds() |
| java.util.concurrent.atomic.AtomicInteger | nextJobId() |
| int | numTotalJobs() |
| static long | POLL_TIMEOUT() |
| static scala.concurrent.duration.FiniteDuration | RESUBMIT_TIMEOUT() |
| void | resubmitFailedStages(): Resubmit any failed stages. |
| <T,U,R> PartialResult<R> | runApproximateJob(RDD<T> rdd, scala.Function2<TaskContext,scala.collection.Iterator<T>,U> func, ApproximateEvaluator<U,R> evaluator, CallSite callSite, long timeout, java.util.Properties properties) |
| <T,U> void | runJob(RDD<T> rdd, scala.Function2<TaskContext,scala.collection.Iterator<T>,U> func, scala.collection.Seq<Object> partitions, CallSite callSite, boolean allowLocal, scala.Function2<Object,U,scala.runtime.BoxedUnit> resultHandler, java.util.Properties properties, scala.reflect.ClassTag<U> evidence$1) |
| scala.collection.mutable.HashSet<Stage> | runningStages() |
| SparkContext | sc() |
| scala.collection.mutable.HashMap<Object,Stage> | shuffleToMapStage() |
| scala.collection.mutable.HashMap<Object,Stage> | stageIdToStage() |
| void | stop() |
| <T,U> JobWaiter<U> | submitJob(RDD<T> rdd, scala.Function2<TaskContext,scala.collection.Iterator<T>,U> func, scala.collection.Seq<Object> partitions, CallSite callSite, boolean allowLocal, scala.Function2<Object,U,scala.runtime.BoxedUnit> resultHandler, java.util.Properties properties): Submit a job to the job scheduler and get a JobWaiter object back. |
| void | taskEnded(Task<?> task, TaskEndReason reason, Object result, scala.collection.mutable.Map<Object,Object> accumUpdates, TaskInfo taskInfo, org.apache.spark.executor.TaskMetrics taskMetrics) |
| void | taskGettingResult(TaskInfo taskInfo) |
| TaskScheduler | taskScheduler() |
| void | taskSetFailed(TaskSet taskSet, String reason) |
| void | taskStarted(Task<?> task, TaskInfo taskInfo) |
| scala.collection.mutable.HashSet<Stage> | waitingStages() |
Methods inherited from class java.lang.Object: equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait

Methods inherited from interface org.apache.spark.Logging: initializeIfNecessary, initializeLogging, isTraceEnabled, log_, log, logDebug, logDebug, logError, logError, logInfo, logInfo, logName, logTrace, logTrace, logWarning, logWarning
public DAGScheduler(SparkContext sc, TaskScheduler taskScheduler, LiveListenerBus listenerBus, MapOutputTrackerMaster mapOutputTracker, BlockManagerMaster blockManagerMaster, SparkEnv env, Clock clock)
public DAGScheduler(SparkContext sc, TaskScheduler taskScheduler)
public DAGScheduler(SparkContext sc)
public static scala.concurrent.duration.FiniteDuration RESUBMIT_TIMEOUT()
public static long POLL_TIMEOUT()
public SparkContext sc()
public TaskScheduler taskScheduler()
public java.util.concurrent.atomic.AtomicInteger nextJobId()
public int numTotalJobs()
public scala.collection.mutable.HashMap<Object,scala.collection.mutable.HashSet<Object>> jobIdToStageIds()
public scala.collection.mutable.HashMap<Object,Stage> stageIdToStage()
public scala.collection.mutable.HashMap<Object,Stage> shuffleToMapStage()
public scala.collection.mutable.HashMap<Object,ActiveJob> jobIdToActiveJob()
public scala.collection.mutable.HashSet<Stage> waitingStages()
public scala.collection.mutable.HashSet<Stage> runningStages()
public scala.collection.mutable.HashSet<Stage> failedStages()
public scala.collection.mutable.HashSet<ActiveJob> activeJobs()
public akka.actor.ActorRef eventProcessActor()
public void taskGettingResult(TaskInfo taskInfo)
public void taskEnded(Task<?> task, TaskEndReason reason, Object result, scala.collection.mutable.Map<Object,Object> accumUpdates, TaskInfo taskInfo, org.apache.spark.executor.TaskMetrics taskMetrics)
public boolean executorHeartbeatReceived(String execId, scala.Tuple4<Object,Object,Object,org.apache.spark.executor.TaskMetrics>[] taskMetrics, BlockManagerId blockManagerId)
public void executorLost(String execId)
public void executorAdded(String execId, String host)
public void taskSetFailed(TaskSet taskSet, String reason)
public <T,U> JobWaiter<U> submitJob(RDD<T> rdd, scala.Function2<TaskContext,scala.collection.Iterator<T>,U> func, scala.collection.Seq<Object> partitions, CallSite callSite, boolean allowLocal, scala.Function2<Object,U,scala.runtime.BoxedUnit> resultHandler, java.util.Properties properties)
public <T,U> void runJob(RDD<T> rdd, scala.Function2<TaskContext,scala.collection.Iterator<T>,U> func, scala.collection.Seq<Object> partitions, CallSite callSite, boolean allowLocal, scala.Function2<Object,U,scala.runtime.BoxedUnit> resultHandler, java.util.Properties properties, scala.reflect.ClassTag<U> evidence$1)
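runJob is the blocking entry point: it submits the job and waits for it to finish, whereas submitJob (above) returns immediately with a JobWaiter. A hedged user-side sketch using the public SparkContext.runJob overload, which ultimately delegates here (master and data are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(
  new SparkConf().setAppName("runjob-demo").setMaster("local[2]"))
val rdd = sc.parallelize(Seq("a", "bb", "ccc"), 3)

// One task is launched per partition; the function is applied to each
// partition's iterator and the per-partition results are collected.
val lengths: Array[Int] =
  sc.runJob(rdd, (it: Iterator[String]) => it.map(_.length).sum)
println(lengths.mkString(", "))   // e.g. "1, 2, 3" with 3 partitions

sc.stop()
```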
public <T,U,R> PartialResult<R> runApproximateJob(RDD<T> rdd, scala.Function2<TaskContext,scala.collection.Iterator<T>,U> func, ApproximateEvaluator<U,R> evaluator, CallSite callSite, long timeout, java.util.Properties properties)
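runApproximateJob backs the approximate actions on RDDs; for example, RDD.countApprox is built on it in this Spark lineage and returns a PartialResult. A hedged sketch (timeout and confidence values are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(
  new SparkConf().setAppName("approx-demo").setMaster("local[2]"))
val data = sc.parallelize(1L to 1000000L, 8)

// If all tasks finish within the timeout the result is exact; otherwise
// a bounded estimate from the finished partitions is returned.
val partial = data.countApprox(timeout = 200L, confidence = 0.95)
println(s"estimate so far: ${partial.initialValue}")
println(s"final value: ${partial.getFinalValue()}")  // blocks until complete

sc.stop()
```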
public void cancelJob(int jobId)
public void cancelJobGroup(String groupId)
public void cancelAllJobs()
public void doCancelAllJobs()
public void cancelStage(int stageId)
public void resubmitFailedStages()
public void handleJobGroupCancelled(String groupId)
public void handleTaskSetFailed(TaskSet taskSet, String reason)
public void cleanUpAfterSchedulerStop()
public void handleGetTaskResult(TaskInfo taskInfo)
public void handleJobSubmitted(int jobId, RDD<?> finalRDD, scala.Function2<TaskContext,scala.collection.Iterator<Object>,?> func, int[] partitions, boolean allowLocal, CallSite callSite, JobListener listener, java.util.Properties properties)
public void handleTaskCompletion(CompletionEvent event)
public void handleExecutorLost(String execId, boolean fetchFailed, scala.Option<Object> maybeEpoch)
Responds to an executor being lost. Shuffle blocks associated with the executor are also assumed lost if the executor serves its own blocks (i.e., external shuffle is not in use) or if a FetchFailed occurred; in either case all shuffle data related to this executor is presumed lost.
Optionally, the epoch during which the failure was observed can be passed, so that stray fetch failures do not re-trigger the detection of a node as lost.
public void handleExecutorAdded(String execId, String host)
public void handleStageCancellation(int stageId)
public void handleJobCancellation(int jobId, String reason)
public void abortStage(Stage failedStage, String reason)
public scala.collection.Seq<TaskLocation> getPreferredLocs(RDD<?> rdd, int partition)
Synchronized method that might be called from other threads.
Parameters:
rdd - the RDD whose partitions are to be looked at
partition - the partition to look up locality information for

public void stop()
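As a hedged sketch of how getPreferredLocs might be used from scheduler-side code: the helper below is hypothetical (DAGScheduler is internal to Spark and normally reached only through SparkContext), but it shows the per-partition lookup the method performs against the current cache status:

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.scheduler.{DAGScheduler, TaskLocation}

// Hypothetical helper: print the preferred locations for every
// partition of an RDD. Access to a DAGScheduler instance is assumed.
def printPreferredLocations(scheduler: DAGScheduler, rdd: RDD[_]): Unit = {
  for (p <- 0 until rdd.partitions.length) {
    val locs: Seq[TaskLocation] = scheduler.getPreferredLocs(rdd, p)
    val shown = if (locs.isEmpty) "no preference" else locs.mkString(", ")
    println(s"partition $p -> $shown")
  }
}
```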