@Test
  public void testScheduleWithDyingInstances() {
    // Verifies that slots of an instance that was marked dead are canceled, that slots of the
    // remaining instances stay usable, and that the scheduler refuses further requests once
    // every instance is dead.
    try {
      final Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());

      // three instances; the numeric argument is presumably the slot count (2 + 2 + 1 = 5)
      final Instance survivorA = getRandomInstance(2);
      final Instance firstToDie = getRandomInstance(2);
      final Instance survivorB = getRandomInstance(1);

      scheduler.newInstanceAvailable(survivorA);
      scheduler.newInstanceAvailable(firstToDie);
      scheduler.newInstanceAvailable(survivorB);

      // issue five non-queued allocation requests, one after the other
      final List<SimpleSlot> allocated = new ArrayList<SimpleSlot>();
      for (int request = 0; request < 5; request++) {
        allocated.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());
      }

      firstToDie.markDead();

      // exactly the slots owned by the dead instance must be canceled; release all of them
      for (SimpleSlot allocatedSlot : allocated) {
        if (allocatedSlot.getOwner() != firstToDie) {
          assertFalse(allocatedSlot.isCanceled());
        } else {
          assertTrue(allocatedSlot.isCanceled());
        }

        allocatedSlot.releaseSlot();
      }

      // only the slots of the two surviving instances are available again
      assertEquals(3, scheduler.getNumberOfAvailableSlots());

      survivorA.markDead();
      survivorB.markDead();

      // with every instance dead, a further allocation request has to be rejected
      try {
        scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
        fail("Scheduler served a slot from a dead instance");
      } catch (NoResourceAvailableException expected) {
        // this is the expected outcome
      } catch (Exception unexpected) {
        fail("Wrong exception type.");
      }

      // at this point at the latest, the scheduler must have noticed that nothing is left
      assertEquals(0, scheduler.getNumberOfInstancesWithAvailableSlots());
      assertEquals(0, scheduler.getNumberOfAvailableSlots());
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
Esempio n. 2
0
  /**
   * Moves this execution to the FINISHED state and performs the associated clean-up.
   *
   * <p>On a successful transition from RUNNING or DEPLOYING, the consumers of all partitions
   * belonging to the intermediate results of the finished blocking partitions are scheduled or
   * updated, the given accumulators are stored, the assigned slot (if any) is released and the
   * execution is deregistered from the execution graph. The vertex is notified via
   * {@code executionFinished()} even if one of these steps throws.
   *
   * <p>If the execution is CANCELING, the call is treated as completion of the cancellation. If
   * it is already CANCELED or FAILED, the call is ignored. Any other state is considered an error
   * and fails the execution.
   *
   * @param flinkAccumulators The internal accumulators to store for this execution.
   * @param userAccumulators The user-defined accumulators to store for this execution.
   */
  void markFinished(
      Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators,
      Map<String, Accumulator<?, ?>> userAccumulators) {

    // this call usually comes during RUNNING, but may also come while still in deploying (very fast
    // tasks!)
    while (true) {
      // snapshot of the state; all decisions and messages below refer to this snapshot
      ExecutionState current = this.state;

      if (current == RUNNING || current == DEPLOYING) {

        if (transitionState(current, FINISHED)) {
          try {
            for (IntermediateResultPartition finishedPartition :
                getVertex().finishAllBlockingPartitions()) {

              IntermediateResultPartition[] allPartitions =
                  finishedPartition.getIntermediateResult().getPartitions();

              for (IntermediateResultPartition partition : allPartitions) {
                scheduleOrUpdateConsumers(partition.getConsumers());
              }
            }

            synchronized (accumulatorLock) {
              this.flinkAccumulators = flinkAccumulators;
              this.userAccumulators = userAccumulators;
            }

            // the state switches to DEPLOYING before the slot is assigned, so the resource may
            // still be unset here; guard the release like the other terminal transitions do
            if (assignedResource != null) {
              assignedResource.releaseSlot();
            }
            vertex.getExecutionGraph().deregisterExecution(this);
          } finally {
            vertex.executionFinished();
          }
          return;
        }
        // else: the state changed concurrently, re-evaluate in the next loop iteration
      } else if (current == CANCELING) {
        // we sent a cancel call, and the task manager finished before it arrived. We
        // will never get a CANCELED call back from the job manager
        cancelingComplete();
        return;
      } else if (current == CANCELED || current == FAILED) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Task FINISHED, but concurrently went to state " + current);
        }
        return;
      } else {
        // this should not happen, we need to fail this
        markFailed(
            new Exception("Vertex received FINISHED message while being in state " + current));
        return;
      }
    }
  }
Esempio n. 3
0
  /**
   * Cancels this execution.
   *
   * <p>Depending on the state observed, the execution either goes straight to CANCELED (no task
   * has been deployed yet, so no cancel call is needed) or to CANCELING (a cancel call is sent to
   * the task manager). Because the state may change concurrently, the method retries until one
   * atomic state transition succeeds or a state is observed that requires no transition.
   *
   * @throws IllegalStateException if the execution is in a state not handled by this method.
   */
  public void cancel() {
    while (true) {
      final ExecutionState observed = this.state;

      // cancellation is already in progress or done
      if (observed == CANCELING || observed == CANCELED) {
        return;
      }

      // the common cases in which a cancel call has to be sent out
      if (observed == RUNNING || observed == DEPLOYING) {
        if (!transitionState(observed, CANCELING)) {
          // lost a race against a concurrent state change, look at the new state
          continue;
        }
        sendCancelRpcCall();
        return;
      }

      // the task finished or failed before it could be canceled; in either case it has already
      // been removed from the TaskManager
      if (observed == FINISHED || observed == FAILED) {
        sendFailIntermediateResultPartitionsRpcCall();
        return;
      }

      if (observed != CREATED && observed != SCHEDULED) {
        throw new IllegalStateException(observed.name());
      }

      // no task has been deployed yet, so we can switch to CANCELED directly
      if (!transitionState(observed, CANCELED)) {
        // lost a race against a concurrent state change, look at the new state
        continue;
      }

      // the CANCELING state was skipped; give it a timestamp anyway for a consistent appearance
      markTimestamp(CANCELING, getStateTimestamp(CANCELED));

      try {
        vertex.getExecutionGraph().deregisterExecution(this);
        if (assignedResource != null) {
          assignedResource.releaseSlot();
        }
      } finally {
        vertex.executionCanceled();
      }
      return;
    }
  }
Esempio n. 4
0
  /**
   * Completes the cancellation of this execution by switching it to CANCELED.
   *
   * <p>The transition is accepted from CANCELING, but also directly from RUNNING and DEPLOYING:
   * task managers can cancel tasks on their own, without an external trigger, when they find the
   * network stack canceled (for example by a failing or canceling receiver or sender). This is an
   * artifact of the old network runtime that still needs to be supported.
   *
   * <p>After a successful transition the assigned slot (if any) is released, the execution is
   * deregistered from the execution graph and the vertex is notified via
   * {@code executionCanceled()}. If the execution is found in any state other than the ones above
   * or FAILED, the whole execution graph is failed.
   */
  void cancelingComplete() {

    while (true) {
      // snapshot of the state; all decisions and messages below refer to this snapshot
      ExecutionState current = this.state;

      if (current == CANCELED) {
        return;
      } else if (current == CANCELING || current == RUNNING || current == DEPLOYING) {
        if (transitionState(current, CANCELED)) {
          try {
            // the state switches to DEPLOYING before the slot is assigned, so the resource may
            // still be unset here; guard the release like the other terminal transitions do
            if (assignedResource != null) {
              assignedResource.releaseSlot();
            }
            vertex.getExecutionGraph().deregisterExecution(this);
          } finally {
            vertex.executionCanceled();
          }
          return;
        }

        // else fall through the loop
      } else {
        // failing in the meantime may happen and is no problem.
        // anything else is a serious problem !!!
        if (current != FAILED) {
          // report the state that was actually evaluated, not a fresh read of the field
          String message =
              String.format(
                  "Asynchronous race: Found state %s after successful cancel call.", current);
          LOG.error(message);
          vertex.getExecutionGraph().fail(new Exception(message));
        }
        return;
      }
    }
  }
Esempio n. 5
0
  /**
   * Allocates all four slots of an instance, checks the instance's bookkeeping, verifies that a
   * slot that was not released cannot be handed back, and finally checks that releasing the
   * slots makes them available again.
   */
  @Test
  public void testAllocatingAndCancellingSlots() {
    try {
      final ResourceID resourceID = ResourceID.generate();
      final InetAddress address = InetAddress.getByName("127.0.0.1");
      final TaskManagerLocation connection = new TaskManagerLocation(resourceID, address, 10001);
      final HardwareDescription hardwareDescription =
          new HardwareDescription(
              4, 2L * 1024 * 1024 * 1024, 1024 * 1024 * 1024, 512 * 1024 * 1024);

      final Instance instance =
          new Instance(
              new ActorTaskManagerGateway(DummyActorGateway.INSTANCE),
              connection,
              new InstanceID(),
              hardwareDescription,
              4);

      // a fresh instance has all of its slots available
      assertEquals(4, instance.getTotalNumberOfSlots());
      assertEquals(4, instance.getNumberOfAvailableSlots());
      assertEquals(0, instance.getNumberOfAllocatedSlots());

      // grab every slot, each one for a different job
      final SimpleSlot[] slots = new SimpleSlot[4];
      for (int i = 0; i < slots.length; i++) {
        slots[i] = instance.allocateSimpleSlot(new JobID());
      }
      for (SimpleSlot slot : slots) {
        assertNotNull(slot);
      }

      assertEquals(0, instance.getNumberOfAvailableSlots());
      assertEquals(4, instance.getNumberOfAllocatedSlots());

      // the slot numbers must add up to 0 + 1 + 2 + 3
      int slotNumberSum = 0;
      for (SimpleSlot slot : slots) {
        slotNumberSum += slot.getSlotNumber();
      }
      assertEquals(6, slotNumberSum);

      // the instance is exhausted now
      assertNull(instance.allocateSimpleSlot(new JobID()));

      // handing back a slot that has not been released/canceled must be rejected
      try {
        instance.returnAllocatedSlot(slots[1]);
        fail("instance accepted a non-cancelled slot.");
      } catch (IllegalArgumentException expected) {
        // good
      }

      // releasing the slots returns them to the instance
      for (SimpleSlot slot : slots) {
        slot.releaseSlot();
      }

      assertEquals(4, instance.getNumberOfAvailableSlots());
      assertEquals(0, instance.getNumberOfAllocatedSlots());

      // returning the already released slots once more has no effect
      for (SimpleSlot slot : slots) {
        assertFalse(instance.returnAllocatedSlot(slot));
      }

      assertEquals(4, instance.getNumberOfAvailableSlots());
      assertEquals(0, instance.getNumberOfAllocatedSlots());
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
Esempio n. 6
0
  /**
   * Switches this execution to FAILED and performs the local bookkeeping for the failure.
   *
   * <p>Only the local books are updated and the vertex is notified; cleaning up the actual
   * computation is left to the TaskManager that noticed the failure. If the failure was not
   * reported via a callback and the task was RUNNING or DEPLOYING, a cancel call is sent out to
   * remove the task from the TaskManager.
   *
   * @param t The cause of the failure.
   * @param isCallback Whether the failure was reported through a callback; if so, no cancel call
   *     is sent out.
   * @return {@code true} if this call moved the execution to FAILED, {@code false} if the
   *     execution was already FAILED or CANCELED.
   */
  private boolean processFail(Throwable t, boolean isCallback) {

    // concurrent calls may force several attempts until the switch to FAILED succeeds atomically
    while (true) {
      final ExecutionState observed = this.state;

      // remembering a failure once is enough
      if (observed == FAILED) {
        return false;
      }

      // already canceled, the failure is not recorded any more
      if (observed == CANCELED) {
        if (LOG.isDebugEnabled()) {
          LOG.debug(
              String.format(
                  "Ignoring transition of vertex %s to %s while being %s",
                  getVertexWithAttempt(), FAILED, CANCELED));
        }
        return false;
      }

      if (!transitionState(observed, FAILED, t)) {
        // the state changed concurrently, evaluate the new state
        continue;
      }

      this.failureCause = t;

      try {
        if (assignedResource != null) {
          assignedResource.releaseSlot();
        }
        vertex.getExecutionGraph().deregisterExecution(this);
      } finally {
        vertex.executionFailed(t);
      }

      // a cancel call is only needed when the failure did not come in as a callback and the
      // task was RUNNING or DEPLOYING before
      if (isCallback || (observed != RUNNING && observed != DEPLOYING)) {
        return true;
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug("Sending out cancel request, to remove task execution from TaskManager.");
      }

      try {
        if (assignedResource != null) {
          sendCancelRpcCall();
        }
      } catch (Throwable tt) {
        // no reason this should ever happen, but log it to be safe
        LOG.error("Error triggering cancel call while marking task as failed.", tt);
      }

      return true;
    }
  }
Esempio n. 7
0
  /**
   * Deploys this execution attempt to the given slot.
   *
   * <p>The method first switches the state from SCHEDULED (or CREATED) to DEPLOYING, then binds
   * this execution to the slot, registers it at the execution graph and sends a
   * {@code SubmitTask} message to the actor gateway of the slot's instance. The answer is handled
   * asynchronously: a failure, a timeout, or a response that is not an acknowledge message results
   * in a call to {@code markFailed}.
   *
   * <p>Any throwable raised after the state switch marks this execution as failed and is then
   * handed to {@code ExceptionUtils.rethrow} (presumably re-thrown to the caller).
   *
   * @param slot The slot to deploy to; must be non-null and alive.
   * @throws NullPointerException if {@code slot} is null.
   * @throws JobException if the slot is not alive.
   * @throws IllegalStateException if the execution is neither in CREATED nor in SCHEDULED state,
   *     or if the transition to DEPLOYING fails.
   */
  public void deployToSlot(final SimpleSlot slot) throws JobException {
    // sanity checks
    if (slot == null) {
      throw new NullPointerException();
    }
    if (!slot.isAlive()) {
      throw new JobException("Target slot for deployment is not alive.");
    }

    // make sure exactly one deployment call happens from the correct state
    // note: the transition from CREATED to DEPLOYING is for testing purposes only
    ExecutionState previous = this.state;
    if (previous == SCHEDULED || previous == CREATED) {
      if (!transitionState(previous, DEPLOYING)) {
        // race condition, someone else beat us to the deploying call.
        // this should actually not happen and indicates a race somewhere else
        throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
      }
    } else {
      // vertex may have been cancelled, or it was already scheduled
      throw new IllegalStateException(
          "The vertex must be in CREATED or SCHEDULED state to be deployed. Found state "
              + previous);
    }

    // from here on the state is DEPLOYING; every failure below marks the execution as failed
    try {
      // good, we are allowed to deploy
      if (!slot.setExecutedVertex(this)) {
        throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
      }
      // note: the slot is stored only after the state has already been switched to DEPLOYING
      this.assignedResource = slot;
      this.assignedResourceLocation = slot.getInstance().getInstanceConnectionInfo();

      // race double check, did we fail/cancel and do we need to release the slot?
      if (this.state != DEPLOYING) {
        slot.releaseSlot();
        return;
      }

      if (LOG.isInfoEnabled()) {
        LOG.info(
            String.format(
                "Deploying %s (attempt #%d) to %s",
                vertex.getSimpleName(),
                attemptNumber,
                slot.getInstance().getInstanceConnectionInfo().getHostname()));
      }

      final TaskDeploymentDescriptor deployment =
          vertex.createDeploymentDescriptor(
              attemptId, slot, operatorState, recoveryTimestamp, attemptNumber);

      // register this execution at the execution graph, to receive call backs
      vertex.getExecutionGraph().registerExecution(this);

      final Instance instance = slot.getInstance();
      final ActorGateway gateway = instance.getActorGateway();

      // submit the task; the response future is bounded by the configured timeout
      final Future<Object> deployAction = gateway.ask(new SubmitTask(deployment), timeout);

      // evaluate the response asynchronously on the execution context
      deployAction.onComplete(
          new OnComplete<Object>() {

            @Override
            public void onComplete(Throwable failure, Object success) throws Throwable {
              if (failure != null) {
                if (failure instanceof TimeoutException) {
                  // enrich timeouts with the task name and the unresponsive instance
                  String taskname =
                      deployment.getTaskInfo().getTaskNameWithSubtasks() + " (" + attemptId + ')';

                  markFailed(
                      new Exception(
                          "Cannot deploy task "
                              + taskname
                              + " - TaskManager ("
                              + instance
                              + ") not responding after a timeout of "
                              + timeout,
                          failure));
                } else {
                  markFailed(failure);
                }
              } else {
                // any response other than the acknowledge message counts as a failed deployment
                if (!(success.equals(Messages.getAcknowledge()))) {
                  markFailed(
                      new Exception(
                          "Failed to deploy the task to slot "
                              + slot
                              + ": Response was not of type Acknowledge"));
                }
              }
            }
          },
          executionContext);
    } catch (Throwable t) {
      // record the failure first, then hand the throwable to the rethrow utility
      markFailed(t);
      ExceptionUtils.rethrow(t);
    }
  }
Esempio n. 8
0
  /**
   * Schedules this execution attempt via the given scheduler and deploys it to the slot that the
   * scheduler provides.
   *
   * <p>NOTE: Once the execution has switched from CREATED to SCHEDULED, errors during deployment
   * do not surface as exceptions of this method. Instead, the slot is released and the execution
   * is marked as failed. The only exception that can still surface at that point is the
   * {@link NoResourceAvailableException} of an immediate (non-queued) scheduling request.
   *
   * @param scheduler The scheduler to use to schedule this execution attempt.
   * @param queued Flag to indicate whether the scheduler may queue this task if it cannot
   *     immediately deploy it.
   * @return {@code true} if the execution switched from CREATED to SCHEDULED and scheduling was
   *     triggered, {@code false} if that state transition did not succeed (for example because
   *     the execution was no longer in CREATED state).
   * @throws IllegalArgumentException Thrown, if the given scheduler is null.
   * @throws RuntimeException Thrown, if the vertex has a co-location constraint but no slot
   *     sharing group.
   * @throws NoResourceAvailableException Thrown if no queued scheduling is allowed and no
   *     resources are currently available.
   */
  public boolean scheduleForExecution(Scheduler scheduler, boolean queued)
      throws NoResourceAvailableException {
    if (scheduler == null) {
      throw new IllegalArgumentException("Cannot send null Scheduler when scheduling execution.");
    }

    final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup();
    final CoLocationConstraint locationConstraint = vertex.getLocationConstraint();

    // sanity check: co-location only works together with slot sharing
    if (sharingGroup == null && locationConstraint != null) {
      throw new RuntimeException(
          "Trying to schedule with co-location constraint but without slot sharing allowed.");
    }

    if (!transitionState(CREATED, SCHEDULED)) {
      // call race, already deployed, or already done
      return false;
    }

    final ScheduledUnit toSchedule;
    if (locationConstraint != null) {
      toSchedule = new ScheduledUnit(this, sharingGroup, locationConstraint);
    } else {
      toSchedule = new ScheduledUnit(this, sharingGroup);
    }

    // IMPORTANT: To prevent leaks of cluster resources, the slot has to be returned in every
    // case in which the deployment fails. Both paths below therefore release the slot before
    // they mark the execution as failed.
    if (!queued) {
      final SimpleSlot immediateSlot = scheduler.scheduleImmediately(toSchedule);
      try {
        deployToSlot(immediateSlot);
      } catch (Throwable t) {
        try {
          immediateSlot.releaseSlot();
        } finally {
          markFailed(t);
        }
      }
      return true;
    }

    // queued scheduling: deploy as soon as the scheduler hands over a slot
    final SlotAllocationFuture pendingSlot = scheduler.scheduleQueued(toSchedule);
    pendingSlot.setFutureAction(
        new SlotAllocationFutureAction() {
          @Override
          public void slotAllocated(SimpleSlot slot) {
            try {
              deployToSlot(slot);
            } catch (Throwable t) {
              try {
                slot.releaseSlot();
              } finally {
                markFailed(t);
              }
            }
          }
        });

    return true;
  }
  /**
   * Checks how the scheduler honors location preferences: a preferred instance is used while it
   * has free slots, otherwise the request falls back to another preferred instance or, if all
   * preferred instances are full, to any instance with a free slot.
   */
  @Test
  public void testSchedulingLocation() {
    try {
      final Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());

      final Instance i1 = getRandomInstance(2);
      final Instance i2 = getRandomInstance(2);
      final Instance i3 = getRandomInstance(2);

      scheduler.newInstanceAvailable(i1);
      scheduler.newInstanceAvailable(i2);
      scheduler.newInstanceAvailable(i3);

      // no location preference: the scheduler is free to pick any instance
      final SimpleSlot unconstrainedSlot =
          scheduler.allocateSlot(new ScheduledUnit(getTestVertex(new Instance[0])), false).get();

      // name the instances relative to where the first slot ended up
      final Instance first = (Instance) unconstrainedSlot.getOwner();
      final Instance second = (first == i1) ? i2 : i1;
      final Instance third = (first != i3) ? i3 : i2;

      // prefers the location of the first slot --> ends up on the first instance again
      final SimpleSlot sameAsFirstSlot =
          scheduler
              .allocateSlot(
                  new ScheduledUnit(getTestVertex(unconstrainedSlot.getTaskManagerLocation())),
                  false)
              .get();
      assertEquals(first, sameAsFirstSlot.getOwner());

      // first or second --> second, because first is full
      final SimpleSlot onSecondSlot =
          scheduler.allocateSlot(new ScheduledUnit(getTestVertex(first, second)), false).get();
      assertEquals(second, onSecondSlot.getOwner());

      // first or third --> third (because first is full), twice
      final SimpleSlot onThirdSlotA =
          scheduler.allocateSlot(new ScheduledUnit(getTestVertex(first, third)), false).get();
      final SimpleSlot onThirdSlotB =
          scheduler.allocateSlot(new ScheduledUnit(getTestVertex(first, third)), false).get();
      assertEquals(third, onThirdSlotA.getOwner());
      assertEquals(third, onThirdSlotB.getOwner());

      // first or third --> second, because all others are full
      final SimpleSlot fallbackSlot =
          scheduler.allocateSlot(new ScheduledUnit(getTestVertex(first, third)), false).get();
      assertEquals(second, fallbackSlot.getOwner());

      // free one slot on the first and one on the second instance
      sameAsFirstSlot.releaseSlot();
      fallbackSlot.releaseSlot();

      // first or third --> first, which has a free slot again
      final SimpleSlot backOnFirstSlot =
          scheduler.allocateSlot(new ScheduledUnit(getTestVertex(first, third)), false).get();
      assertEquals(first, backOnFirstSlot.getOwner());

      // assignment statistics collected by the scheduler over the seven requests
      assertEquals(1, scheduler.getNumberOfUnconstrainedAssignments());
      assertEquals(1, scheduler.getNumberOfNonLocalizedAssignments());
      assertEquals(5, scheduler.getNumberOfLocalizedAssignments());
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
  /**
   * Checks immediate (non-queued) scheduling: all available slots can be allocated, a request
   * beyond that is rejected, released slots can be allocated again, and releasing a slot twice
   * does not corrupt the scheduler's slot accounting.
   */
  @Test
  public void testScheduleImmediately() {
    try {
      final Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
      assertEquals(0, scheduler.getNumberOfAvailableSlots());

      scheduler.newInstanceAvailable(getRandomInstance(2));
      scheduler.newInstanceAvailable(getRandomInstance(1));
      scheduler.newInstanceAvailable(getRandomInstance(2));
      assertEquals(5, scheduler.getNumberOfAvailableSlots());

      // fill every available slot
      final SimpleSlot first = scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
      final SimpleSlot second =
          scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
      final SimpleSlot third = scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
      final SimpleSlot fourth =
          scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
      final SimpleSlot fifth = scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();

      // no slot may have been handed out twice
      assertTrue(areAllDistinct(first, second, third, fourth, fifth));

      // a sixth request must be rejected, nothing is left
      try {
        scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false);
        fail("Scheduler accepted scheduling request without available resource.");
      } catch (NoResourceAvailableException expected) {
        // pass!
      }

      // give two slots back
      third.releaseSlot();
      fourth.releaseSlot();
      assertEquals(2, scheduler.getNumberOfAvailableSlots());

      // which allows for two more allocations
      final SimpleSlot sixth = scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
      final SimpleSlot seventh =
          scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();

      assertTrue(areAllDistinct(first, second, third, fourth, fifth, sixth, seventh));

      // release everything that is still held
      final SimpleSlot[] stillHeld = new SimpleSlot[] {first, second, fifth, sixth, seventh};
      for (SimpleSlot slot : stillHeld) {
        slot.releaseSlot();
      }

      assertEquals(5, scheduler.getNumberOfAvailableSlots());

      // releasing the same slots a second time (accidentally) must not mess things up
      for (SimpleSlot slot : stillHeld) {
        slot.releaseSlot();
      }

      assertEquals(5, scheduler.getNumberOfAvailableSlots());
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }