Example #1
0
  private void sendFailIntermediateResultPartitionsRpcCall() {
    final SimpleSlot slot = this.assignedResource;

    if (slot != null) {
      final Instance instance = slot.getInstance();

      if (instance.isAlive()) {
        final ActorGateway gateway = instance.getActorGateway();

        // TODO For some tests this could be a problem when querying too early if all resources were
        // released
        gateway.tell(new FailIntermediateResultPartitions(attemptId));
      }
    }
  }
Example #2
0
  /**
   * Sends an UpdatePartitionInfo message to the instance of the consumerSlot.
   *
   * @param consumerSlot Slot to whose instance the message will be sent
   * @param updatePartitionInfo UpdatePartitionInfo message
   */
  private void sendUpdatePartitionInfoRpcCall(
      final SimpleSlot consumerSlot, final UpdatePartitionInfo updatePartitionInfo) {

    if (consumerSlot != null) {
      final Instance instance = consumerSlot.getInstance();
      final ActorGateway gateway = instance.getActorGateway();

      Future<Object> futureUpdate = gateway.ask(updatePartitionInfo, timeout);

      futureUpdate.onFailure(
          new OnFailure() {
            @Override
            public void onFailure(Throwable failure) throws Throwable {
              fail(
                  new IllegalStateException(
                      "Update task on instance " + instance + " failed due to:", failure));
            }
          },
          executionContext);
    }
  }
  @Test
  public void testAddAndRemoveInstance() {
    try {
      Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());

      Instance i1 = getRandomInstance(2);
      Instance i2 = getRandomInstance(2);
      Instance i3 = getRandomInstance(2);

      assertEquals(0, scheduler.getNumberOfAvailableInstances());
      assertEquals(0, scheduler.getNumberOfAvailableSlots());
      scheduler.newInstanceAvailable(i1);
      assertEquals(1, scheduler.getNumberOfAvailableInstances());
      assertEquals(2, scheduler.getNumberOfAvailableSlots());
      scheduler.newInstanceAvailable(i2);
      assertEquals(2, scheduler.getNumberOfAvailableInstances());
      assertEquals(4, scheduler.getNumberOfAvailableSlots());
      scheduler.newInstanceAvailable(i3);
      assertEquals(3, scheduler.getNumberOfAvailableInstances());
      assertEquals(6, scheduler.getNumberOfAvailableSlots());

      // cannot add available instance again
      try {
        scheduler.newInstanceAvailable(i2);
        fail("Scheduler accepted instance twice");
      } catch (IllegalArgumentException e) {
        // bueno!
      }

      // some instances die
      assertEquals(3, scheduler.getNumberOfAvailableInstances());
      assertEquals(6, scheduler.getNumberOfAvailableSlots());
      scheduler.instanceDied(i2);
      assertEquals(2, scheduler.getNumberOfAvailableInstances());
      assertEquals(4, scheduler.getNumberOfAvailableSlots());

      // try to add a dead instance
      try {
        scheduler.newInstanceAvailable(i2);
        fail("Scheduler accepted dead instance");
      } catch (IllegalArgumentException e) {
        // stimmt

      }

      scheduler.instanceDied(i1);
      assertEquals(1, scheduler.getNumberOfAvailableInstances());
      assertEquals(2, scheduler.getNumberOfAvailableSlots());
      scheduler.instanceDied(i3);
      assertEquals(0, scheduler.getNumberOfAvailableInstances());
      assertEquals(0, scheduler.getNumberOfAvailableSlots());

      assertFalse(i1.isAlive());
      assertFalse(i2.isAlive());
      assertFalse(i3.isAlive());
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
  @Test
  public void testScheduleWithDyingInstances() {
    try {
      Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());

      Instance i1 = getRandomInstance(2);
      Instance i2 = getRandomInstance(2);
      Instance i3 = getRandomInstance(1);

      scheduler.newInstanceAvailable(i1);
      scheduler.newInstanceAvailable(i2);
      scheduler.newInstanceAvailable(i3);

      List<SimpleSlot> slots = new ArrayList<SimpleSlot>();
      slots.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());
      slots.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());
      slots.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());
      slots.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());
      slots.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());

      i2.markDead();

      for (SimpleSlot slot : slots) {
        if (slot.getOwner() == i2) {
          assertTrue(slot.isCanceled());
        } else {
          assertFalse(slot.isCanceled());
        }

        slot.releaseSlot();
      }

      assertEquals(3, scheduler.getNumberOfAvailableSlots());

      i1.markDead();
      i3.markDead();

      // cannot get another slot, since all instances are dead
      try {
        scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
        fail("Scheduler served a slot from a dead instance");
      } catch (NoResourceAvailableException e) {
        // fine
      } catch (Exception e) {
        fail("Wrong exception type.");
      }

      // now the latest, the scheduler should have noticed (through the lazy mechanisms)
      // that all instances have vanished
      assertEquals(0, scheduler.getNumberOfInstancesWithAvailableSlots());
      assertEquals(0, scheduler.getNumberOfAvailableSlots());
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
  /**
   * Returns a list of file input splits specifically ordered for the given {@link
   * org.apache.flink.runtime.instance.Instance}. When the list is initially created, it contains
   * all the unconsumed file input splits at that point in time, ascendingly ordered by the minimum
   * distance between the input splits' storage locations and the given {@link
   * org.apache.flink.runtime.instance.Instance}.
   *
   * @param instance the instance for which the file input split list has been computed
   * @return the list of file input splits ordered specifically for the given instance
   */
  private Queue<QueueElem> getInstanceSplitList(final Instance instance) {

    Queue<QueueElem> instanceSplitList = this.instanceMap.get(instance);
    if (instanceSplitList == null) {

      // Create and populate instance specific split list
      instanceSplitList = new PriorityQueue<FileInputSplitList.QueueElem>();
      final Iterator<FileInputSplit> it = this.masterSet.iterator();
      while (it.hasNext()) {

        final FileInputSplit split = it.next();
        final String[] hostNames = split.getHostNames();
        if (hostNames == null) {
          instanceSplitList.add(new QueueElem(split, Integer.MAX_VALUE));

        } else {

          int minDistance = Integer.MAX_VALUE;
          for (int i = 0; i < hostNames.length; ++i) {
            final int distance = instance.getDistance(hostNames[i]);
            if (LOG.isDebugEnabled()) {
              LOG.debug(
                  "Distance between " + instance + " and " + hostNames[i] + " is " + distance);
            }
            if (distance < minDistance) {
              minDistance = distance;
            }
          }

          instanceSplitList.add(new QueueElem(split, minDistance));
        }
      }

      this.instanceMap.put(instance, instanceSplitList);
    }

    return instanceSplitList;
  }
Example #6
0
  @Override
  public String handleJsonRequest(
      Map<String, String> pathParams, Map<String, String> queryParams, ActorGateway jobManager)
      throws Exception {
    try {
      if (jobManager != null) {
        // whether one task manager's metrics are requested, or all task manager, we
        // return them in an array. This avoids unnecessary code complexity.
        // If only one task manager is requested, we only fetch one task manager metrics.
        final List<Instance> instances = new ArrayList<>();
        if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
          try {
            InstanceID instanceID =
                new InstanceID(StringUtils.hexStringToByte(pathParams.get(TASK_MANAGER_ID_KEY)));
            Future<Object> future =
                jobManager.ask(
                    new JobManagerMessages.RequestTaskManagerInstance(instanceID), timeout);
            TaskManagerInstance instance = (TaskManagerInstance) Await.result(future, timeout);
            if (instance.instance().nonEmpty()) {
              instances.add(instance.instance().get());
            }
          }
          // this means the id string was invalid. Keep the list empty.
          catch (IllegalArgumentException e) {
            // do nothing.
          }
        } else {
          Future<Object> future =
              jobManager.ask(JobManagerMessages.getRequestRegisteredTaskManagers(), timeout);
          RegisteredTaskManagers taskManagers =
              (RegisteredTaskManagers) Await.result(future, timeout);
          instances.addAll(taskManagers.asJavaCollection());
        }

        StringWriter writer = new StringWriter();
        JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer);

        gen.writeStartObject();
        gen.writeArrayFieldStart("taskmanagers");

        for (Instance instance : instances) {
          gen.writeStartObject();
          gen.writeStringField("id", instance.getId().toString());
          gen.writeStringField("path", instance.getActorGateway().path());
          gen.writeNumberField("dataPort", instance.getTaskManagerLocation().dataPort());
          gen.writeNumberField("timeSinceLastHeartbeat", instance.getLastHeartBeat());
          gen.writeNumberField("slotsNumber", instance.getTotalNumberOfSlots());
          gen.writeNumberField("freeSlots", instance.getNumberOfAvailableSlots());
          gen.writeNumberField("cpuCores", instance.getResources().getNumberOfCPUCores());
          gen.writeNumberField("physicalMemory", instance.getResources().getSizeOfPhysicalMemory());
          gen.writeNumberField("freeMemory", instance.getResources().getSizeOfJvmHeap());
          gen.writeNumberField("managedMemory", instance.getResources().getSizeOfManagedMemory());

          // only send metrics when only one task manager requests them.
          if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
            byte[] report = instance.getLastMetricsReport();
            if (report != null) {
              gen.writeFieldName("metrics");
              gen.writeRawValue(new String(report, "utf-8"));
            }
          }

          gen.writeEndObject();
        }

        gen.writeEndArray();
        gen.writeEndObject();

        gen.close();
        return writer.toString();
      } else {
        throw new Exception("No connection to the leading JobManager.");
      }
    } catch (Exception e) {
      throw new RuntimeException("Failed to fetch list of all task managers: " + e.getMessage(), e);
    }
  }
  @Test
  public void testBuildDeploymentDescriptor() {
    try {
      final JobID jobId = new JobID();

      final JobVertexID jid1 = new JobVertexID();
      final JobVertexID jid2 = new JobVertexID();
      final JobVertexID jid3 = new JobVertexID();
      final JobVertexID jid4 = new JobVertexID();

      JobVertex v1 = new JobVertex("v1", jid1);
      JobVertex v2 = new JobVertex("v2", jid2);
      JobVertex v3 = new JobVertex("v3", jid3);
      JobVertex v4 = new JobVertex("v4", jid4);

      v1.setParallelism(10);
      v2.setParallelism(10);
      v3.setParallelism(10);
      v4.setParallelism(10);

      v1.setInvokableClass(BatchTask.class);
      v2.setInvokableClass(BatchTask.class);
      v3.setInvokableClass(BatchTask.class);
      v4.setInvokableClass(BatchTask.class);

      v2.connectNewDataSetAsInput(v1, DistributionPattern.ALL_TO_ALL);
      v3.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);
      v4.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL);

      ExecutionGraph eg =
          new ExecutionGraph(
              TestingUtils.defaultExecutionContext(),
              jobId,
              "some job",
              new Configuration(),
              new SerializedValue<>(new ExecutionConfig()),
              AkkaUtils.getDefaultTimeout(),
              new NoRestartStrategy());

      List<JobVertex> ordered = Arrays.asList(v1, v2, v3, v4);

      eg.attachJobGraph(ordered);

      ExecutionJobVertex ejv = eg.getAllVertices().get(jid2);
      ExecutionVertex vertex = ejv.getTaskVertices()[3];

      ExecutionGraphTestUtils.SimpleActorGateway instanceGateway =
          new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext());

      final Instance instance = getInstance(instanceGateway);

      final SimpleSlot slot = instance.allocateSimpleSlot(jobId);

      assertEquals(ExecutionState.CREATED, vertex.getExecutionState());

      vertex.deployToSlot(slot);

      assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());

      TaskDeploymentDescriptor descr = instanceGateway.lastTDD;
      assertNotNull(descr);

      assertEquals(jobId, descr.getJobID());
      assertEquals(jid2, descr.getVertexID());
      assertEquals(3, descr.getIndexInSubtaskGroup());
      assertEquals(10, descr.getNumberOfSubtasks());
      assertEquals(BatchTask.class.getName(), descr.getInvokableClassName());
      assertEquals("v2", descr.getTaskName());

      List<ResultPartitionDeploymentDescriptor> producedPartitions = descr.getProducedPartitions();
      List<InputGateDeploymentDescriptor> consumedPartitions = descr.getInputGates();

      assertEquals(2, producedPartitions.size());
      assertEquals(1, consumedPartitions.size());

      assertEquals(10, producedPartitions.get(0).getNumberOfSubpartitions());
      assertEquals(10, producedPartitions.get(1).getNumberOfSubpartitions());
      assertEquals(10, consumedPartitions.get(0).getInputChannelDeploymentDescriptors().length);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
  }
Example #8
0
  void scheduleOrUpdateConsumers(List<List<ExecutionEdge>> allConsumers) {
    final int numConsumers = allConsumers.size();

    if (numConsumers > 1) {
      fail(
          new IllegalStateException(
              "Currently, only a single consumer group per partition is supported."));
    } else if (numConsumers == 0) {
      return;
    }

    for (ExecutionEdge edge : allConsumers.get(0)) {
      final ExecutionVertex consumerVertex = edge.getTarget();

      final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
      final ExecutionState consumerState = consumer.getState();

      final IntermediateResultPartition partition = edge.getSource();

      // ----------------------------------------------------------------
      // Consumer is created => try to deploy and cache input channel
      // descriptors if there is a deployment race
      // ----------------------------------------------------------------
      if (consumerState == CREATED) {
        final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();

        consumerVertex.cachePartitionInfo(
            PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));

        // When deploying a consuming task, its task deployment descriptor will contain all
        // deployment information available at the respective time. It is possible that some
        // of the partitions to be consumed have not been created yet. These are updated
        // runtime via the update messages.
        //
        // TODO The current approach may send many update messages even though the consuming
        // task has already been deployed with all necessary information. We have to check
        // whether this is a problem and fix it, if it is.
        future(
            new Callable<Boolean>() {
              @Override
              public Boolean call() throws Exception {
                try {
                  consumerVertex.scheduleForExecution(
                      consumerVertex.getExecutionGraph().getScheduler(),
                      consumerVertex.getExecutionGraph().isQueuedSchedulingAllowed());
                } catch (Throwable t) {
                  fail(
                      new IllegalStateException(
                          "Could not schedule consumer " + "vertex " + consumerVertex, t));
                }

                return true;
              }
            },
            executionContext);

        // double check to resolve race conditions
        if (consumerVertex.getExecutionState() == RUNNING) {
          consumerVertex.sendPartitionInfos();
        }
      }
      // ----------------------------------------------------------------
      // Consumer is running => send update message now
      // ----------------------------------------------------------------
      else {
        if (consumerState == RUNNING) {
          final SimpleSlot consumerSlot = consumer.getAssignedResource();

          if (consumerSlot == null) {
            // The consumer has been reset concurrently
            continue;
          }

          final Instance consumerInstance = consumerSlot.getInstance();

          final ResultPartitionID partitionId =
              new ResultPartitionID(partition.getPartitionId(), attemptId);

          final Instance partitionInstance =
              partition.getProducer().getCurrentAssignedResource().getInstance();

          final ResultPartitionLocation partitionLocation;

          if (consumerInstance.equals(partitionInstance)) {
            // Consuming task is deployed to the same instance as the partition => local
            partitionLocation = ResultPartitionLocation.createLocal();
          } else {
            // Different instances => remote
            final ConnectionID connectionId =
                new ConnectionID(
                    partitionInstance.getInstanceConnectionInfo(),
                    partition.getIntermediateResult().getConnectionIndex());

            partitionLocation = ResultPartitionLocation.createRemote(connectionId);
          }

          final InputChannelDeploymentDescriptor descriptor =
              new InputChannelDeploymentDescriptor(partitionId, partitionLocation);

          final UpdatePartitionInfo updateTaskMessage =
              new UpdateTaskSinglePartitionInfo(
                  consumer.getAttemptId(), partition.getIntermediateResult().getId(), descriptor);

          sendUpdatePartitionInfoRpcCall(consumerSlot, updateTaskMessage);
        }
        // ----------------------------------------------------------------
        // Consumer is scheduled or deploying => cache input channel
        // deployment descriptors and send update message later
        // ----------------------------------------------------------------
        else if (consumerState == SCHEDULED || consumerState == DEPLOYING) {
          final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();

          consumerVertex.cachePartitionInfo(
              PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));

          // double check to resolve race conditions
          if (consumerVertex.getExecutionState() == RUNNING) {
            consumerVertex.sendPartitionInfos();
          }
        }
      }
    }
  }
Example #9
0
  public void deployToSlot(final SimpleSlot slot) throws JobException {
    // sanity checks
    if (slot == null) {
      throw new NullPointerException();
    }
    if (!slot.isAlive()) {
      throw new JobException("Target slot for deployment is not alive.");
    }

    // make sure exactly one deployment call happens from the correct state
    // note: the transition from CREATED to DEPLOYING is for testing purposes only
    ExecutionState previous = this.state;
    if (previous == SCHEDULED || previous == CREATED) {
      if (!transitionState(previous, DEPLOYING)) {
        // race condition, someone else beat us to the deploying call.
        // this should actually not happen and indicates a race somewhere else
        throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
      }
    } else {
      // vertex may have been cancelled, or it was already scheduled
      throw new IllegalStateException(
          "The vertex must be in CREATED or SCHEDULED state to be deployed. Found state "
              + previous);
    }

    try {
      // good, we are allowed to deploy
      if (!slot.setExecutedVertex(this)) {
        throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
      }
      this.assignedResource = slot;
      this.assignedResourceLocation = slot.getInstance().getInstanceConnectionInfo();

      // race double check, did we fail/cancel and do we need to release the slot?
      if (this.state != DEPLOYING) {
        slot.releaseSlot();
        return;
      }

      if (LOG.isInfoEnabled()) {
        LOG.info(
            String.format(
                "Deploying %s (attempt #%d) to %s",
                vertex.getSimpleName(),
                attemptNumber,
                slot.getInstance().getInstanceConnectionInfo().getHostname()));
      }

      final TaskDeploymentDescriptor deployment =
          vertex.createDeploymentDescriptor(
              attemptId, slot, operatorState, recoveryTimestamp, attemptNumber);

      // register this execution at the execution graph, to receive call backs
      vertex.getExecutionGraph().registerExecution(this);

      final Instance instance = slot.getInstance();
      final ActorGateway gateway = instance.getActorGateway();

      final Future<Object> deployAction = gateway.ask(new SubmitTask(deployment), timeout);

      deployAction.onComplete(
          new OnComplete<Object>() {

            @Override
            public void onComplete(Throwable failure, Object success) throws Throwable {
              if (failure != null) {
                if (failure instanceof TimeoutException) {
                  String taskname =
                      deployment.getTaskInfo().getTaskNameWithSubtasks() + " (" + attemptId + ')';

                  markFailed(
                      new Exception(
                          "Cannot deploy task "
                              + taskname
                              + " - TaskManager ("
                              + instance
                              + ") not responding after a timeout of "
                              + timeout,
                          failure));
                } else {
                  markFailed(failure);
                }
              } else {
                if (!(success.equals(Messages.getAcknowledge()))) {
                  markFailed(
                      new Exception(
                          "Failed to deploy the task to slot "
                              + slot
                              + ": Response was not of type Acknowledge"));
                }
              }
            }
          },
          executionContext);
    } catch (Throwable t) {
      markFailed(t);
      ExceptionUtils.rethrow(t);
    }
  }