コード例 #1
0
  /** Constructor. */
  public ClusterManager() {

    this.registeredHosts = new HashMap<InstanceConnectionInfo, ClusterInstance>();

    this.slicesOfJobs = new HashMap<JobID, List<AllocatedSlice>>();

    // Load the instance type this cluster can offer
    this.availableInstanceTypes = populateInstanceTypeArray();

    this.instanceAccommodationMatrix = calculateInstanceAccommodationMatrix();

    this.instanceTypeDescriptionMap =
        new SerializableHashMap<InstanceType, InstanceTypeDescription>();

    long tmpCleanUpInterval =
        (long) GlobalConfiguration.getInteger(CLEANUP_INTERVAL_KEY, DEFAULT_CLEANUP_INTERVAL)
            * 1000;

    if (tmpCleanUpInterval < 10) { // Clean up interval must be at least ten seconds
      LOG.warn(
          "Invalid clean up interval. Reverting to default cleanup interval of "
              + DEFAULT_CLEANUP_INTERVAL
              + " secs.");
      tmpCleanUpInterval = DEFAULT_CLEANUP_INTERVAL;
    }

    this.cleanUpInterval = tmpCleanUpInterval;

    int tmpDefaultInstanceTypeIndex =
        GlobalConfiguration.getInteger(
            DEFAULT_INSTANCE_TYPE_INDEX_KEY, ConfigConstants.DEFAULT_DEFAULT_INSTANCE_TYPE_INDEX);

    if (tmpDefaultInstanceTypeIndex > this.availableInstanceTypes.length) {
      LOG.warn(
          "Incorrect index to for default instance type ("
              + tmpDefaultInstanceTypeIndex
              + "), switching to default index "
              + ConfigConstants.DEFAULT_DEFAULT_INSTANCE_TYPE_INDEX);

      tmpDefaultInstanceTypeIndex = ConfigConstants.DEFAULT_DEFAULT_INSTANCE_TYPE_INDEX;
    }

    this.defaultInstanceType = this.availableInstanceTypes[tmpDefaultInstanceTypeIndex - 1];

    // sort available instances by CPU core
    sortAvailableInstancesByNumberOfCPUCores();

    // load the network topology from the slave file
    this.networkTopology = loadNetworkTopology();

    // load IP to instance type mapping from slave file
    loadIPToInstanceTypeMapping();

    // look every BASEINTERVAL milliseconds for crashed hosts
    final boolean runTimerAsDaemon = true;
    new Timer(runTimerAsDaemon).schedule(cleanupStaleMachines, 1000, 1000);

    // Load available instance types into the instance description list
    updateInstaceTypeDescriptionMap();
  }
コード例 #2
0
  /** This method send the periodic heartbeats. */
  private void runHeartbeatLoop() {
    final long interval =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_HEARTBEAT_INTERVAL_KEY,
            ConfigConstants.DEFAULT_TASK_MANAGER_HEARTBEAT_INTERVAL);

    try {
      while (!shutdownStarted.get()) {
        RegisterTaskManagerResult result =
            this.jobManager.registerTaskManager(
                this.localInstanceConnectionInfo,
                this.hardwareDescription,
                new IntegerRecord(this.numberOfSlots));

        if (result.getReturnCode() == RegisterTaskManagerResult.ReturnCode.SUCCESS) {
          break;
        }

        try {
          Thread.sleep(50);
        } catch (InterruptedException e) {
          if (!shutdownStarted.get()) {
            LOG.error("TaskManager register task manager loop was interrupted without shutdown.");
          }
        }
      }

    } catch (IOException e) {
      if (!shutdownStarted.get()) {
        LOG.error("Registering task manager caused an exception: " + e.getMessage(), e);
      }
      return;
    }

    while (!shutdownStarted.get()) {
      // sleep until the next heart beat
      try {
        Thread.sleep(interval);
      } catch (InterruptedException e) {
        if (!shutdownStarted.get()) {
          LOG.error("TaskManager heart beat loop was interrupted without shutdown.");
        }
      }

      // send heart beat
      try {
        LOG.debug("heartbeat");
        this.jobManager.sendHeartbeat(this.localInstanceConnectionInfo);
      } catch (IOException e) {
        if (shutdownStarted.get()) {
          break;
        } else {
          LOG.error("Sending the heart beat caused an exception: " + e.getMessage(), e);
        }
      }
    }
  }
コード例 #3
0
  /**
   * All parameters are obtained from the {@link GlobalConfiguration}, which must be loaded prior to
   * instantiating the task manager.
   */
  public TaskManager(ExecutionMode executionMode) throws Exception {
    if (executionMode == null) {
      throw new NullPointerException("Execution mode must not be null.");
    }

    RevisionInformation rev = JobManagerUtils.getRevisionInformation();
    LOG.info(
        "Starting Stratosphere TaskManager "
            + "(Version: "
            + JobManagerUtils.getVersion()
            + ", "
            + "Rev:"
            + rev.commitId
            + ", "
            + "Date:"
            + rev.commitDate
            + ")");

    try {
      LOG.info(
          "TaskManager started as user "
              + UserGroupInformation.getCurrentUser().getShortUserName());
    } catch (Throwable t) {
      LOG.error("Cannot determine user group information.", t);
    }

    LOG.info("Execution mode: " + executionMode);

    // IMPORTANT! At this point, the GlobalConfiguration must have been read!

    final InetSocketAddress jobManagerAddress;
    {
      LOG.info("Reading location of job manager from configuration");

      final String address =
          GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null);
      final int port =
          GlobalConfiguration.getInteger(
              ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
              ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);

      if (address == null) {
        throw new Exception("Job manager address not configured in the GlobalConfiguration.");
      }

      // Try to convert configured address to {@link InetAddress}
      try {
        final InetAddress tmpAddress = InetAddress.getByName(address);
        jobManagerAddress = new InetSocketAddress(tmpAddress, port);
      } catch (UnknownHostException e) {
        LOG.fatal("Could not resolve JobManager host name.");
        throw new Exception("Could not resolve JobManager host name: " + e.getMessage(), e);
      }

      LOG.info("Connecting to JobManager at: " + jobManagerAddress);
    }

    // Create RPC connection to the JobManager
    try {
      this.jobManager =
          RPC.getProxy(JobManagerProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal("Could not connect to the JobManager: " + e.getMessage(), e);
      throw new Exception("Failed to initialize connection to JobManager: " + e.getMessage(), e);
    }

    int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, -1);
    int dataPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, -1);
    if (ipcPort == -1) {
      ipcPort = getAvailablePort();
    }
    if (dataPort == -1) {
      dataPort = getAvailablePort();
    }

    // Determine our own public facing address and start the server
    {
      final InetAddress taskManagerAddress;
      try {
        taskManagerAddress = getTaskManagerAddress(jobManagerAddress);
      } catch (Exception e) {
        throw new RuntimeException(
            "The TaskManager failed to determine its own network address.", e);
      }

      this.localInstanceConnectionInfo =
          new InstanceConnectionInfo(taskManagerAddress, ipcPort, dataPort);
      LOG.info("TaskManager connection information:" + this.localInstanceConnectionInfo);

      // Start local RPC server
      try {
        this.taskManagerServer =
            RPC.getServer(this, taskManagerAddress.getHostAddress(), ipcPort, IPC_HANDLER_COUNT);
        this.taskManagerServer.start();
      } catch (IOException e) {
        LOG.fatal("Failed to start TaskManager server. " + e.getMessage(), e);
        throw new Exception("Failed to start taskmanager server. " + e.getMessage(), e);
      }
    }

    // Try to create local stub of the global input split provider
    try {
      this.globalInputSplitProvider =
          RPC.getProxy(
              InputSplitProviderProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal(e.getMessage(), e);
      throw new Exception(
          "Failed to initialize connection to global input split provider: " + e.getMessage(), e);
    }

    // Try to create local stub for the lookup service
    try {
      this.lookupService =
          RPC.getProxy(ChannelLookupProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal(e.getMessage(), e);
      throw new Exception("Failed to initialize channel lookup protocol. " + e.getMessage(), e);
    }

    // Try to create local stub for the accumulators
    try {
      this.accumulatorProtocolProxy =
          RPC.getProxy(AccumulatorProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal("Failed to initialize accumulator protocol: " + e.getMessage(), e);
      throw new Exception("Failed to initialize accumulator protocol: " + e.getMessage(), e);
    }

    // Load profiler if it should be used
    if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) {

      final String profilerClassName =
          GlobalConfiguration.getString(
              ProfilingUtils.TASKMANAGER_CLASSNAME_KEY,
              "eu.stratosphere.nephele.profiling.impl.TaskManagerProfilerImpl");

      this.profiler =
          ProfilingUtils.loadTaskManagerProfiler(
              profilerClassName, jobManagerAddress.getAddress(), this.localInstanceConnectionInfo);

      if (this.profiler == null) {
        LOG.error("Cannot find class name for the profiler.");
      } else {
        LOG.info("Profiling of jobs is enabled.");
      }
    } else {
      this.profiler = null;
      LOG.info("Profiling of jobs is disabled.");
    }

    // Get the directory for storing temporary files
    final String[] tmpDirPaths =
        GlobalConfiguration.getString(
                ConfigConstants.TASK_MANAGER_TMP_DIR_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH)
            .split(",|" + File.pathSeparator);

    checkTempDirs(tmpDirPaths);

    final int pageSize =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
            ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

    // Initialize network buffer pool
    int numBuffers =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY,
            ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_NUM_BUFFERS);

    int bufferSize =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
            ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

    // Initialize the channel manager
    try {
      NetworkConnectionManager networkConnectionManager = null;

      switch (executionMode) {
        case LOCAL:
          networkConnectionManager = new LocalConnectionManager();
          break;
        case CLUSTER:
          int numInThreads =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NUM_IN_THREADS_KEY,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_IN_THREADS);

          int numOutThreads =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NUM_OUT_THREADS_KEY,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_OUT_THREADS);

          int lowWaterMark =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NETTY_LOW_WATER_MARK,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_LOW_WATER_MARK);

          int highWaterMark =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK);

          networkConnectionManager =
              new NettyConnectionManager(
                  localInstanceConnectionInfo.address(),
                  localInstanceConnectionInfo.dataPort(),
                  bufferSize,
                  numInThreads,
                  numOutThreads,
                  lowWaterMark,
                  highWaterMark);
          break;
      }

      channelManager =
          new ChannelManager(
              lookupService,
              localInstanceConnectionInfo,
              numBuffers,
              bufferSize,
              networkConnectionManager);
    } catch (IOException ioe) {
      LOG.error(StringUtils.stringifyException(ioe));
      throw new Exception("Failed to instantiate channel manager. " + ioe.getMessage(), ioe);
    }

    {
      HardwareDescription resources = HardwareDescriptionFactory.extractFromSystem();

      int slots = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, -1);
      if (slots == -1) {
        slots = Hardware.getNumberCPUCores();
      } else if (slots <= 0) {
        throw new Exception("Illegal value for the number of task slots: " + slots);
      }
      this.numberOfSlots = slots;

      // Check whether the memory size has been explicitly configured. if so that overrides the
      // default mechanism
      // of taking as much as is mentioned in the hardware description
      long memorySize =
          GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1);

      if (memorySize > 0) {
        // manually configured memory size. override the value in the hardware config
        resources =
            HardwareDescriptionFactory.construct(
                resources.getNumberOfCPUCores(),
                resources.getSizeOfPhysicalMemory(),
                memorySize * 1024L * 1024L);
      }
      this.hardwareDescription = resources;

      // Initialize the memory manager
      LOG.info(
          "Initializing memory manager with "
              + (resources.getSizeOfFreeMemory() >>> 20)
              + " megabytes of memory. "
              + "Page size is "
              + pageSize
              + " bytes.");

      try {
        @SuppressWarnings("unused")
        final boolean lazyAllocation =
            GlobalConfiguration.getBoolean(
                ConfigConstants.TASK_MANAGER_MEMORY_LAZY_ALLOCATION_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_MEMORY_LAZY_ALLOCATION);

        this.memoryManager =
            new DefaultMemoryManager(resources.getSizeOfFreeMemory(), this.numberOfSlots, pageSize);
      } catch (Throwable t) {
        LOG.fatal(
            "Unable to initialize memory manager with "
                + (resources.getSizeOfFreeMemory() >>> 20)
                + " megabytes of memory.",
            t);
        throw new Exception("Unable to initialize memory manager.", t);
      }
    }

    this.ioManager = new IOManager(tmpDirPaths);

    this.heartbeatThread =
        new Thread() {
          @Override
          public void run() {
            runHeartbeatLoop();
          }
        };

    this.heartbeatThread.setName("Heartbeat Thread");
    this.heartbeatThread.start();

    // --------------------------------------------------------------------
    // Memory Usage
    // --------------------------------------------------------------------

    final MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean();
    final List<GarbageCollectorMXBean> gcMXBeans = ManagementFactory.getGarbageCollectorMXBeans();

    LOG.info(getMemoryUsageStatsAsString(memoryMXBean));

    boolean startMemoryUsageLogThread =
        GlobalConfiguration.getBoolean(
            ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD,
            ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD);

    if (startMemoryUsageLogThread && LOG.isDebugEnabled()) {
      final int logIntervalMs =
          GlobalConfiguration.getInteger(
              ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS,
              ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS);

      new Thread(
              new Runnable() {
                @Override
                public void run() {
                  try {
                    while (!isShutDown()) {
                      Thread.sleep(logIntervalMs);

                      LOG.debug(getMemoryUsageStatsAsString(memoryMXBean));

                      LOG.debug(getGarbageCollectorStatsAsString(gcMXBeans));
                    }
                  } catch (InterruptedException e) {
                    LOG.warn("Unexpected interruption of memory usage logger thread.");
                  }
                }
              })
          .start();
    }
  }