/**
   * All parameters are obtained from the {@link GlobalConfiguration}, which must be loaded prior to
   * instantiating the task manager.
   */
  public TaskManager(ExecutionMode executionMode) throws Exception {
    if (executionMode == null) {
      throw new NullPointerException("Execution mode must not be null.");
    }

    RevisionInformation rev = JobManagerUtils.getRevisionInformation();
    LOG.info(
        "Starting Stratosphere TaskManager "
            + "(Version: "
            + JobManagerUtils.getVersion()
            + ", "
            + "Rev:"
            + rev.commitId
            + ", "
            + "Date:"
            + rev.commitDate
            + ")");

    try {
      LOG.info(
          "TaskManager started as user "
              + UserGroupInformation.getCurrentUser().getShortUserName());
    } catch (Throwable t) {
      LOG.error("Cannot determine user group information.", t);
    }

    LOG.info("Execution mode: " + executionMode);

    // IMPORTANT! At this point, the GlobalConfiguration must have been read!

    final InetSocketAddress jobManagerAddress;
    {
      LOG.info("Reading location of job manager from configuration");

      final String address =
          GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null);
      final int port =
          GlobalConfiguration.getInteger(
              ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
              ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);

      if (address == null) {
        throw new Exception("Job manager address not configured in the GlobalConfiguration.");
      }

      // Try to convert configured address to {@link InetAddress}
      try {
        final InetAddress tmpAddress = InetAddress.getByName(address);
        jobManagerAddress = new InetSocketAddress(tmpAddress, port);
      } catch (UnknownHostException e) {
        LOG.fatal("Could not resolve JobManager host name.");
        throw new Exception("Could not resolve JobManager host name: " + e.getMessage(), e);
      }

      LOG.info("Connecting to JobManager at: " + jobManagerAddress);
    }

    // Create RPC connection to the JobManager
    try {
      this.jobManager =
          RPC.getProxy(JobManagerProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal("Could not connect to the JobManager: " + e.getMessage(), e);
      throw new Exception("Failed to initialize connection to JobManager: " + e.getMessage(), e);
    }

    int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, -1);
    int dataPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, -1);
    if (ipcPort == -1) {
      ipcPort = getAvailablePort();
    }
    if (dataPort == -1) {
      dataPort = getAvailablePort();
    }

    // Determine our own public facing address and start the server
    {
      final InetAddress taskManagerAddress;
      try {
        taskManagerAddress = getTaskManagerAddress(jobManagerAddress);
      } catch (Exception e) {
        throw new RuntimeException(
            "The TaskManager failed to determine its own network address.", e);
      }

      this.localInstanceConnectionInfo =
          new InstanceConnectionInfo(taskManagerAddress, ipcPort, dataPort);
      LOG.info("TaskManager connection information:" + this.localInstanceConnectionInfo);

      // Start local RPC server
      try {
        this.taskManagerServer =
            RPC.getServer(this, taskManagerAddress.getHostAddress(), ipcPort, IPC_HANDLER_COUNT);
        this.taskManagerServer.start();
      } catch (IOException e) {
        LOG.fatal("Failed to start TaskManager server. " + e.getMessage(), e);
        throw new Exception("Failed to start taskmanager server. " + e.getMessage(), e);
      }
    }

    // Try to create local stub of the global input split provider
    try {
      this.globalInputSplitProvider =
          RPC.getProxy(
              InputSplitProviderProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal(e.getMessage(), e);
      throw new Exception(
          "Failed to initialize connection to global input split provider: " + e.getMessage(), e);
    }

    // Try to create local stub for the lookup service
    try {
      this.lookupService =
          RPC.getProxy(ChannelLookupProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal(e.getMessage(), e);
      throw new Exception("Failed to initialize channel lookup protocol. " + e.getMessage(), e);
    }

    // Try to create local stub for the accumulators
    try {
      this.accumulatorProtocolProxy =
          RPC.getProxy(AccumulatorProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
    } catch (IOException e) {
      LOG.fatal("Failed to initialize accumulator protocol: " + e.getMessage(), e);
      throw new Exception("Failed to initialize accumulator protocol: " + e.getMessage(), e);
    }

    // Load profiler if it should be used
    if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) {

      final String profilerClassName =
          GlobalConfiguration.getString(
              ProfilingUtils.TASKMANAGER_CLASSNAME_KEY,
              "eu.stratosphere.nephele.profiling.impl.TaskManagerProfilerImpl");

      this.profiler =
          ProfilingUtils.loadTaskManagerProfiler(
              profilerClassName, jobManagerAddress.getAddress(), this.localInstanceConnectionInfo);

      if (this.profiler == null) {
        LOG.error("Cannot find class name for the profiler.");
      } else {
        LOG.info("Profiling of jobs is enabled.");
      }
    } else {
      this.profiler = null;
      LOG.info("Profiling of jobs is disabled.");
    }

    // Get the directory for storing temporary files
    final String[] tmpDirPaths =
        GlobalConfiguration.getString(
                ConfigConstants.TASK_MANAGER_TMP_DIR_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH)
            .split(",|" + File.pathSeparator);

    checkTempDirs(tmpDirPaths);

    final int pageSize =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
            ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

    // Initialize network buffer pool
    int numBuffers =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY,
            ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_NUM_BUFFERS);

    int bufferSize =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
            ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

    // Initialize the channel manager
    try {
      NetworkConnectionManager networkConnectionManager = null;

      switch (executionMode) {
        case LOCAL:
          networkConnectionManager = new LocalConnectionManager();
          break;
        case CLUSTER:
          int numInThreads =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NUM_IN_THREADS_KEY,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_IN_THREADS);

          int numOutThreads =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NUM_OUT_THREADS_KEY,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_OUT_THREADS);

          int lowWaterMark =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NETTY_LOW_WATER_MARK,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_LOW_WATER_MARK);

          int highWaterMark =
              GlobalConfiguration.getInteger(
                  ConfigConstants.TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK,
                  ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK);

          networkConnectionManager =
              new NettyConnectionManager(
                  localInstanceConnectionInfo.address(),
                  localInstanceConnectionInfo.dataPort(),
                  bufferSize,
                  numInThreads,
                  numOutThreads,
                  lowWaterMark,
                  highWaterMark);
          break;
      }

      channelManager =
          new ChannelManager(
              lookupService,
              localInstanceConnectionInfo,
              numBuffers,
              bufferSize,
              networkConnectionManager);
    } catch (IOException ioe) {
      LOG.error(StringUtils.stringifyException(ioe));
      throw new Exception("Failed to instantiate channel manager. " + ioe.getMessage(), ioe);
    }

    {
      HardwareDescription resources = HardwareDescriptionFactory.extractFromSystem();

      int slots = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, -1);
      if (slots == -1) {
        slots = Hardware.getNumberCPUCores();
      } else if (slots <= 0) {
        throw new Exception("Illegal value for the number of task slots: " + slots);
      }
      this.numberOfSlots = slots;

      // Check whether the memory size has been explicitly configured. if so that overrides the
      // default mechanism
      // of taking as much as is mentioned in the hardware description
      long memorySize =
          GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1);

      if (memorySize > 0) {
        // manually configured memory size. override the value in the hardware config
        resources =
            HardwareDescriptionFactory.construct(
                resources.getNumberOfCPUCores(),
                resources.getSizeOfPhysicalMemory(),
                memorySize * 1024L * 1024L);
      }
      this.hardwareDescription = resources;

      // Initialize the memory manager
      LOG.info(
          "Initializing memory manager with "
              + (resources.getSizeOfFreeMemory() >>> 20)
              + " megabytes of memory. "
              + "Page size is "
              + pageSize
              + " bytes.");

      try {
        @SuppressWarnings("unused")
        final boolean lazyAllocation =
            GlobalConfiguration.getBoolean(
                ConfigConstants.TASK_MANAGER_MEMORY_LAZY_ALLOCATION_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_MEMORY_LAZY_ALLOCATION);

        this.memoryManager =
            new DefaultMemoryManager(resources.getSizeOfFreeMemory(), this.numberOfSlots, pageSize);
      } catch (Throwable t) {
        LOG.fatal(
            "Unable to initialize memory manager with "
                + (resources.getSizeOfFreeMemory() >>> 20)
                + " megabytes of memory.",
            t);
        throw new Exception("Unable to initialize memory manager.", t);
      }
    }

    this.ioManager = new IOManager(tmpDirPaths);

    this.heartbeatThread =
        new Thread() {
          @Override
          public void run() {
            runHeartbeatLoop();
          }
        };

    this.heartbeatThread.setName("Heartbeat Thread");
    this.heartbeatThread.start();

    // --------------------------------------------------------------------
    // Memory Usage
    // --------------------------------------------------------------------

    final MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean();
    final List<GarbageCollectorMXBean> gcMXBeans = ManagementFactory.getGarbageCollectorMXBeans();

    LOG.info(getMemoryUsageStatsAsString(memoryMXBean));

    boolean startMemoryUsageLogThread =
        GlobalConfiguration.getBoolean(
            ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD,
            ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD);

    if (startMemoryUsageLogThread && LOG.isDebugEnabled()) {
      final int logIntervalMs =
          GlobalConfiguration.getInteger(
              ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS,
              ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS);

      new Thread(
              new Runnable() {
                @Override
                public void run() {
                  try {
                    while (!isShutDown()) {
                      Thread.sleep(logIntervalMs);

                      LOG.debug(getMemoryUsageStatsAsString(memoryMXBean));

                      LOG.debug(getGarbageCollectorStatsAsString(gcMXBeans));
                    }
                  } catch (InterruptedException e) {
                    LOG.warn("Unexpected interruption of memory usage logger thread.");
                  }
                }
              })
          .start();
    }
  }
Esempio n. 2
0
  /**
   * Creates a new {@link ClusterInstance} object to manage instances that can be executed on that
   * host.
   *
   * @param instanceConnectionInfo the connection information for the instance
   * @param hardwareDescription the hardware description provided by the new instance
   * @return a new {@link ClusterInstance} object or <code>null</code> if the cluster instance could
   *     not be created
   */
  private ClusterInstance createNewHost(
      final InstanceConnectionInfo instanceConnectionInfo,
      final HardwareDescription hardwareDescription) {

    // Check if there is a user-defined instance type for this IP address
    InstanceType instanceType =
        this.ipToInstanceTypeMapping.get(instanceConnectionInfo.getAddress());
    if (instanceType != null) {
      LOG.info(
          "Found user-defined instance type for cluster instance with IP "
              + instanceConnectionInfo.getAddress()
              + ": "
              + instanceType);
    } else {
      instanceType = matchHardwareDescriptionWithInstanceType(hardwareDescription);
      if (instanceType != null) {
        LOG.info(
            "Hardware profile of cluster instance with IP "
                + instanceConnectionInfo.getAddress()
                + " matches with instance type "
                + instanceType);
      } else {
        LOG.error("No matching instance type, cannot create cluster instance");
        return null;
      }
    }

    // Try to match new host with a stub host from the existing topology
    String instanceName = instanceConnectionInfo.getHostName();
    NetworkNode parentNode = this.networkTopology.getRootNode();
    NetworkNode currentStubNode = null;

    // Try to match new host using the host name
    while (true) {

      currentStubNode = this.networkTopology.getNodeByName(instanceName);
      if (currentStubNode != null) {
        break;
      }

      final int pos = instanceName.lastIndexOf('.');
      if (pos == -1) {
        break;
      }

      /*
       * If host name is reported as FQDN, iterative remove parts
       * of the domain name until a match occurs or no more dots
       * can be found in the host name.
       */
      instanceName = instanceName.substring(0, pos);
    }

    // Try to match the new host using the IP address
    if (currentStubNode == null) {
      instanceName = instanceConnectionInfo.getAddress().toString();
      instanceName = instanceName.replaceAll("/", ""); // Remove any / characters
      currentStubNode = this.networkTopology.getNodeByName(instanceName);
    }

    if (currentStubNode != null) {
      /*
       * The instance name will be the same as the one of the stub node. That way
       * the stub now will be removed from the network topology and replaced be
       * the new node.
       */
      if (currentStubNode.getParentNode() != null) {
        parentNode = currentStubNode.getParentNode();
      }
      // Remove the stub node from the tree
      currentStubNode.remove();
    }

    LOG.info(
        "Creating instance of type "
            + instanceType
            + " for "
            + instanceConnectionInfo
            + ", parent is "
            + parentNode.getName());
    final ClusterInstance host =
        new ClusterInstance(
            instanceConnectionInfo,
            instanceType,
            parentNode,
            this.networkTopology,
            hardwareDescription);

    return host;
  }