/** * All parameters are obtained from the {@link GlobalConfiguration}, which must be loaded prior to * instantiating the task manager. */ public TaskManager(ExecutionMode executionMode) throws Exception { if (executionMode == null) { throw new NullPointerException("Execution mode must not be null."); } RevisionInformation rev = JobManagerUtils.getRevisionInformation(); LOG.info( "Starting Stratosphere TaskManager " + "(Version: " + JobManagerUtils.getVersion() + ", " + "Rev:" + rev.commitId + ", " + "Date:" + rev.commitDate + ")"); try { LOG.info( "TaskManager started as user " + UserGroupInformation.getCurrentUser().getShortUserName()); } catch (Throwable t) { LOG.error("Cannot determine user group information.", t); } LOG.info("Execution mode: " + executionMode); // IMPORTANT! At this point, the GlobalConfiguration must have been read! final InetSocketAddress jobManagerAddress; { LOG.info("Reading location of job manager from configuration"); final String address = GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null); final int port = GlobalConfiguration.getInteger( ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT); if (address == null) { throw new Exception("Job manager address not configured in the GlobalConfiguration."); } // Try to convert configured address to {@link InetAddress} try { final InetAddress tmpAddress = InetAddress.getByName(address); jobManagerAddress = new InetSocketAddress(tmpAddress, port); } catch (UnknownHostException e) { LOG.fatal("Could not resolve JobManager host name."); throw new Exception("Could not resolve JobManager host name: " + e.getMessage(), e); } LOG.info("Connecting to JobManager at: " + jobManagerAddress); } // Create RPC connection to the JobManager try { this.jobManager = RPC.getProxy(JobManagerProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal("Could not connect to the JobManager: " + e.getMessage(), e); throw new Exception("Failed to initialize connection to JobManager: " + e.getMessage(), e); } int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, -1); int dataPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, -1); if (ipcPort == -1) { ipcPort = getAvailablePort(); } if (dataPort == -1) { dataPort = getAvailablePort(); } // Determine our own public facing address and start the server { final InetAddress taskManagerAddress; try { taskManagerAddress = getTaskManagerAddress(jobManagerAddress); } catch (Exception e) { throw new RuntimeException( "The TaskManager failed to determine its own network address.", e); } this.localInstanceConnectionInfo = new InstanceConnectionInfo(taskManagerAddress, ipcPort, dataPort); LOG.info("TaskManager connection information:" + this.localInstanceConnectionInfo); // Start local RPC server try { this.taskManagerServer = RPC.getServer(this, taskManagerAddress.getHostAddress(), ipcPort, IPC_HANDLER_COUNT); this.taskManagerServer.start(); } catch (IOException e) { LOG.fatal("Failed to start TaskManager server. " + e.getMessage(), e); throw new Exception("Failed to start taskmanager server. " + e.getMessage(), e); } } // Try to create local stub of the global input split provider try { this.globalInputSplitProvider = RPC.getProxy( InputSplitProviderProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal(e.getMessage(), e); throw new Exception( "Failed to initialize connection to global input split provider: " + e.getMessage(), e); } // Try to create local stub for the lookup service try { this.lookupService = RPC.getProxy(ChannelLookupProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal(e.getMessage(), e); throw new Exception("Failed to initialize channel lookup protocol. " + e.getMessage(), e); } // Try to create local stub for the accumulators try { this.accumulatorProtocolProxy = RPC.getProxy(AccumulatorProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal("Failed to initialize accumulator protocol: " + e.getMessage(), e); throw new Exception("Failed to initialize accumulator protocol: " + e.getMessage(), e); } // Load profiler if it should be used if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) { final String profilerClassName = GlobalConfiguration.getString( ProfilingUtils.TASKMANAGER_CLASSNAME_KEY, "eu.stratosphere.nephele.profiling.impl.TaskManagerProfilerImpl"); this.profiler = ProfilingUtils.loadTaskManagerProfiler( profilerClassName, jobManagerAddress.getAddress(), this.localInstanceConnectionInfo); if (this.profiler == null) { LOG.error("Cannot find class name for the profiler."); } else { LOG.info("Profiling of jobs is enabled."); } } else { this.profiler = null; LOG.info("Profiling of jobs is disabled."); } // Get the directory for storing temporary files final String[] tmpDirPaths = GlobalConfiguration.getString( ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH) .split(",|" + File.pathSeparator); checkTempDirs(tmpDirPaths); final int pageSize = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE); // Initialize network buffer pool int numBuffers = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_NUM_BUFFERS); int bufferSize = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE); // Initialize the channel manager try { NetworkConnectionManager networkConnectionManager = null; switch (executionMode) { case LOCAL: networkConnectionManager = new LocalConnectionManager(); break; case CLUSTER: int numInThreads = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NUM_IN_THREADS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_IN_THREADS); int numOutThreads = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NUM_OUT_THREADS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_OUT_THREADS); int lowWaterMark = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NETTY_LOW_WATER_MARK, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_LOW_WATER_MARK); int highWaterMark = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK); networkConnectionManager = new NettyConnectionManager( localInstanceConnectionInfo.address(), localInstanceConnectionInfo.dataPort(), bufferSize, numInThreads, numOutThreads, lowWaterMark, highWaterMark); break; } channelManager = new ChannelManager( lookupService, localInstanceConnectionInfo, numBuffers, bufferSize, networkConnectionManager); } catch (IOException ioe) { LOG.error(StringUtils.stringifyException(ioe)); throw new Exception("Failed to instantiate channel manager. " + ioe.getMessage(), ioe); } { HardwareDescription resources = HardwareDescriptionFactory.extractFromSystem(); int slots = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, -1); if (slots == -1) { slots = Hardware.getNumberCPUCores(); } else if (slots <= 0) { throw new Exception("Illegal value for the number of task slots: " + slots); } this.numberOfSlots = slots; // Check whether the memory size has been explicitly configured. if so that overrides the // default mechanism // of taking as much as is mentioned in the hardware description long memorySize = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1); if (memorySize > 0) { // manually configured memory size. override the value in the hardware config resources = HardwareDescriptionFactory.construct( resources.getNumberOfCPUCores(), resources.getSizeOfPhysicalMemory(), memorySize * 1024L * 1024L); } this.hardwareDescription = resources; // Initialize the memory manager LOG.info( "Initializing memory manager with " + (resources.getSizeOfFreeMemory() >>> 20) + " megabytes of memory. " + "Page size is " + pageSize + " bytes."); try { @SuppressWarnings("unused") final boolean lazyAllocation = GlobalConfiguration.getBoolean( ConfigConstants.TASK_MANAGER_MEMORY_LAZY_ALLOCATION_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_MEMORY_LAZY_ALLOCATION); this.memoryManager = new DefaultMemoryManager(resources.getSizeOfFreeMemory(), this.numberOfSlots, pageSize); } catch (Throwable t) { LOG.fatal( "Unable to initialize memory manager with " + (resources.getSizeOfFreeMemory() >>> 20) + " megabytes of memory.", t); throw new Exception("Unable to initialize memory manager.", t); } } this.ioManager = new IOManager(tmpDirPaths); this.heartbeatThread = new Thread() { @Override public void run() { runHeartbeatLoop(); } }; this.heartbeatThread.setName("Heartbeat Thread"); this.heartbeatThread.start(); // -------------------------------------------------------------------- // Memory Usage // -------------------------------------------------------------------- final MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean(); final List<GarbageCollectorMXBean> gcMXBeans = ManagementFactory.getGarbageCollectorMXBeans(); LOG.info(getMemoryUsageStatsAsString(memoryMXBean)); boolean startMemoryUsageLogThread = GlobalConfiguration.getBoolean( ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD, ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD); if (startMemoryUsageLogThread && LOG.isDebugEnabled()) { final int logIntervalMs = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS, ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS); new Thread( new Runnable() { @Override public void run() { try { while (!isShutDown()) { Thread.sleep(logIntervalMs); LOG.debug(getMemoryUsageStatsAsString(memoryMXBean)); LOG.debug(getGarbageCollectorStatsAsString(gcMXBeans)); } } catch (InterruptedException e) { LOG.warn("Unexpected interruption of memory usage logger thread."); } } }) .start(); } }
/** * Creates a new {@link ClusterInstance} object to manage instances that can be executed on that * host. * * @param instanceConnectionInfo the connection information for the instance * @param hardwareDescription the hardware description provided by the new instance * @return a new {@link ClusterInstance} object or <code>null</code> if the cluster instance could * not be created */ private ClusterInstance createNewHost( final InstanceConnectionInfo instanceConnectionInfo, final HardwareDescription hardwareDescription) { // Check if there is a user-defined instance type for this IP address InstanceType instanceType = this.ipToInstanceTypeMapping.get(instanceConnectionInfo.getAddress()); if (instanceType != null) { LOG.info( "Found user-defined instance type for cluster instance with IP " + instanceConnectionInfo.getAddress() + ": " + instanceType); } else { instanceType = matchHardwareDescriptionWithInstanceType(hardwareDescription); if (instanceType != null) { LOG.info( "Hardware profile of cluster instance with IP " + instanceConnectionInfo.getAddress() + " matches with instance type " + instanceType); } else { LOG.error("No matching instance type, cannot create cluster instance"); return null; } } // Try to match new host with a stub host from the existing topology String instanceName = instanceConnectionInfo.getHostName(); NetworkNode parentNode = this.networkTopology.getRootNode(); NetworkNode currentStubNode = null; // Try to match new host using the host name while (true) { currentStubNode = this.networkTopology.getNodeByName(instanceName); if (currentStubNode != null) { break; } final int pos = instanceName.lastIndexOf('.'); if (pos == -1) { break; } /* * If host name is reported as FQDN, iterative remove parts * of the domain name until a match occurs or no more dots * can be found in the host name. */ instanceName = instanceName.substring(0, pos); } // Try to match the new host using the IP address if (currentStubNode == null) { instanceName = instanceConnectionInfo.getAddress().toString(); instanceName = instanceName.replaceAll("/", ""); // Remove any / characters currentStubNode = this.networkTopology.getNodeByName(instanceName); } if (currentStubNode != null) { /* * The instance name will be the same as the one of the stub node. That way * the stub now will be removed from the network topology and replaced be * the new node. */ if (currentStubNode.getParentNode() != null) { parentNode = currentStubNode.getParentNode(); } // Remove the stub node from the tree currentStubNode.remove(); } LOG.info( "Creating instance of type " + instanceType + " for " + instanceConnectionInfo + ", parent is " + parentNode.getName()); final ClusterInstance host = new ClusterInstance( instanceConnectionInfo, instanceType, parentNode, this.networkTopology, hardwareDescription); return host; }