/** * Attempts to match the hardware characteristics provided by the {@link HardwareDescription} * object with one of the instance types set in the configuration. The matching is pessimistic, * i.e. the hardware characteristics of the chosen instance type never exceed the actually * reported characteristics from the hardware description. * * @param hardwareDescription the hardware description as reported by the instance * @return the best matching instance type or <code>null</code> if no matching instance type can * be found */ private InstanceType matchHardwareDescriptionWithInstanceType( final HardwareDescription hardwareDescription) { // Assumes that the available instance types are ordered by number of CPU cores in descending // order for (int i = 0; i < this.availableInstanceTypes.length; i++) { final InstanceType candidateInstanceType = this.availableInstanceTypes[i]; // Check if number of CPU cores match if (candidateInstanceType.getNumberOfCores() > hardwareDescription.getNumberOfCPUCores()) { continue; } // Check if size of physical memory matches final int memoryInMB = (int) (hardwareDescription.getSizeOfPhysicalMemory() / (1024L * 1024L)); if (candidateInstanceType.getMemorySize() > memoryInMB) { continue; } return candidateInstanceType; } LOG.error( "Cannot find matching instance type for hardware description (" + hardwareDescription.getNumberOfCPUCores() + " cores, " + hardwareDescription.getSizeOfPhysicalMemory() + " bytes of memory)"); return null; }
/** * All parameters are obtained from the {@link GlobalConfiguration}, which must be loaded prior to * instantiating the task manager. */ public TaskManager(ExecutionMode executionMode) throws Exception { if (executionMode == null) { throw new NullPointerException("Execution mode must not be null."); } RevisionInformation rev = JobManagerUtils.getRevisionInformation(); LOG.info( "Starting Stratosphere TaskManager " + "(Version: " + JobManagerUtils.getVersion() + ", " + "Rev:" + rev.commitId + ", " + "Date:" + rev.commitDate + ")"); try { LOG.info( "TaskManager started as user " + UserGroupInformation.getCurrentUser().getShortUserName()); } catch (Throwable t) { LOG.error("Cannot determine user group information.", t); } LOG.info("Execution mode: " + executionMode); // IMPORTANT! At this point, the GlobalConfiguration must have been read! final InetSocketAddress jobManagerAddress; { LOG.info("Reading location of job manager from configuration"); final String address = GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null); final int port = GlobalConfiguration.getInteger( ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT); if (address == null) { throw new Exception("Job manager address not configured in the GlobalConfiguration."); } // Try to convert configured address to {@link InetAddress} try { final InetAddress tmpAddress = InetAddress.getByName(address); jobManagerAddress = new InetSocketAddress(tmpAddress, port); } catch (UnknownHostException e) { LOG.fatal("Could not resolve JobManager host name."); throw new Exception("Could not resolve JobManager host name: " + e.getMessage(), e); } LOG.info("Connecting to JobManager at: " + jobManagerAddress); } // Create RPC connection to the JobManager try { this.jobManager = RPC.getProxy(JobManagerProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal("Could not connect to the JobManager: " + e.getMessage(), e); throw new Exception("Failed to initialize connection to JobManager: " + e.getMessage(), e); } int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, -1); int dataPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, -1); if (ipcPort == -1) { ipcPort = getAvailablePort(); } if (dataPort == -1) { dataPort = getAvailablePort(); } // Determine our own public facing address and start the server { final InetAddress taskManagerAddress; try { taskManagerAddress = getTaskManagerAddress(jobManagerAddress); } catch (Exception e) { throw new RuntimeException( "The TaskManager failed to determine its own network address.", e); } this.localInstanceConnectionInfo = new InstanceConnectionInfo(taskManagerAddress, ipcPort, dataPort); LOG.info("TaskManager connection information:" + this.localInstanceConnectionInfo); // Start local RPC server try { this.taskManagerServer = RPC.getServer(this, taskManagerAddress.getHostAddress(), ipcPort, IPC_HANDLER_COUNT); this.taskManagerServer.start(); } catch (IOException e) { LOG.fatal("Failed to start TaskManager server. " + e.getMessage(), e); throw new Exception("Failed to start taskmanager server. " + e.getMessage(), e); } } // Try to create local stub of the global input split provider try { this.globalInputSplitProvider = RPC.getProxy( InputSplitProviderProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal(e.getMessage(), e); throw new Exception( "Failed to initialize connection to global input split provider: " + e.getMessage(), e); } // Try to create local stub for the lookup service try { this.lookupService = RPC.getProxy(ChannelLookupProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal(e.getMessage(), e); throw new Exception("Failed to initialize channel lookup protocol. " + e.getMessage(), e); } // Try to create local stub for the accumulators try { this.accumulatorProtocolProxy = RPC.getProxy(AccumulatorProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal("Failed to initialize accumulator protocol: " + e.getMessage(), e); throw new Exception("Failed to initialize accumulator protocol: " + e.getMessage(), e); } // Load profiler if it should be used if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) { final String profilerClassName = GlobalConfiguration.getString( ProfilingUtils.TASKMANAGER_CLASSNAME_KEY, "eu.stratosphere.nephele.profiling.impl.TaskManagerProfilerImpl"); this.profiler = ProfilingUtils.loadTaskManagerProfiler( profilerClassName, jobManagerAddress.getAddress(), this.localInstanceConnectionInfo); if (this.profiler == null) { LOG.error("Cannot find class name for the profiler."); } else { LOG.info("Profiling of jobs is enabled."); } } else { this.profiler = null; LOG.info("Profiling of jobs is disabled."); } // Get the directory for storing temporary files final String[] tmpDirPaths = GlobalConfiguration.getString( ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH) .split(",|" + File.pathSeparator); checkTempDirs(tmpDirPaths); final int pageSize = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE); // Initialize network buffer pool int numBuffers = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_NUM_BUFFERS); int bufferSize = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE); // Initialize the channel manager try { NetworkConnectionManager networkConnectionManager = null; switch (executionMode) { case LOCAL: networkConnectionManager = new LocalConnectionManager(); break; case CLUSTER: int numInThreads = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NUM_IN_THREADS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_IN_THREADS); int numOutThreads = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NUM_OUT_THREADS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_OUT_THREADS); int lowWaterMark = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NETTY_LOW_WATER_MARK, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_LOW_WATER_MARK); int highWaterMark = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK, ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK); networkConnectionManager = new NettyConnectionManager( localInstanceConnectionInfo.address(), localInstanceConnectionInfo.dataPort(), bufferSize, numInThreads, numOutThreads, lowWaterMark, highWaterMark); break; } channelManager = new ChannelManager( lookupService, localInstanceConnectionInfo, numBuffers, bufferSize, networkConnectionManager); } catch (IOException ioe) { LOG.error(StringUtils.stringifyException(ioe)); throw new Exception("Failed to instantiate channel manager. " + ioe.getMessage(), ioe); } { HardwareDescription resources = HardwareDescriptionFactory.extractFromSystem(); int slots = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, -1); if (slots == -1) { slots = Hardware.getNumberCPUCores(); } else if (slots <= 0) { throw new Exception("Illegal value for the number of task slots: " + slots); } this.numberOfSlots = slots; // Check whether the memory size has been explicitly configured. if so that overrides the // default mechanism // of taking as much as is mentioned in the hardware description long memorySize = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1); if (memorySize > 0) { // manually configured memory size. override the value in the hardware config resources = HardwareDescriptionFactory.construct( resources.getNumberOfCPUCores(), resources.getSizeOfPhysicalMemory(), memorySize * 1024L * 1024L); } this.hardwareDescription = resources; // Initialize the memory manager LOG.info( "Initializing memory manager with " + (resources.getSizeOfFreeMemory() >>> 20) + " megabytes of memory. " + "Page size is " + pageSize + " bytes."); try { @SuppressWarnings("unused") final boolean lazyAllocation = GlobalConfiguration.getBoolean( ConfigConstants.TASK_MANAGER_MEMORY_LAZY_ALLOCATION_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_MEMORY_LAZY_ALLOCATION); this.memoryManager = new DefaultMemoryManager(resources.getSizeOfFreeMemory(), this.numberOfSlots, pageSize); } catch (Throwable t) { LOG.fatal( "Unable to initialize memory manager with " + (resources.getSizeOfFreeMemory() >>> 20) + " megabytes of memory.", t); throw new Exception("Unable to initialize memory manager.", t); } } this.ioManager = new IOManager(tmpDirPaths); this.heartbeatThread = new Thread() { @Override public void run() { runHeartbeatLoop(); } }; this.heartbeatThread.setName("Heartbeat Thread"); this.heartbeatThread.start(); // -------------------------------------------------------------------- // Memory Usage // -------------------------------------------------------------------- final MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean(); final List<GarbageCollectorMXBean> gcMXBeans = ManagementFactory.getGarbageCollectorMXBeans(); LOG.info(getMemoryUsageStatsAsString(memoryMXBean)); boolean startMemoryUsageLogThread = GlobalConfiguration.getBoolean( ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD, ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD); if (startMemoryUsageLogThread && LOG.isDebugEnabled()) { final int logIntervalMs = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS, ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS); new Thread( new Runnable() { @Override public void run() { try { while (!isShutDown()) { Thread.sleep(logIntervalMs); LOG.debug(getMemoryUsageStatsAsString(memoryMXBean)); LOG.debug(getGarbageCollectorStatsAsString(gcMXBeans)); } } catch (InterruptedException e) { LOG.warn("Unexpected interruption of memory usage logger thread."); } } }) .start(); } }
/** Updates the list of instance type descriptions based on the currently registered hosts. */ private void updateInstaceTypeDescriptionMap() { // this.registeredHosts.values().iterator() this.instanceTypeDescriptionMap.clear(); final List<InstanceTypeDescription> instanceTypeDescriptionList = new ArrayList<InstanceTypeDescription>(); // initialize array which stores the availability counter for each instance type final int[] numberOfInstances = new int[this.availableInstanceTypes.length]; for (int i = 0; i < numberOfInstances.length; i++) { numberOfInstances[i] = 0; } // Shuffle through instance types for (int i = 0; i < this.availableInstanceTypes.length; i++) { final InstanceType currentInstanceType = this.availableInstanceTypes[i]; int numberOfMatchingInstances = 0; int minNumberOfCPUCores = Integer.MAX_VALUE; long minSizeOfPhysicalMemory = Long.MAX_VALUE; long minSizeOfFreeMemory = Long.MAX_VALUE; final Iterator<ClusterInstance> it = this.registeredHosts.values().iterator(); while (it.hasNext()) { final ClusterInstance clusterInstance = it.next(); if (clusterInstance.getType().equals(currentInstanceType)) { ++numberOfMatchingInstances; final HardwareDescription hardwareDescription = clusterInstance.getHardwareDescription(); minNumberOfCPUCores = Math.min(minNumberOfCPUCores, hardwareDescription.getNumberOfCPUCores()); minSizeOfPhysicalMemory = Math.min(minSizeOfPhysicalMemory, hardwareDescription.getSizeOfPhysicalMemory()); minSizeOfFreeMemory = Math.min(minSizeOfFreeMemory, hardwareDescription.getSizeOfFreeMemory()); } } // Update number of instances int highestAccommodationNumber = -1; int highestAccommodationIndex = -1; for (int j = 0; j < this.availableInstanceTypes.length; j++) { final int accommodationNumber = canBeAccommodated(j, i); // LOG.debug(this.availableInstanceTypes[j].getIdentifier() + " fits into " // + this.availableInstanceTypes[i].getIdentifier() + " " + accommodationNumber + " times"); if (accommodationNumber > 0) { numberOfInstances[j] += numberOfMatchingInstances * accommodationNumber; if (accommodationNumber > highestAccommodationNumber) { highestAccommodationNumber = accommodationNumber; highestAccommodationIndex = j; } } } // Calculate hardware description HardwareDescription pessimisticHardwareDescription = null; if (minNumberOfCPUCores < Integer.MAX_VALUE && minSizeOfPhysicalMemory < Long.MAX_VALUE && minSizeOfFreeMemory < Long.MAX_VALUE) { pessimisticHardwareDescription = HardwareDescriptionFactory.construct( minNumberOfCPUCores, minSizeOfPhysicalMemory, minSizeOfFreeMemory); } else { if (highestAccommodationIndex < i) { // Since highestAccommodationIndex smaller than my index, the // target instance must be more powerful final InstanceTypeDescription descriptionOfLargerInstanceType = instanceTypeDescriptionList.get(highestAccommodationIndex); if (descriptionOfLargerInstanceType.getHardwareDescription() != null) { final HardwareDescription hardwareDescriptionOfLargerInstanceType = descriptionOfLargerInstanceType.getHardwareDescription(); final int numCores = hardwareDescriptionOfLargerInstanceType.getNumberOfCPUCores() / highestAccommodationNumber; final long physMem = hardwareDescriptionOfLargerInstanceType.getSizeOfPhysicalMemory() / highestAccommodationNumber; final long freeMem = hardwareDescriptionOfLargerInstanceType.getSizeOfFreeMemory() / highestAccommodationNumber; pessimisticHardwareDescription = HardwareDescriptionFactory.construct(numCores, physMem, freeMem); } } } instanceTypeDescriptionList.add( InstanceTypeDescriptionFactory.construct( currentInstanceType, pessimisticHardwareDescription, numberOfInstances[i])); } final Iterator<InstanceTypeDescription> it = instanceTypeDescriptionList.iterator(); while (it.hasNext()) { final InstanceTypeDescription itd = it.next(); this.instanceTypeDescriptionMap.put(itd.getInstanceType(), itd); } }