/** Constructor. */ public ClusterManager() { this.registeredHosts = new HashMap<InstanceConnectionInfo, ClusterInstance>(); this.slicesOfJobs = new HashMap<JobID, List<AllocatedSlice>>(); // Load the instance type this cluster can offer this.availableInstanceTypes = populateInstanceTypeArray(); this.instanceAccommodationMatrix = calculateInstanceAccommodationMatrix(); this.instanceTypeDescriptionMap = new SerializableHashMap<InstanceType, InstanceTypeDescription>(); long tmpCleanUpInterval = (long) GlobalConfiguration.getInteger(CLEANUP_INTERVAL_KEY, DEFAULT_CLEANUP_INTERVAL) * 1000; if (tmpCleanUpInterval < 10) { // Clean up interval must be at least ten seconds LOG.warn( "Invalid clean up interval. Reverting to default cleanup interval of " + DEFAULT_CLEANUP_INTERVAL + " secs."); tmpCleanUpInterval = DEFAULT_CLEANUP_INTERVAL; } this.cleanUpInterval = tmpCleanUpInterval; int tmpDefaultInstanceTypeIndex = GlobalConfiguration.getInteger( DEFAULT_INSTANCE_TYPE_INDEX_KEY, ConfigConstants.DEFAULT_DEFAULT_INSTANCE_TYPE_INDEX); if (tmpDefaultInstanceTypeIndex > this.availableInstanceTypes.length) { LOG.warn( "Incorrect index to for default instance type (" + tmpDefaultInstanceTypeIndex + "), switching to default index " + ConfigConstants.DEFAULT_DEFAULT_INSTANCE_TYPE_INDEX); tmpDefaultInstanceTypeIndex = ConfigConstants.DEFAULT_DEFAULT_INSTANCE_TYPE_INDEX; } this.defaultInstanceType = this.availableInstanceTypes[tmpDefaultInstanceTypeIndex - 1]; // sort available instances by CPU core sortAvailableInstancesByNumberOfCPUCores(); // load the network topology from the slave file this.networkTopology = loadNetworkTopology(); // load IP to instance type mapping from slave file loadIPToInstanceTypeMapping(); // look every BASEINTERVAL milliseconds for crashed hosts final boolean runTimerAsDaemon = true; new Timer(runTimerAsDaemon).schedule(cleanupStaleMachines, 1000, 1000); // Load available instance types into the instance description list updateInstaceTypeDescriptionMap(); }
/**
 * Reads the IP to instance type mapping from the slave file.
 *
 * <p>The slave file is looked up in the directory configured under {@code CONFIG_DIR_KEY}. Every
 * line is matched against {@code IP_TO_INSTANCE_TYPE_PATTERN}: group 1 is the host (anything up
 * to and including the last {@code '/'} is stripped before resolving it), group 2 is the optional
 * instance type name. Lines that do not match, hosts that cannot be resolved and unknown instance
 * type names are skipped. If the configuration directory or the slave file is missing, the method
 * returns without touching the mapping.
 */
private void loadIPToInstanceTypeMapping() {
  final String configDir = GlobalConfiguration.getString(CONFIG_DIR_KEY, null);
  if (configDir == null) {
    LOG.info(
        "Cannot find configuration directory to read IP to instance type mapping. Using default types.");
    return;
  }

  final File slaveFile = new File(configDir + File.separator + SLAVE_FILE_NAME);
  if (!slaveFile.exists()) {
    LOG.error("Cannot access slave file to read IP to instance type mapping");
    return;
  }

  try {
    final BufferedReader input = new BufferedReader(new FileReader(slaveFile));
    try {
      String line = null;
      while ((line = input.readLine()) != null) {
        final Matcher m = IP_TO_INSTANCE_TYPE_PATTERN.matcher(line);
        if (!m.matches()) {
          LOG.error("Entry does not match format: " + line);
          continue;
        }

        InetAddress address = null;
        String host = m.group(1);
        try {
          // keep only the part after the last slash before resolving the host
          final int pos = host.lastIndexOf('/');
          if (pos != -1) {
            host = host.substring(pos + 1);
          }
          address = InetAddress.getByName(host);
        } catch (UnknownHostException e) {
          LOG.error("Cannot resolve " + host + " to a hostname/IP address", e);
          continue;
        }

        InstanceType instanceType = null;
        String instanceTypeName = m.group(2);
        if (instanceTypeName != null && instanceTypeName.length() > 0) {
          instanceType = getInstanceTypeByName(instanceTypeName);
          if (instanceType != null) {
            this.ipToInstanceTypeMapping.put(address, instanceType);
          }
        }
      }
    } finally {
      // Close the reader on every path; previously it leaked when readLine() failed. An
      // IOException from close() is handled by the enclosing catch, as before.
      input.close();
    }
  } catch (IOException e) {
    LOG.error("Cannot load IP to instance type mapping from file " + e);
  }
}
/** This method send the periodic heartbeats. */ private void runHeartbeatLoop() { final long interval = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_HEARTBEAT_INTERVAL_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_HEARTBEAT_INTERVAL); try { while (!shutdownStarted.get()) { RegisterTaskManagerResult result = this.jobManager.registerTaskManager( this.localInstanceConnectionInfo, this.hardwareDescription, new IntegerRecord(this.numberOfSlots)); if (result.getReturnCode() == RegisterTaskManagerResult.ReturnCode.SUCCESS) { break; } try { Thread.sleep(50); } catch (InterruptedException e) { if (!shutdownStarted.get()) { LOG.error("TaskManager register task manager loop was interrupted without shutdown."); } } } } catch (IOException e) { if (!shutdownStarted.get()) { LOG.error("Registering task manager caused an exception: " + e.getMessage(), e); } return; } while (!shutdownStarted.get()) { // sleep until the next heart beat try { Thread.sleep(interval); } catch (InterruptedException e) { if (!shutdownStarted.get()) { LOG.error("TaskManager heart beat loop was interrupted without shutdown."); } } // send heart beat try { LOG.debug("heartbeat"); this.jobManager.sendHeartbeat(this.localInstanceConnectionInfo); } catch (IOException e) { if (shutdownStarted.get()) { break; } else { LOG.error("Sending the heart beat caused an exception: " + e.getMessage(), e); } } } }
/** * Reads the instance types configured in the config file. Each instance type is defined by a * key/value pair. The format of the key is <code>instancemanager.cluster.type.X</code> where X is * an ongoing integer number starting at 1. The format of the value follows the pattern * "instancename,numComputeUnits,numCores,memorySize,diskCapacity,pricePerHour" (see {@link * InstanceType}). * * @return list of available instance types sorted by price (cheapest to most expensive) */ private InstanceType[] populateInstanceTypeArray() { final List<InstanceType> instanceTypes = new ArrayList<InstanceType>(); // read instance types int count = 1; while (true) { final String key = INSTANCE_TYPE_PREFIX_KEY + Integer.toString(count); String descr = GlobalConfiguration.getString(key, null); if (descr == null) { if (count == 1) { LOG.info( "Configuration does not contain at least one definition for an instance type, " + "using default instance type: " + ConfigConstants.DEFAULT_INSTANCE_TYPE); descr = ConfigConstants.DEFAULT_INSTANCE_TYPE; } else { break; } } // parse entry try { // if successful add new instance type final InstanceType instanceType = InstanceTypeFactory.constructFromDescription(descr); LOG.info( "Loaded instance type " + instanceType.getIdentifier() + " from the configuration"); instanceTypes.add(instanceType); } catch (Throwable t) { LOG.error( "Error parsing " + key + ":" + descr + ". Using default using default instance type: " + ConfigConstants.DEFAULT_INSTANCE_TYPE + " for instance type " + count + ".", t); break; } // Increase key index ++count; } return instanceTypes.toArray(new InstanceType[instanceTypes.size()]); }
/**
 * Creates a YARN configuration and adds a Hadoop configuration directory to it.
 *
 * <p>The directory is taken from the first source that yields one: the path configured under
 * {@link ConfigConstants#PATH_HADOOP_CONFIG}, then the environment variables {@code
 * YARN_CONF_DIR}, {@code HADOOP_CONF_DIR} and {@code HADOOP_CONF_PATH} (in that order), and
 * finally {@code etc/hadoop} or {@code conf} below the Hadoop home directory if one of them
 * exists. If the Hadoop home cannot be determined, the JVM exits with status 1.
 *
 * @return the configuration, after {@code setDefaultConfValues} has been applied
 */
public static Configuration initializeYarnConfiguration() {
  final Configuration yarnConf = new YarnConfiguration();

  // 1) explicitly configured path
  final String explicitPath =
      GlobalConfiguration.getString(ConfigConstants.PATH_HADOOP_CONFIG, null);
  if (explicitPath != null) {
    LOG.info(
        "Using hadoop configuration path from "
            + ConfigConstants.PATH_HADOOP_CONFIG
            + " setting.");
    addPathToConfig(yarnConf, new File(explicitPath));
    setDefaultConfValues(yarnConf);
    return yarnConf;
  }

  // 2) well-known environment variables, first match wins
  for (String envName : new String[] {"YARN_CONF_DIR", "HADOOP_CONF_DIR", "HADOOP_CONF_PATH"}) {
    final String envValue = System.getenv(envName);
    if (envValue == null) {
      continue;
    }
    LOG.info("Found " + envName + ", adding it to configuration");
    addPathToConfig(yarnConf, new File(envValue));
    setDefaultConfValues(yarnConf);
    return yarnConf;
  }

  // 3) directories below the hadoop home
  LOG.info("Could not find HADOOP_CONF_PATH, using HADOOP_HOME.");
  String hadoopHome = null;
  try {
    hadoopHome = Shell.getHadoopHome();
  } catch (IOException e) {
    LOG.fatal("Unable to get hadoop home. Please set HADOOP_HOME variable!", e);
    System.exit(1);
  }

  final File etcHadoopDir = new File(hadoopHome + "/etc/hadoop");
  final File confDir = new File(hadoopHome + "/conf");
  if (etcHadoopDir.exists()) {
    LOG.info("Found configuration using hadoop home.");
    addPathToConfig(yarnConf, etcHadoopDir);
  } else if (confDir.exists()) {
    addPathToConfig(yarnConf, confDir);
  }

  setDefaultConfValues(yarnConf);
  return yarnConf;
}
/** * Attempts to load the current network topology from the slave file. If locating or reading the * slave file fails, the method will return an empty network topology. * * @return the network topology as read from the slave file */ private NetworkTopology loadNetworkTopology() { // Check if slave file exists final String configDir = GlobalConfiguration.getString(CONFIG_DIR_KEY, null); if (configDir == null) { LOG.info( "Cannot find configuration directory to load network topology. Using flat topology."); return NetworkTopology.createEmptyTopology(); } final File slaveFile = new File(configDir + File.separator + SLAVE_FILE_NAME); if (!slaveFile.exists()) { LOG.error("Cannot access slave file to load network topology, using flat topology instead"); return NetworkTopology.createEmptyTopology(); } try { return NetworkTopology.fromFile(slaveFile); } catch (IOException ioe) { LOG.error("Error while loading the network topology: " + StringUtils.stringifyException(ioe)); } return NetworkTopology.createEmptyTopology(); }
/** * Entry point for the program. * * @param args arguments from the command line * @throws IOException */ @SuppressWarnings("static-access") public static void main(String[] args) throws IOException { Option configDirOpt = OptionBuilder.withArgName("config directory") .hasArg() .withDescription("Specify configuration directory.") .create("configDir"); // tempDir option is used by the YARN client. Option tempDir = OptionBuilder.withArgName("temporary directory (overwrites configured option)") .hasArg() .withDescription("Specify temporary directory.") .create(ARG_CONF_DIR); configDirOpt.setRequired(true); tempDir.setRequired(false); Options options = new Options(); options.addOption(configDirOpt); options.addOption(tempDir); CommandLineParser parser = new GnuParser(); CommandLine line = null; try { line = parser.parse(options, args); } catch (ParseException e) { System.err.println("CLI Parsing failed. Reason: " + e.getMessage()); System.exit(FAILURE_RETURN_CODE); } String configDir = line.getOptionValue(configDirOpt.getOpt(), null); String tempDirVal = line.getOptionValue(tempDir.getOpt(), null); // First, try to load global configuration GlobalConfiguration.loadConfiguration(configDir); if (tempDirVal != null // the YARN TM runner has set a value for the temp dir // the configuration does not contain a temp direcory && GlobalConfiguration.getString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, null) == null) { Configuration c = GlobalConfiguration.getConfiguration(); c.setString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, tempDirVal); LOG.info("Setting temporary directory to " + tempDirVal); GlobalConfiguration.includeConfiguration(c); } System.err.println("Configuration " + GlobalConfiguration.getConfiguration()); LOG.info("Current user " + UserGroupInformation.getCurrentUser().getShortUserName()); { // log the available JVM memory long maxMemoryMiBytes = Runtime.getRuntime().maxMemory() >>> 20; LOG.info( "Starting TaskManager in a JVM with " + maxMemoryMiBytes + " 
MiBytes maximum heap size."); } // Create a new task manager object try { new TaskManager(ExecutionMode.CLUSTER); } catch (Exception e) { LOG.fatal("Taskmanager startup failed: " + e.getMessage(), e); System.exit(FAILURE_RETURN_CODE); } // park the main thread to keep the JVM alive (all other threads may be daemon threads) Object mon = new Object(); synchronized (mon) { try { mon.wait(); } catch (InterruptedException ex) { } } }
/**
 * Creates and starts a task manager.
 *
 * <p>All parameters are obtained from the {@link GlobalConfiguration}, which must be loaded prior
 * to instantiating the task manager.
 *
 * <p>In order, the constructor: resolves the job manager address, creates the RPC proxy to the
 * job manager, determines the IPC and data ports, starts the local RPC server, creates the RPC
 * proxies for input splits, channel lookup and accumulators, optionally loads the profiler,
 * checks the temp directories, creates the channel manager, determines the number of slots and
 * the hardware description, creates the memory manager and the I/O manager, starts the heartbeat
 * thread, and optionally starts a thread that logs memory usage.
 *
 * @param executionMode the mode to run in; selects the network connection manager
 * @throws NullPointerException if {@code executionMode} is {@code null}
 * @throws RuntimeException if the task manager's own network address cannot be determined
 * @throws Exception if the job manager address is not configured or cannot be resolved, an RPC
 *     proxy or the local RPC server cannot be created, the channel manager or the memory manager
 *     cannot be instantiated, or the configured number of task slots is illegal
 */
public TaskManager(ExecutionMode executionMode) throws Exception {
  if (executionMode == null) {
    throw new NullPointerException("Execution mode must not be null.");
  }

  RevisionInformation rev = JobManagerUtils.getRevisionInformation();
  LOG.info(
      "Starting Stratosphere TaskManager "
          + "(Version: "
          + JobManagerUtils.getVersion()
          + ", "
          + "Rev:"
          + rev.commitId
          + ", "
          + "Date:"
          + rev.commitDate
          + ")");

  // Logging the user is best effort: any failure is logged and start-up continues.
  try {
    LOG.info(
        "TaskManager started as user "
            + UserGroupInformation.getCurrentUser().getShortUserName());
  } catch (Throwable t) {
    LOG.error("Cannot determine user group information.", t);
  }

  LOG.info("Execution mode: " + executionMode);

  // IMPORTANT! At this point, the GlobalConfiguration must have been read!

  final InetSocketAddress jobManagerAddress;
  {
    LOG.info("Reading location of job manager from configuration");

    final String address =
        GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null);
    final int port =
        GlobalConfiguration.getInteger(
            ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
            ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);

    // The address has no default; a missing entry is fatal.
    if (address == null) {
      throw new Exception("Job manager address not configured in the GlobalConfiguration.");
    }

    // Try to convert configured address to {@link InetAddress}
    try {
      final InetAddress tmpAddress = InetAddress.getByName(address);
      jobManagerAddress = new InetSocketAddress(tmpAddress, port);
    } catch (UnknownHostException e) {
      LOG.fatal("Could not resolve JobManager host name.");
      throw new Exception("Could not resolve JobManager host name: " + e.getMessage(), e);
    }

    LOG.info("Connecting to JobManager at: " + jobManagerAddress);
  }

  // Create RPC connection to the JobManager
  try {
    this.jobManager =
        RPC.getProxy(JobManagerProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
  } catch (IOException e) {
    LOG.fatal("Could not connect to the JobManager: " + e.getMessage(), e);
    throw new Exception("Failed to initialize connection to JobManager: " + e.getMessage(), e);
  }

  // A configured value of -1 (also the default) means: pick a port via getAvailablePort().
  int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, -1);
  int dataPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, -1);
  if (ipcPort == -1) {
    ipcPort = getAvailablePort();
  }
  if (dataPort == -1) {
    dataPort = getAvailablePort();
  }

  // Determine our own public facing address and start the server
  {
    final InetAddress taskManagerAddress;
    try {
      taskManagerAddress = getTaskManagerAddress(jobManagerAddress);
    } catch (Exception e) {
      // Note: unlike the other failures in this constructor, this one is rethrown unchecked.
      throw new RuntimeException(
          "The TaskManager failed to determine its own network address.", e);
    }

    this.localInstanceConnectionInfo =
        new InstanceConnectionInfo(taskManagerAddress, ipcPort, dataPort);
    LOG.info("TaskManager connection information:" + this.localInstanceConnectionInfo);

    // Start local RPC server
    try {
      this.taskManagerServer =
          RPC.getServer(this, taskManagerAddress.getHostAddress(), ipcPort, IPC_HANDLER_COUNT);
      this.taskManagerServer.start();
    } catch (IOException e) {
      LOG.fatal("Failed to start TaskManager server. " + e.getMessage(), e);
      throw new Exception("Failed to start taskmanager server. " + e.getMessage(), e);
    }
  }

  // Try to create local stub of the global input split provider
  try {
    this.globalInputSplitProvider =
        RPC.getProxy(
            InputSplitProviderProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
  } catch (IOException e) {
    LOG.fatal(e.getMessage(), e);
    throw new Exception(
        "Failed to initialize connection to global input split provider: " + e.getMessage(), e);
  }

  // Try to create local stub for the lookup service
  try {
    this.lookupService =
        RPC.getProxy(ChannelLookupProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
  } catch (IOException e) {
    LOG.fatal(e.getMessage(), e);
    throw new Exception("Failed to initialize channel lookup protocol. " + e.getMessage(), e);
  }

  // Try to create local stub for the accumulators
  try {
    this.accumulatorProtocolProxy =
        RPC.getProxy(AccumulatorProtocol.class, jobManagerAddress, NetUtils.getSocketFactory());
  } catch (IOException e) {
    LOG.fatal("Failed to initialize accumulator protocol: " + e.getMessage(), e);
    throw new Exception("Failed to initialize accumulator protocol: " + e.getMessage(), e);
  }

  // Load profiler if it should be used
  if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) {
    final String profilerClassName =
        GlobalConfiguration.getString(
            ProfilingUtils.TASKMANAGER_CLASSNAME_KEY,
            "eu.stratosphere.nephele.profiling.impl.TaskManagerProfilerImpl");

    this.profiler =
        ProfilingUtils.loadTaskManagerProfiler(
            profilerClassName, jobManagerAddress.getAddress(), this.localInstanceConnectionInfo);

    // A null profiler is only logged; start-up continues without profiling.
    if (this.profiler == null) {
      LOG.error("Cannot find class name for the profiler.");
    } else {
      LOG.info("Profiling of jobs is enabled.");
    }
  } else {
    this.profiler = null;
    LOG.info("Profiling of jobs is disabled.");
  }

  // Get the directory for storing temporary files; several paths may be given, separated by
  // a comma or by the platform's path separator
  final String[] tmpDirPaths =
      GlobalConfiguration.getString(
              ConfigConstants.TASK_MANAGER_TMP_DIR_KEY,
              ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH)
          .split(",|" + File.pathSeparator);

  checkTempDirs(tmpDirPaths);

  // Page size of the memory manager; read from the same key as bufferSize below.
  final int pageSize =
      GlobalConfiguration.getInteger(
          ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
          ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

  // Initialize network buffer pool
  int numBuffers =
      GlobalConfiguration.getInteger(
          ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY,
          ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_NUM_BUFFERS);

  int bufferSize =
      GlobalConfiguration.getInteger(
          ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY,
          ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE);

  // Initialize the channel manager
  try {
    // Stays null if executionMode is neither LOCAL nor CLUSTER (the switch has no default).
    NetworkConnectionManager networkConnectionManager = null;

    switch (executionMode) {
      case LOCAL:
        networkConnectionManager = new LocalConnectionManager();
        break;
      case CLUSTER:
        int numInThreads =
            GlobalConfiguration.getInteger(
                ConfigConstants.TASK_MANAGER_NET_NUM_IN_THREADS_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_IN_THREADS);

        int numOutThreads =
            GlobalConfiguration.getInteger(
                ConfigConstants.TASK_MANAGER_NET_NUM_OUT_THREADS_KEY,
                ConfigConstants.DEFAULT_TASK_MANAGER_NET_NUM_OUT_THREADS);

        int lowWaterMark =
            GlobalConfiguration.getInteger(
                ConfigConstants.TASK_MANAGER_NET_NETTY_LOW_WATER_MARK,
                ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_LOW_WATER_MARK);

        int highWaterMark =
            GlobalConfiguration.getInteger(
                ConfigConstants.TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK,
                ConfigConstants.DEFAULT_TASK_MANAGER_NET_NETTY_HIGH_WATER_MARK);

        networkConnectionManager =
            new NettyConnectionManager(
                localInstanceConnectionInfo.address(),
                localInstanceConnectionInfo.dataPort(),
                bufferSize,
                numInThreads,
                numOutThreads,
                lowWaterMark,
                highWaterMark);
        break;
    }

    channelManager =
        new ChannelManager(
            lookupService,
            localInstanceConnectionInfo,
            numBuffers,
            bufferSize,
            networkConnectionManager);
  } catch (IOException ioe) {
    LOG.error(StringUtils.stringifyException(ioe));
    throw new Exception("Failed to instantiate channel manager. " + ioe.getMessage(), ioe);
  }

  {
    HardwareDescription resources = HardwareDescriptionFactory.extractFromSystem();

    // -1 (also the default) means: use the number of CPU cores; any other value <= 0 is illegal.
    int slots = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, -1);
    if (slots == -1) {
      slots = Hardware.getNumberCPUCores();
    } else if (slots <= 0) {
      throw new Exception("Illegal value for the number of task slots: " + slots);
    }
    this.numberOfSlots = slots;

    // Check whether the memory size has been explicitly configured. if so that overrides the
    // default mechanism of taking as much as is mentioned in the hardware description
    long memorySize =
        GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1);

    if (memorySize > 0) {
      // manually configured memory size. override the value in the hardware config
      // (the configured value is multiplied by 1024 * 1024 before being passed on)
      resources =
          HardwareDescriptionFactory.construct(
              resources.getNumberOfCPUCores(),
              resources.getSizeOfPhysicalMemory(),
              memorySize * 1024L * 1024L);
    }
    this.hardwareDescription = resources;

    // Initialize the memory manager
    LOG.info(
        "Initializing memory manager with "
            + (resources.getSizeOfFreeMemory() >>> 20)
            + " megabytes of memory. "
            + "Page size is "
            + pageSize
            + " bytes.");

    try {
      // The lazy allocation flag is read here but not passed to the memory manager.
      @SuppressWarnings("unused")
      final boolean lazyAllocation =
          GlobalConfiguration.getBoolean(
              ConfigConstants.TASK_MANAGER_MEMORY_LAZY_ALLOCATION_KEY,
              ConfigConstants.DEFAULT_TASK_MANAGER_MEMORY_LAZY_ALLOCATION);

      this.memoryManager =
          new DefaultMemoryManager(resources.getSizeOfFreeMemory(), this.numberOfSlots, pageSize);
    } catch (Throwable t) {
      LOG.fatal(
          "Unable to initialize memory manager with "
              + (resources.getSizeOfFreeMemory() >>> 20)
              + " megabytes of memory.",
          t);
      throw new Exception("Unable to initialize memory manager.", t);
    }
  }

  this.ioManager = new IOManager(tmpDirPaths);

  // Registration at the job manager and the periodic heartbeats run in their own thread.
  this.heartbeatThread =
      new Thread() {
        @Override
        public void run() {
          runHeartbeatLoop();
        }
      };

  this.heartbeatThread.setName("Heartbeat Thread");
  this.heartbeatThread.start();

  // --------------------------------------------------------------------
  // Memory Usage
  // --------------------------------------------------------------------

  final MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean();
  final List<GarbageCollectorMXBean> gcMXBeans = ManagementFactory.getGarbageCollectorMXBeans();

  LOG.info(getMemoryUsageStatsAsString(memoryMXBean));

  boolean startMemoryUsageLogThread =
      GlobalConfiguration.getBoolean(
          ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD,
          ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_START_LOG_THREAD);

  // The logger thread is started only if it is enabled in the configuration AND debug logging
  // is active; it logs memory and garbage collector statistics until the task manager is shut
  // down or the thread is interrupted.
  if (startMemoryUsageLogThread && LOG.isDebugEnabled()) {
    final int logIntervalMs =
        GlobalConfiguration.getInteger(
            ConfigConstants.TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS,
            ConfigConstants.DEFAULT_TASK_MANAGER_DEBUG_MEMORY_USAGE_LOG_INTERVAL_MS);

    new Thread(
            new Runnable() {
              @Override
              public void run() {
                try {
                  while (!isShutDown()) {
                    Thread.sleep(logIntervalMs);

                    LOG.debug(getMemoryUsageStatsAsString(memoryMXBean));

                    LOG.debug(getGarbageCollectorStatsAsString(gcMXBeans));
                  }
                } catch (InterruptedException e) {
                  LOG.warn("Unexpected interruption of memory usage logger thread.");
                }
              }
            })
        .start();
  }
}
/**
 * Loads the global configuration from the given directory by delegating to {@link
 * GlobalConfiguration#loadConfiguration(String)}.
 *
 * @param confDir the configuration directory handed to the global configuration loader
 */
public static void getStratosphereConfiguration(String confDir) {
  GlobalConfiguration.loadConfiguration(confDir);
}