/**
 * JobTracker.submitJob() kicks off a new job.
 *
 * <p>Create a 'JobInProgress' object, which contains both JobProfile and JobStatus. Those two
 * sub-objects are sometimes shipped outside of the JobTracker. But JobInProgress adds info that's
 * useful for the JobTracker alone.
 *
 * <p>We add the JIP to the jobInitQueue, which is processed asynchronously to handle
 * split-computation and build up the right TaskTracker/Block mapping.
 */
public synchronized JobStatus submitJob(String jobFile) throws IOException {
  totalSubmissions++;
  JobInProgress job = new JobInProgress(jobFile, this, this.conf);
  synchronized (jobs) {
    synchronized (jobsByArrival) {
      synchronized (jobInitQueue) {
        jobs.put(job.getProfile().getJobId(), job);
        jobsByArrival.add(job);
        jobInitQueue.add(job);
        jobInitQueue.notifyAll();
      }
    }
  }
  return job.getStatus();
}
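// The notifyAll() above wakes an initialization thread that drains jobInitQueue off the
// submission path. A hedged sketch of that consumer loop follows (hypothetical names and
// error handling; the real init thread may differ), illustrating the wait/notifyAll handshake
// and why the heavy work stays off the submitJob() caller's thread.
class JobInitThread implements Runnable {
  public void run() {
    while (true) {
      JobInProgress job = null;
      synchronized (jobInitQueue) {
        // Block until submitJob() queues a job and calls notifyAll().
        while (jobInitQueue.isEmpty()) {
          try {
            jobInitQueue.wait();
          } catch (InterruptedException ie) {
            return;
          }
        }
        job = (JobInProgress) jobInitQueue.remove(0);
      }
      try {
        // Split computation and TaskTracker/Block mapping happen outside the lock,
        // so submitJob() is never blocked by them. (Assumed init hook.)
        job.initTasks();
      } catch (Exception e) {
        // Keep the init thread alive even if one job fails to initialize.
      }
    }
  }
}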
/** Splits a document on whitespace, strips non-word characters, and drops empty tokens. */
private static Vector<String> tokenizeDoc(String cur_doc) {
  String[] words = cur_doc.split("\\s+");
  Vector<String> tokens = new Vector<String>();
  for (int i = 0; i < words.length; i++) {
    // Remove punctuation and other non-word characters before keeping the token.
    words[i] = words[i].replaceAll("\\W", "");
    if (words[i].length() > 0) {
      tokens.add(words[i]);
    }
  }
  return tokens;
}
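// A quick, hypothetical sanity check (assumed to live in the same class, since tokenizeDoc is
// private) showing what the tokenizer produces: punctuation is stripped and empty fragments
// are dropped. Note that replaceAll("\\W", "") also strips apostrophes.
public static void main(String[] args) {
  Vector<String> tokens = tokenizeDoc("Hello, world!  It's 2009...");
  // Prints: [Hello, world, Its, 2009]
  System.out.println(tokens);
}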
public Vector completedJobs() {
  Vector v = new Vector();
  for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
    JobInProgress jip = (JobInProgress) it.next();
    JobStatus status = jip.getStatus();
    if (status.getRunState() == JobStatus.SUCCEEDED) {
      v.add(jip);
    }
  }
  return v;
}
public Vector runningJobs() {
  Vector v = new Vector();
  for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
    JobInProgress jip = (JobInProgress) it.next();
    JobStatus status = jip.getStatus();
    if (status.getRunState() == JobStatus.RUNNING) {
      v.add(jip);
    }
  }
  return v;
}
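// completedJobs() and runningJobs() differ only in the run state they test for. A hypothetical
// shared helper (not part of the original class; assumes the same jobs field is in scope) could
// express both in one place:
private Vector jobsInState(int runState) {
  Vector v = new Vector();
  for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
    JobInProgress jip = (JobInProgress) it.next();
    if (jip.getStatus().getRunState() == runState) {
      v.add(jip);
    }
  }
  return v;
}
// With it, completedJobs() reduces to jobsInState(JobStatus.SUCCEEDED) and
// runningJobs() to jobsInState(JobStatus.RUNNING).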
public synchronized TaskReport[] getReduceTaskReports(String jobid) {
  JobInProgress job = (JobInProgress) jobs.get(jobid);
  if (job == null) {
    return new TaskReport[0];
  } else {
    Vector reports = new Vector();
    // Report completed reduce tasks first, then the ones still in progress.
    Vector completeReduceTasks = job.reportTasksInProgress(false, true);
    for (Iterator it = completeReduceTasks.iterator(); it.hasNext(); ) {
      TaskInProgress tip = (TaskInProgress) it.next();
      reports.add(tip.generateSingleReport());
    }
    Vector incompleteReduceTasks = job.reportTasksInProgress(false, false);
    for (Iterator it = incompleteReduceTasks.iterator(); it.hasNext(); ) {
      TaskInProgress tip = (TaskInProgress) it.next();
      reports.add(tip.generateSingleReport());
    }
    return (TaskReport[]) reports.toArray(new TaskReport[reports.size()]);
  }
}
// Information needed to get a single file:
//   BASE_PATH, FILE_ID, TIMESTAMP_START, TIMESTAMP_STOP, SOURCE, FILESYSTEM
private static Vector<Path> getFile(
    FileSystem fs, Hashtable<String, String> config, dbutil db_util) throws Exception {
  Long latestVersion = latestVersion(config, db_util);
  try {
    config.put("timestamp_start", config.get("timestamp_start"));
    config.put("timestamp_real", latestVersion.toString());
    config.put("timestamp_stop", latestVersion.toString());
  } catch (Exception E) {
    logger.error("Trying to get file that is impossible to generate: " + getFullPath(config));
    return null;
  }
  if (Integer.parseInt(config.get("timestamp_start"))
      > Integer.parseInt(config.get("timestamp_stop"))) {
    return null;
  }
  logger.debug(
      "Getting DB for timestamp "
          + config.get("timestamp_start")
          + " to "
          + config.get("timestamp_stop"));

  String final_result = getFullPath(config);
  String temp_path_base =
      config.get("local_temp_path") + "_" + config.get("task_id") + "_" + config.get("run_id") + "/";

  Path newPath = new Path(final_result + "*");
  Vector<Path> ret_path = new Vector<Path>();
  String lockName = lock(final_result.replaceAll("/", "_"));
  if (fs.globStatus(newPath).length != 0) {
    // The file already exists in the target filesystem: return the glob path directly.
    ret_path.add(newPath);
    unlock(lockName);
    config.put("full_file_name", final_result);
    return ret_path;
  } else {
    if (!config.get("source").equals("local")) {
      config.put("temp_path_base", temp_path_base);
      config.put("timestamp_start", config.get("timestamp_start"));
      config.put("timestamp_real", latestVersion.toString());
      config.put("timestamp_stop", latestVersion.toString());

      // Load the source plugin by name and invoke its process(config, fs) method via reflection.
      Class<?> sourceClass =
          Class.forName("org.gestore.plugin.source." + config.get("source") + "Source");
      Method process_data = sourceClass.getMethod("process", Hashtable.class, FileSystem.class);
      Object processor = sourceClass.newInstance();
      Object retVal;
      try {
        retVal = process_data.invoke(processor, config, fs);
      } catch (InvocationTargetException E) {
        Throwable exception = E.getTargetException();
        logger.error("Unable to call method in child class: " + exception.toString());
        exception.printStackTrace(System.out);
        unlock(lockName);
        return null;
      }
      FileStatus[] files = (FileStatus[]) retVal;
      if (files == null) {
        logger.error("Error getting files, no files returned");
        unlock(lockName);
        return null;
      }
      // Move (or rename) each generated file from the temp area to its final location.
      for (FileStatus file : files) {
        Path cur_file = file.getPath();
        Path cur_local_path = new Path(temp_path_base + config.get("file_id"));
        String suffix = getSuffix(config.get("file_id"), cur_file.getName());
        cur_local_path = cur_local_path.suffix(suffix);
        Path res_path = new Path(final_result + suffix);
        logger.debug("Moving file " + cur_file.toString() + " to " + res_path.toString());
        if (config.get("copy").equals("true")) {
          fs.moveFromLocalFile(cur_file, res_path);
        } else {
          fs.rename(cur_file, res_path);
        }
      }
      config.put("full_file_name", final_result);
    }
  }
  unlock(lockName);
  return ret_path;
}
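// getFile() resolves its data source by reflection: for source "foo" it loads
// org.gestore.plugin.source.fooSource and calls process(Hashtable, FileSystem), expecting a
// FileStatus[] describing the files it produced. A minimal, hypothetical plugin skeleton is
// sketched below; the package name and method signature come from the lookup above, but the
// class name and body are assumptions.
package org.gestore.plugin.source;

import java.util.Hashtable;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class exampleSource {
  // Must be public and take (Hashtable, FileSystem) so getFile() can resolve it via
  // getMethod("process", Hashtable.class, FileSystem.class).
  public FileStatus[] process(Hashtable<String, String> config, FileSystem fs) throws Exception {
    // Fetch or generate the raw files into the caller-provided temp area ...
    Path tempDir = new Path(config.get("temp_path_base"));
    // ... and report what was produced so getFile() can move it into place.
    return fs.listStatus(tempDir);
  }
}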
/**
 * A tracker wants to know if there's a Task to run. Returns a task we'd like the TaskTracker to
 * execute right now.
 *
 * <p>Eventually this function should compute load on the various TaskTrackers, and incorporate
 * knowledge of DFS file placement. But for right now, it just grabs a single item out of the
 * pending task list and hands it back.
 */
public synchronized Task pollForNewTask(String taskTracker) {
  //
  // Compute average map and reduce task numbers across pool
  //
  int avgMaps = 0;
  int avgReduces = 0;
  int numTaskTrackers;
  TaskTrackerStatus tts;
  synchronized (taskTrackers) {
    numTaskTrackers = taskTrackers.size();
    tts = (TaskTrackerStatus) taskTrackers.get(taskTracker);
  }
  if (numTaskTrackers > 0) {
    avgMaps = totalMaps / numTaskTrackers;
    avgReduces = totalReduces / numTaskTrackers;
  }
  int totalCapacity = numTaskTrackers * maxCurrentTasks;

  //
  // Get map + reduce counts for the current tracker.
  //
  if (tts == null) {
    LOG.warning("Unknown task tracker polling; ignoring: " + taskTracker);
    return null;
  }
  int numMaps = tts.countMapTasks();
  int numReduces = tts.countReduceTasks();

  //
  // In the below steps, we allocate first a map task (if appropriate),
  // and then a reduce task if appropriate. We go through all jobs
  // in order of job arrival; jobs only get serviced if their
  // predecessors are serviced, too.
  //

  //
  // We hand a task to the current taskTracker if the given machine
  // has a workload that's equal to or less than the averageMaps
  // +/- TASK_ALLOC_EPSILON. (That epsilon is in place in case
  // there is an odd machine that is failing for some reason but
  // has not yet been removed from the pool, making capacity seem
  // larger than it really is.)
  //
  synchronized (jobsByArrival) {
    if ((numMaps < maxCurrentTasks) && (numMaps <= (avgMaps + TASK_ALLOC_EPSILON))) {
      int totalNeededMaps = 0;
      for (Iterator it = jobsByArrival.iterator(); it.hasNext(); ) {
        JobInProgress job = (JobInProgress) it.next();
        if (job.getStatus().getRunState() != JobStatus.RUNNING) {
          continue;
        }
        Task t = job.obtainNewMapTask(taskTracker, tts);
        if (t != null) {
          return t;
        }

        //
        // Beyond the highest-priority task, reserve a little
        // room for failures and speculative executions; don't
        // schedule tasks to the hilt.
        //
        totalNeededMaps += job.desiredMaps();
        double padding = 0;
        if (totalCapacity > MIN_SLOTS_FOR_PADDING) {
          padding = Math.min(maxCurrentTasks, totalNeededMaps * PAD_FRACTION);
        }
        if (totalNeededMaps + padding >= totalCapacity) {
          break;
        }
      }
    }

    //
    // Same thing, but for reduce tasks
    //
    if ((numReduces < maxCurrentTasks) && (numReduces <= (avgReduces + TASK_ALLOC_EPSILON))) {
      int totalNeededReduces = 0;
      for (Iterator it = jobsByArrival.iterator(); it.hasNext(); ) {
        JobInProgress job = (JobInProgress) it.next();
        if (job.getStatus().getRunState() != JobStatus.RUNNING) {
          continue;
        }
        Task t = job.obtainNewReduceTask(taskTracker, tts);
        if (t != null) {
          return t;
        }

        //
        // Beyond the highest-priority task, reserve a little
        // room for failures and speculative executions; don't
        // schedule tasks to the hilt.
        //
        totalNeededReduces += job.desiredReduces();
        double padding = 0;
        if (totalCapacity > MIN_SLOTS_FOR_PADDING) {
          padding = Math.min(maxCurrentTasks, totalNeededReduces * PAD_FRACTION);
        }
        if (totalNeededReduces + padding >= totalCapacity) {
          break;
        }
      }
    }
  }
  return null;
}
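// A worked example of the allocation thresholds above. All constants and cluster numbers here
// are illustrative assumptions, not values taken from the code: a tracker is only eligible for
// another map while its own map count stays within TASK_ALLOC_EPSILON of the pool average, and
// the padding term keeps a slice of total capacity free for retries and speculative execution.
public class TaskAllocExample {
  static final double TASK_ALLOC_EPSILON = 0.01; // illustrative value
  static final double PAD_FRACTION = 0.01;       // illustrative value
  static final int MIN_SLOTS_FOR_PADDING = 4;    // illustrative value

  public static void main(String[] args) {
    int numTaskTrackers = 10, maxCurrentTasks = 2, totalMaps = 12;
    int avgMaps = totalMaps / numTaskTrackers;             // 1
    int totalCapacity = numTaskTrackers * maxCurrentTasks; // 20

    int numMaps = 2; // this tracker is already running 2 maps
    boolean eligible = (numMaps < maxCurrentTasks) && (numMaps <= avgMaps + TASK_ALLOC_EPSILON);
    System.out.println("eligible for another map: " + eligible); // false: already at maxCurrentTasks

    int totalNeededMaps = 19;
    double padding =
        (totalCapacity > MIN_SLOTS_FOR_PADDING)
            ? Math.min(maxCurrentTasks, totalNeededMaps * PAD_FRACTION)
            : 0;
    // 19 + min(2, 0.19) = 19.19 < 20, so the job scan continues; once 20 maps are needed,
    // the padded demand reaches capacity and the loop breaks.
    System.out.println("stop scanning jobs: " + (totalNeededMaps + padding >= totalCapacity));
  }
}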