/**
 * Reschedules work that was in flight on workers whose connections were lost. For every lost
 * send port whose worker had not already finished, the first job still scheduled on that node is
 * pulled back into the bag of tasks and the worker is marked finished.
 */
@Override
protected void handleLostConnections() {
  for (SendPortIdentifier lost : masterRP.lostConnections()) {
    System.out.println("lost connection with " + lost.ibisIdentifier().location().toString());
    String cluster = lost.ibisIdentifier().location().getParent().toString();
    String node = lost.ibisIdentifier().location().getLevel(0);
    if (!workers.get(cluster).get(node).isFinished()) {
      for (Job j : schedJobs.values()) {
        if (j.getNode().equals(node)) {
          // Removing by key and breaking immediately keeps iteration over values() safe.
          schedJobs.remove(j.getJobID());
          /*begin hpdc tests*/
          if (j.getNode().contains("slow")) {
            // Scale the expected runtime back up (inverse of the 2/3 "slow" scaling applied
            // when the job was handed out) before rescheduling it elsewhere.
            j.args[0] = Long.toString(3 * Long.parseLong(j.args[0]) / 2);
          }
          /*end hpdc tests*/
          bot.tasks.add(j);
          workers.get(cluster).get(j.getNode()).workerFinished(System.currentTimeMillis());
          System.err.println(
              "Node " + node + " in cluster " + cluster
                  + " failed during execution of job " + j.jobID);
          // NOTE(review): only the first matching job is rescheduled per lost connection —
          // presumably at most one job is in flight per node; confirm.
          break;
        }
      }
    }
  }
}
/**
 * Retrieves the value associated with the provided iteration of the given optimizing job.
 *
 * @param iteration The job iteration for which to retrieve the value.
 * @return The value of the statistic being optimized, aggregated across all of the iteration's
 *     trackers for that statistic.
 * @throws SLAMDException If the iteration recorded no data for the statistic being optimized.
 */
@Override()
public double getIterationOptimizationValue(Job iteration) throws SLAMDException {
  StatTracker[] statTrackers = iteration.getStatTrackers(optimizeStat);
  boolean noData = (statTrackers == null) || (statTrackers.length == 0);
  if (noData) {
    throw new SLAMDException(
        "The provided optimizing job iteration did not include any values for the statistic to optimize, \""
            + optimizeStat
            + "\".");
  }

  // Aggregate every tracker into a fresh instance so the originals are left untouched.
  StatTracker aggregated = statTrackers[0].newInstance();
  aggregated.aggregate(statTrackers);
  double value = aggregated.getSummaryValue();

  iteration.slamdServer.logMessage(
      Constants.LOG_LEVEL_JOB_DEBUG,
      "SingleStatisticWithReplicationLatencyOptimizationAlgorithm.getIterationOptimizationValue("
          + iteration.getJobID()
          + ") returning "
          + value);
  return value;
}
@Override protected Job handleJobResult(JobResult received, IbisIdentifier from) { // TODO Auto-generated method stub String cluster = from.location().getParent().toString(); System.err.println(from.location().toString() + " " + received.getStats().getRuntime()); /* assumes jobs don't need to be replicated on the same cluster, except on failure */ Job doneJob = schedJobs.remove(received.getJobID()); workers .get(cluster) .get(from.location().getLevel(0)) .addJobStats(received.getStats().getRuntime()); /*create category if it doesn't exist yet * upper duration since we pay in discrete increments of priced time unit*/ doneJobs.put(doneJob.getJobID(), doneJob); if (hosts.get(from.location().toString()).schedJobs.size() == 0) return sayGB(from); Job nextJob = hosts.get(from.location().toString()).schedJobs.remove(0); nextJob.startTime = System.nanoTime(); return nextJob; }
/**
 * Indicates whether the provided iteration is the best one seen so far for the given optimizing
 * job based on the constraints specified in the parameters used to initialize this optimization
 * algorithm.
 *
 * @param iteration The job iteration for which to make the determination.
 * @return <CODE>true</CODE> if the provided iteration is the best one seen so far for the
 *     optimizing job, or <CODE>false</CODE> if not.
 * @throws SLAMDException If a problem occurs that prevents a valid determination from being made.
 *     If this exception is thrown, then the optimizing job will stop immediately with no further
 *     iterations.
 */
@Override()
public boolean isBestIterationSoFar(Job iteration) throws SLAMDException {
  SLAMDServer slamdServer = iteration.slamdServer;

  // An iteration can never be "best" unless its replication latency data is acceptable.
  if (!isAcceptableReplicationLatency(iteration)) {
    slamdServer.logMessage(
        Constants.LOG_LEVEL_JOB_DEBUG,
        "SingleStatisticWithReplicationLatency"
            + "OptimizationAlgorithm.isBestIterationSoFar("
            + iteration.getJobID()
            + ") returning false "
            + "because the iteration does not have acceptable "
            + "replication latency data.");
    return false;
  }

  double iterationValue = getIterationOptimizationValue(iteration);

  // bestValueSoFar stays NaN until the first usable iteration; any non-NaN value wins then.
  if (Double.isNaN(bestValueSoFar) && (!Double.isNaN(iterationValue))) {
    bestValueSoFar = iterationValue;
    slamdServer.logMessage(
        Constants.LOG_LEVEL_JOB_DEBUG,
        "SingleStatisticWithReplicationLatency"
            + "OptimizationAlgorithm.isBestIterationSoFar("
            + iteration.getJobID()
            + ") returning true "
            + "because iteration value "
            + iterationValue
            + " is not NaN but current best is NaN.");
    return true;
  }

  switch (optimizeType) {
    case OPTIMIZE_TYPE_MAXIMIZE:
      // A larger value only counts as "best" if it beats the old best by at least
      // minPctImprovement (expressed as a fraction of the old best).
      if (iterationValue > bestValueSoFar) {
        if (iterationValue > bestValueSoFar + bestValueSoFar * minPctImprovement) {
          slamdServer.logMessage(
              Constants.LOG_LEVEL_JOB_DEBUG,
              "SingleStatisticWithReplicationLatency"
                  + "OptimizationAlgorithm."
                  + "isBestIterationSoFar("
                  + iteration.getJobID()
                  + ") returning true "
                  + "because iteration value "
                  + iterationValue
                  + " is greater than previous best value "
                  + bestValueSoFar
                  + " by at least "
                  + (minPctImprovement * 100)
                  + "%.");
          // The best-so-far is only updated when the improvement margin is met.
          bestValueSoFar = iterationValue;
          return true;
        } else {
          slamdServer.logMessage(
              Constants.LOG_LEVEL_JOB_DEBUG,
              "SingleStatisticWithReplicationLatency"
                  + "OptimizationAlgorithm."
                  + "isBestIterationSoFar("
                  + iteration.getJobID()
                  + ") returning false "
                  + "because iteration value "
                  + iterationValue
                  + " is greater than previous best value "
                  + bestValueSoFar
                  + " but the margin of "
                  + "improvement is less than "
                  + (minPctImprovement * 100)
                  + "%.");
          return false;
        }
      } else {
        slamdServer.logMessage(
            Constants.LOG_LEVEL_JOB_DEBUG,
            "SingleStatisticWithReplicationLatency"
                + "OptimizationAlgorithm.isBestIterationSoFar("
                + iteration.getJobID()
                + ") returning false "
                + "because iteration value "
                + iterationValue
                + " is less than previous best value "
                + bestValueSoFar);
        return false;
      }
    case OPTIMIZE_TYPE_MINIMIZE:
      // Mirror image of the maximize case: smaller is better, same margin requirement.
      if (iterationValue < bestValueSoFar) {
        if (iterationValue < bestValueSoFar - bestValueSoFar * minPctImprovement) {
          slamdServer.logMessage(
              Constants.LOG_LEVEL_JOB_DEBUG,
              "SingleStatisticWithReplicationLatency"
                  + "OptimizationAlgorithm."
                  + "isBestIterationSoFar("
                  + iteration.getJobID()
                  + ") returning true "
                  + "because iteration value "
                  + iterationValue
                  + " is less than previous best value "
                  + bestValueSoFar
                  + " by at least "
                  + (minPctImprovement * 100)
                  + "%.");
          bestValueSoFar = iterationValue;
          return true;
        } else {
          slamdServer.logMessage(
              Constants.LOG_LEVEL_JOB_DEBUG,
              "SingleStatisticWithReplicationLatency"
                  + "OptimizationAlgorithm."
                  + "isBestIterationSoFar("
                  + iteration.getJobID()
                  + ") returning false "
                  + "because iteration value "
                  + iterationValue
                  + " is less than previous best value "
                  + bestValueSoFar
                  + " but the margin of "
                  + "improvement is less than "
                  + (minPctImprovement * 100)
                  + "%.");
          return false;
        }
      } else {
        slamdServer.logMessage(
            Constants.LOG_LEVEL_JOB_DEBUG,
            "SingleStatisticWithReplicationLatency"
                + "OptimizationAlgorithm.isBestIterationSoFar("
                + iteration.getJobID()
                + ") returning false "
                + "because iteration value "
                + iterationValue
                + " is greater than previous best value "
                + bestValueSoFar);
        return false;
      }
    default:
      // Unknown optimization type: never accept the iteration.
      slamdServer.logMessage(
          Constants.LOG_LEVEL_JOB_DEBUG,
          "SingleStatisticWithReplicationLatency"
              + "OptimizationAlgorithm.isBestIterationSoFar("
              + iteration.getJobID()
              + ") returning false "
              + "because an unknown optimization type of "
              + optimizeType
              + " is being used.");
      return false;
  }
}
/** * Indicates whether the provided job iteration has an acceptable CPU utilization. * * @param iteration The iteration for which to make the determination. * @return <CODE>true</CODE> if the CPU utilization for the provided iteration is acceptable, or * <CODE>false</CODE> if not. * @throws SLAMDException If the provided iteration does not include sufficient CPU utilization * data to make the determination. */ private boolean isAcceptableCPUUtilization(Job iteration) throws SLAMDException { SLAMDServer slamdServer = iteration.slamdServer; boolean utilizationFound = false; String className = VMStatResourceMonitor.class.getName(); ResourceMonitorStatTracker[] monitorTrackers = iteration.getResourceMonitorStatTrackersForClass(className); for (int i = 0; i < monitorTrackers.length; i++) { StatTracker tracker = monitorTrackers[i].getStatTracker(); String name = tracker.getDisplayName(); if ((tracker instanceof StackedValueTracker) && name.endsWith(VMStatResourceMonitor.STAT_TRACKER_CPU_UTILIZATION)) { utilizationFound = true; StackedValueTracker utilizationTracker = (StackedValueTracker) tracker; double userTime = utilizationTracker.getAverageValue(VMStatResourceMonitor.UTILIZATION_CATEGORY_USER); double systemTime = utilizationTracker.getAverageValue(VMStatResourceMonitor.UTILIZATION_CATEGORY_SYSTEM); double busyTime = userTime + systemTime; switch (utilizationComponent) { case UTILIZATION_COMPONENT_USER_TIME: if (userTime > maxUtilization) { slamdServer.logMessage( Constants.LOG_LEVEL_JOB_DEBUG, "SingleStatisticWithCPUUtilization" + "OptimizationAlgorithm.isAcceptableCPU" + "Utilization(" + iteration.getJobID() + ") returning false because user time of " + userTime + " for stat " + tracker.getDisplayName() + " exceeded the maximum allowed of " + maxUtilization); return false; } break; case UTILIZATION_COMPONENT_SYSTEM_TIME: if (systemTime > maxUtilization) { slamdServer.logMessage( Constants.LOG_LEVEL_JOB_DEBUG, "SingleStatisticWithCPUUtilization" + 
"OptimizationAlgorithm.isAcceptableCPU" + "Utilization(" + iteration.getJobID() + ") returning false because system time " + "of " + systemTime + " for stat " + tracker.getDisplayName() + " exceeded the maximum allowed of " + maxUtilization); return false; } break; case UTILIZATION_COMPONENT_BUSY_TIME: if (busyTime > maxUtilization) { slamdServer.logMessage( Constants.LOG_LEVEL_JOB_DEBUG, "SingleStatisticWithCPUUtilization" + "OptimizationAlgorithm.isAcceptableCPU" + "Utilization(" + iteration.getJobID() + ") returning false because busy time of " + busyTime + " for stat " + tracker.getDisplayName() + " exceeded the maximum allowed of " + maxUtilization); return false; } break; default: slamdServer.logMessage( Constants.LOG_LEVEL_JOB_DEBUG, "SingleStatisticWithCPUUtilization" + "OptimizationAlgorithm.isAcceptableCPU" + "Utilization(" + iteration.getJobID() + ") returning false because an unknown " + "utilization component of " + utilizationComponent + " is in use."); return false; } } else if ((tracker instanceof IntegerValueTracker) && ((utilizationComponent == UTILIZATION_COMPONENT_USER_TIME) && name.endsWith(VMStatResourceMonitor.STAT_TRACKER_CPU_USER)) || ((utilizationComponent == UTILIZATION_COMPONENT_SYSTEM_TIME) && name.endsWith(VMStatResourceMonitor.STAT_TRACKER_CPU_SYSTEM)) || ((utilizationComponent == UTILIZATION_COMPONENT_BUSY_TIME) && name.endsWith(VMStatResourceMonitor.STAT_TRACKER_CPU_BUSY))) { utilizationFound = true; double value = ((IntegerValueTracker) tracker).getAverageValue(); if (value > maxUtilization) { slamdServer.logMessage( Constants.LOG_LEVEL_JOB_DEBUG, "SingleStatisticWithCPUUtilization" + "OptimizationAlgorithm.isAcceptableCPU" + "Utilization(" + iteration.getJobID() + ") returning false because value of " + value + " for stat " + tracker.getDisplayName() + " exceeded the maximum allowed of " + maxUtilization); return false; } } } if (!utilizationFound) { throw new SLAMDException( "The provided job iteration did not include " + "any 
CPU utilization data."); } slamdServer.logMessage( Constants.LOG_LEVEL_JOB_DEBUG, "SingleStatisticWithCPUUtilizationOptimization" + "Algorithm.isAcceptableCPUUtilization(" + iteration.getJobID() + ") returning true."); return true; }
/**
 * Master main loop: waits until all workers have signed on, precomputes the complete schedule
 * over the bag of tasks, pushes each worker its first job, then answers job results with
 * follow-up jobs until areWeDone() reports completion.
 */
public void run() {
  // Receive timeout is a fraction of the overall deadline (deadline presumably in minutes,
  // timeout in milliseconds — TODO confirm units).
  timeout = (long) (BoTRunner.INITIAL_TIMEOUT_PERCENT * bot.deadline * 60000);
  System.err.println("Timeout is now " + timeout);

  /* first receive requests from all workers */
  while (hosts.size() != maxWorkers) {
    ReadMessage rm;
    try {
      rm = masterRP.receive(timeout);
      Object received = rm.readObject();
      IbisIdentifier from = rm.origin().ibisIdentifier();
      rm.finish();
      hosts.put(from.location().toString(), new Host(from));
      String cluster = from.location().getParent().toString();
      /*DEBUG*/ System.err.println(
          "job request from node "
              + from.location().toString()
              + " in cluster "
              + cluster
              + "; number of hosts is now "
              + hosts.size());
    } catch (IOException e) {
      // Sign-on failures are only logged; keep waiting for the remaining workers.
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
  }

  /* then precompute schedule: repeatedly pick, over all unscheduled jobs, the job whose best
   * (minimum) completion time across hosts is LARGEST, and bind it to that host — a max-min
   * heuristic over host earliest-availability times (EAT). */
  while (bot.tasks.size() != 0) {
    long mct = Long.MIN_VALUE; // best (max over jobs) of the per-job minimum completion times
    String bestHost = "";
    Job schedJob = null;
    for (Job j : bot.tasks) {
      long mctj = Long.MAX_VALUE; // minimum completion time for job j over all hosts
      String bestHostJ = "";
      long et = Long.parseLong(j.args[0]); // expected runtime carried in args[0]
      for (Host host : hosts.values()) {
        if (host.node.contains("slow")) {
          // "slow" nodes are credited with 2/3 of the nominal runtime (hpdc test setup).
          if (mctj > host.EAT + 2 * et / 3) {
            mctj = host.EAT + 2 * et / 3;
            bestHostJ = host.node;
          }
        } else {
          if (mctj > host.EAT + et) {
            mctj = host.EAT + et;
            bestHostJ = host.node;
          }
        }
      }
      if (mct < mctj) {
        mct = mctj;
        bestHost = bestHostJ;
        schedJob = j;
      }
    }
    hosts.get(bestHost).addJob(schedJob);
    schedJobs.put(schedJob.jobID, schedJob);
    bot.tasks.remove(schedJob);
    System.out.println(
        "Job "
            + schedJob.jobID
            + " with et: "
            + schedJob.args[0]
            + " was scheduled on machine "
            + bestHost
            + "; EAT is now "
            + hosts.get(bestHost).EAT);
  }

  // Makespan estimate: the largest earliest-availability time across all hosts.
  long meat = Long.MIN_VALUE;
  for (Host host : hosts.values()) {
    if (host.EAT > meat) meat = host.EAT;
  }
  System.out.println("Longest run should be: " + meat / 60 + "m" + meat % 60 + "s");
  actualStartTime = System.currentTimeMillis();

  /* send first job to each worker */
  for (Host host : hosts.values()) {
    /*begin for hpdc tests*/
    Job nextJob = handleJobRequest(host.from);
    nextJob.setNode(host.from.location().getLevel(0));
    if ((!(nextJob instanceof NoJob)) && (nextJob.submitted != true)) {
      long sleep = Long.parseLong(nextJob.args[0]);
      // Jobs handed to the "slow" cluster get their nominal runtime scaled down to 2/3.
      if (host.from.location().getParent().toString().compareTo("slow") == 0) {
        nextJob.args[0] = new Long(2 * sleep / 3).toString();
      }
      nextJob.submitted = true;
    }
    /*end for hpdc tests*/
    SendPort workReplyPort;
    try {
      // One-shot send port per reply; closed immediately after the job is written.
      workReplyPort = myIbis.createSendPort(masterReplyPortType);
      workReplyPort.connect(host.from, "worker");
      WriteMessage wm = workReplyPort.newMessage();
      wm.writeObject(nextJob);
      wm.finish();
      workReplyPort.close();
    } catch (IOException e) {
      // Send failures during the initial round are only logged.
      e.printStackTrace();
    }
  }

  // Serve results until areWeDone() says the run is complete.
  boolean undone = true;
  while (undone) {
    try {
      ReadMessage rm = masterRP.receive(timeout);
      Object received = rm.readObject();
      IbisIdentifier from = rm.origin().ibisIdentifier();
      rm.finish();
      Job nextJob = null;
      if (received instanceof JobResult) {
        nextJob = handleJobResult((JobResult) received, from);
      } else {
        throw new RuntimeException(
            "received " + "an object which is not JobResult:" + received);
      }
      nextJob.setNode(from.location().getLevel(0));
      /*begin for hpdc tests*/
      if (!(nextJob instanceof NoJob)) {
        long sleep = Long.parseLong(nextJob.args[0]);
        if (from.location().getParent().toString().compareTo("slow") == 0) {
          nextJob.args[0] = new Long(2 * sleep / 3).toString();
        }
      }
      /*end for hpdc tests*/
      SendPort workReplyPort = myIbis.createSendPort(masterReplyPortType);
      workReplyPort.connect(from, "worker");
      WriteMessage wm = workReplyPort.newMessage();
      wm.writeObject(nextJob);
      wm.finish();
      workReplyPort.close();
      undone = !areWeDone();
    } catch (ReceiveTimedOutException rtoe) {
      System.err.println("I timed out!");
      undone = !areWeDone();
    } catch (ConnectionFailedException cfe) {
      // The worker died before its next job arrived: put that job back in the bag.
      String cluster = cfe.ibisIdentifier().location().getParent().toString();
      String node = cfe.ibisIdentifier().location().getLevel(0);
      for (Job j : schedJobs.values())
        if (j.getNode().compareTo(node) == 0) {
          // Remove-by-key followed by an immediate break keeps this iteration safe.
          schedJobs.remove(j.getJobID());
          /*begin hpdc tests*/
          if (j.getNode().contains("slow")) {
            // Undo the 2/3 "slow" scaling before rescheduling elsewhere.
            j.args[0] = new Long(3 * Long.parseLong(j.args[0]) / 2).toString();
          }
          /*end hpdc tests*/
          bot.tasks.add(j);
          workers.get(cluster).get(j.getNode()).workerFinished(System.currentTimeMillis());
          System.err.println(
              "Node "
                  + cfe.ibisIdentifier().location().toString()
                  + " failed before receiving job "
                  + j.jobID);
          break;
        }
    } catch (IOException ioe) {
      ioe.printStackTrace();
      undone = !areWeDone();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
  }
}
/** * Receive a job and add it to the master JobList if its data is valid. * * @param theJob the Job to potentially add to Schedule's List of Jobs. * @return true if theJob was added; false otherwise. */ public boolean receiveJob(Job theJob) throws IllegalArgumentException { /* * We subject both the Job and JobList to a wide variety of tests. * If all of the tests are passed, then we add the Job to the JobList. */ boolean addFlag = true; if (theJob == null) addFlag = false; // Check to ensure that the total number of pending jobs is less than 30. else if (!(new BusinessRule1().test(myJobList))) { throw new IllegalArgumentException( "Sorry, but the limit of 30 pending jobs has already been reached."); } /* * Check to ensure that the total number of Jobs for the week that this Job is to occur is * less than 5. * A week is defined as 3 days before a Job's Start Date and 3 days after its End Date. */ else if (!(new BusinessRule2().test(theJob, myJobList))) { throw new IllegalArgumentException( "Sorry, but the limit of 5 jobs has already been reached for the week that " + "this job was scheduled."); } // Check that the Job is not scheduled to last longer than two days. else if (!(new BusinessRule4().test(theJob))) { throw new IllegalArgumentException("Sorry, but a job cannot last any longer than two days."); } // Check that a Job is scheduled to begin after the current date. else if (!(new BusinessRule5().pastTest(theJob))) { throw new IllegalArgumentException( "Sorry but the date you entered for this " + "job has already passed."); } // Check that a Job is scheduled to begin within the next three months. else if (!(new BusinessRule5().futureTest(theJob))) { throw new IllegalArgumentException( "Sorry but the date you entered is too far " + "into the future. \nAll jobs must be scheduled within the next 3 months."); } // Check that the Start Date and End Date are not swapped. 
else if (theJob.getStartDate().after(theJob.getEndDate())) { throw new IllegalArgumentException( "Sorry, but the Start Date must come before or on the same date as the End Date."); } // Check that the Job ID is valid. else if (theJob.getJobID() < 0 || theJob.getJobID() > Job.MAX_NUM_JOBS) { throw new IllegalArgumentException("Error: Invalid Job ID. Please logout and try again."); } // Check that the Volunteer List is not null. else if (theJob.getVolunteerList() == null) { throw new IllegalArgumentException( "Error: Null Volunteer List. Please logout and try again."); } // Check that the Volunteer List is empty. else if (!theJob.getVolunteerList().isEmpty()) { throw new IllegalArgumentException( "Error: Non-empty Volunteer List. Please logout and try again."); } // Check that there is at least one slot available for a Volunteer to sign up for. else if (!theJob.hasLightRoom() && !theJob.hasMediumRoom() && !theJob.hasHeavyRoom()) { throw new IllegalArgumentException( "Sorry, but a slot in at least one Volunteer Grade must be available."); } // Check that none of the slots are set to negative numbers. else if (theJob.getLightCurrent() > theJob.getLightMax() || theJob.getMediumCurrent() > theJob.getMediumMax() || theJob.getHeavyCurrent() > theJob.getHeavyMax()) { throw new IllegalArgumentException( "Sorry, but the number of slots for a Volunteer Grade cannot be negative."); } // Check that the Park for the Job is not null. else if (theJob.getPark() == null) { throw new IllegalArgumentException("Error: Null Park. Please logout and try again."); } // Check that the ParkManager for the Job is not null. else if (theJob.getManager() == null) { throw new IllegalArgumentException("Error: Null ParkManager. Please logout and try again."); } // If all tests passed, then we add the Job to the Schedule. 
if (addFlag) { // To get the master job list which is editable List<Job> editableJobList = myJobList.getJobList(); editableJobList.add(theJob); // add valid job to list } else { // If we somehow got here without throwing an exception, and the Job is not valid, then we // throw a general exception. throw new IllegalArgumentException( "Error: Job data is invalid for unknown reasons. Please logout and try again."); } return addFlag; }