/**
 * Puts the job of every unfinished node that lost its connection back into the bag.
 *
 * <p>For each identifier reported by {@code masterRP.lostConnections()} whose worker
 * record is not yet finished, the first job in {@code schedJobs} assigned to that node
 * is removed from {@code schedJobs}, added back to {@code bot.tasks}, and the worker
 * record is marked finished at the current time.
 */
@Override
protected void handleLostConnections() {
    for (SendPortIdentifier lost : masterRP.lostConnections()) {
        System.out.println("lost connection with " + lost.ibisIdentifier().location().toString());
        String cluster = lost.ibisIdentifier().location().getParent().toString();
        String node = lost.ibisIdentifier().location().getLevel(0);

        if (workers.get(cluster).get(node).isFinished()) {
            // The node had already finished; nothing to re-queue.
            continue;
        }

        for (Job j : schedJobs.values()) {
            // Null-safe comparison: a job whose node has not been set must simply not match.
            if (!node.equals(j.getNode())) {
                continue;
            }
            schedJobs.remove(j.getJobID());
            /*begin hpdc tests*/
            if (j.getNode().contains("slow")) {
                // Scale the first argument back up by 3/2 (integer arithmetic) before re-queuing.
                j.args[0] = Long.toString(3 * Long.parseLong(j.args[0]) / 2);
            }
            /*end hpdc tests*/
            bot.tasks.add(j);
            workers.get(cluster).get(node).workerFinished(System.currentTimeMillis());
            System.err.println(
                    "Node " + node + " in cluster " + cluster
                            + " failed during execution of job " + j.jobID);
            // Only one job per lost node is re-queued. Leaving the loop right after the
            // removal also avoids continuing an iteration over a modified collection
            // (assuming schedJobs is a java.util map -- confirm).
            break;
        }
    }
}
public void run() { // TODO Auto-generated method stub timeout = (long) (BoTRunner.INITIAL_TIMEOUT_PERCENT * bot.deadline * 60000); System.err.println("Timeout is now " + timeout); /*first receive requests from all workers*/ while (hosts.size() != maxWorkers) { ReadMessage rm; try { rm = masterRP.receive(timeout); Object received = rm.readObject(); IbisIdentifier from = rm.origin().ibisIdentifier(); rm.finish(); hosts.put(from.location().toString(), new Host(from)); String cluster = from.location().getParent().toString(); /*DEBUG*/ System.err.println( "job request from node " + from.location().toString() + " in cluster " + cluster + "; number of hosts is now " + hosts.size()); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ClassNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /*then precompute schedule*/ while (bot.tasks.size() != 0) { long mct = Long.MIN_VALUE; String bestHost = ""; Job schedJob = null; for (Job j : bot.tasks) { long mctj = Long.MAX_VALUE; String bestHostJ = ""; long et = Long.parseLong(j.args[0]); for (Host host : hosts.values()) { if (host.node.contains("slow")) { if (mctj > host.EAT + 2 * et / 3) { mctj = host.EAT + 2 * et / 3; bestHostJ = host.node; } } else { if (mctj > host.EAT + et) { mctj = host.EAT + et; bestHostJ = host.node; } } } if (mct < mctj) { mct = mctj; bestHost = bestHostJ; schedJob = j; } } hosts.get(bestHost).addJob(schedJob); schedJobs.put(schedJob.jobID, schedJob); bot.tasks.remove(schedJob); System.out.println( "Job " + schedJob.jobID + " with et: " + schedJob.args[0] + " was scheduled on machine " + bestHost + "; EAT is now " + hosts.get(bestHost).EAT); } long meat = Long.MIN_VALUE; for (Host host : hosts.values()) { if (host.EAT > meat) meat = host.EAT; } System.out.println("Longest run should be: " + meat / 60 + "m" + meat % 60 + "s"); actualStartTime = System.currentTimeMillis(); /*send first job to each worker*/ for (Host host : 
hosts.values()) { /*begin for hpdc tests*/ Job nextJob = handleJobRequest(host.from); nextJob.setNode(host.from.location().getLevel(0)); if ((!(nextJob instanceof NoJob)) && (nextJob.submitted != true)) { long sleep = Long.parseLong(nextJob.args[0]); if (host.from.location().getParent().toString().compareTo("slow") == 0) { nextJob.args[0] = new Long(2 * sleep / 3).toString(); } nextJob.submitted = true; } /*end for hpdc tests*/ SendPort workReplyPort; try { workReplyPort = myIbis.createSendPort(masterReplyPortType); workReplyPort.connect(host.from, "worker"); WriteMessage wm = workReplyPort.newMessage(); wm.writeObject(nextJob); wm.finish(); workReplyPort.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } boolean undone = true; while (undone) { try { ReadMessage rm = masterRP.receive(timeout); Object received = rm.readObject(); IbisIdentifier from = rm.origin().ibisIdentifier(); rm.finish(); Job nextJob = null; if (received instanceof JobResult) { nextJob = handleJobResult((JobResult) received, from); } else { throw new RuntimeException("received " + "an object which is not JobResult:" + received); } nextJob.setNode(from.location().getLevel(0)); /*begin for hpdc tests*/ if (!(nextJob instanceof NoJob)) { long sleep = Long.parseLong(nextJob.args[0]); if (from.location().getParent().toString().compareTo("slow") == 0) { nextJob.args[0] = new Long(2 * sleep / 3).toString(); } } /*end for hpdc tests*/ SendPort workReplyPort = myIbis.createSendPort(masterReplyPortType); workReplyPort.connect(from, "worker"); WriteMessage wm = workReplyPort.newMessage(); wm.writeObject(nextJob); wm.finish(); workReplyPort.close(); undone = !areWeDone(); } catch (ReceiveTimedOutException rtoe) { System.err.println("I timed out!"); undone = !areWeDone(); } catch (ConnectionFailedException cfe) { String cluster = cfe.ibisIdentifier().location().getParent().toString(); String node = cfe.ibisIdentifier().location().getLevel(0); for (Job j : 
schedJobs.values()) if (j.getNode().compareTo(node) == 0) { schedJobs.remove(j.getJobID()); /*begin hpdc tests*/ if (j.getNode().contains("slow")) { j.args[0] = new Long(3 * Long.parseLong(j.args[0]) / 2).toString(); } /*end hpdc tests*/ bot.tasks.add(j); workers.get(cluster).get(j.getNode()).workerFinished(System.currentTimeMillis()); System.err.println( "Node " + cfe.ibisIdentifier().location().toString() + " failed before receiving job " + j.jobID); break; } } catch (IOException ioe) { ioe.printStackTrace(); undone = !areWeDone(); } catch (ClassNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }