Ejemplo n.º 1
0
  /**
   * Puts the job of every worker whose connection was lost back into the bag of tasks.
   *
   * <p>For each identifier reported by {@code masterRP.lostConnections()} whose worker record is
   * not yet marked finished, the first entry of {@code schedJobs} assigned to that node is removed
   * from {@code schedJobs}, re-added to {@code bot.tasks}, and the worker is marked finished at
   * the current time. At most one job per lost connection is re-queued.
   */
  @Override
  protected void handleLostConnections() {
    for (SendPortIdentifier lost : masterRP.lostConnections()) {
      System.out.println("lost connection with " + lost.ibisIdentifier().location().toString());
      String cluster = lost.ibisIdentifier().location().getParent().toString();
      String node = lost.ibisIdentifier().location().getLevel(0);

      // A worker that already finished has nothing left to re-queue.
      if (workers.get(cluster).get(node).isFinished()) {
        continue;
      }

      // Locate the job first and mutate schedJobs only after the iteration is over.
      // The comparison is anchored on `node` so that a job without an assigned node
      // is skipped instead of raising a NullPointerException.
      Job orphan = null;
      for (Job candidate : schedJobs.values()) {
        if (node.equals(candidate.getNode())) {
          orphan = candidate;
          break;
        }
      }
      if (orphan == null) {
        continue;
      }

      schedJobs.remove(orphan.getJobID());

      /*begin hpdc tests*/
      // Undo the 2/3 run-time scaling before the job is handed out again.
      // NOTE(review): this tests the node name, while the scaling elsewhere appears to be keyed
      // on the cluster name - confirm both refer to the same set of machines.
      if (orphan.getNode().contains("slow")) {
        orphan.args[0] = Long.toString(3 * Long.parseLong(orphan.args[0]) / 2);
      }
      /*end hpdc tests*/

      bot.tasks.add(orphan);
      workers.get(cluster).get(orphan.getNode()).workerFinished(System.currentTimeMillis());

      System.err.println(
          "Node "
              + node
              + " in cluster "
              + cluster
              + " failed during execution of job "
              + orphan.jobID);
    }
  }
Ejemplo n.º 2
0
  /**
   * Master main loop.
   *
   * <p>Runs four phases in order:
   *
   * <ol>
   *   <li>Waits until {@code hosts} contains {@code maxWorkers} entries, registering every worker
   *       that sends a message to {@code masterRP}.
   *   <li>Precomputes a static schedule: repeatedly picks, among the remaining tasks, the one
   *       whose best achievable completion time over all hosts is the largest, and assigns it to
   *       the host that achieves that time.
   *   <li>Sends a first job to every registered host.
   *   <li>Answers every incoming {@code JobResult} with the next job until {@code areWeDone()}
   *       returns true.
   * </ol>
   *
   * <p>I/O and deserialization failures are printed and the loops carry on; a
   * {@code ConnectionFailedException} in the last phase re-queues the job of the failed node.
   *
   * @throws RuntimeException if a message received in the last phase is not a {@code JobResult}
   */
  public void run() {
    // presumably bot.deadline is in minutes (x 60000 -> milliseconds) - TODO confirm
    timeout = (long) (BoTRunner.INITIAL_TIMEOUT_PERCENT * bot.deadline * 60000);
    System.err.println("Timeout is now " + timeout);

    /*first receive requests from all workers*/
    while (hosts.size() != maxWorkers) {
      try {
        ReadMessage rm = masterRP.receive(timeout);

        // The request payload is not used; it is still read before the message is finished.
        rm.readObject();
        IbisIdentifier from = rm.origin().ibisIdentifier();
        rm.finish();

        hosts.put(from.location().toString(), new Host(from));

        String cluster = from.location().getParent().toString();

        /*DEBUG*/
        System.err.println(
            "job request from node "
                + from.location().toString()
                + " in cluster "
                + cluster
                + "; number of hosts is now "
                + hosts.size());

      } catch (IOException e) {
        // Logged only: the loop keeps waiting for the remaining workers.
        e.printStackTrace();
      } catch (ClassNotFoundException e) {
        e.printStackTrace();
      }
    }

    /*then precompute schedule*/
    while (bot.tasks.size() != 0) {
      long mct = Long.MIN_VALUE;
      String bestHost = "";
      Job schedJob = null;
      for (Job j : bot.tasks) {
        // Best (smallest) completion time this job can get on any host.
        long mctj = Long.MAX_VALUE;
        String bestHostJ = "";
        long et = Long.parseLong(j.args[0]);
        for (Host host : hosts.values()) {
          // Hosts whose name contains "slow" are charged 2/3 of the nominal run time (hpdc tests).
          long cost = host.node.contains("slow") ? 2 * et / 3 : et;
          long completion = host.EAT + cost;
          if (mctj > completion) {
            mctj = completion;
            bestHostJ = host.node;
          }
        }
        // Keep the job whose best completion time is the largest seen so far.
        if (mct < mctj) {
          mct = mctj;
          bestHost = bestHostJ;
          schedJob = j;
        }
      }
      Host chosen = hosts.get(bestHost);
      chosen.addJob(schedJob);
      schedJobs.put(schedJob.jobID, schedJob);
      bot.tasks.remove(schedJob);
      System.out.println(
          "Job "
              + schedJob.jobID
              + " with et: "
              + schedJob.args[0]
              + " was scheduled on machine "
              + bestHost
              + "; EAT is now "
              + chosen.EAT);
    }

    // Largest EAT over all hosts; printed as minutes/seconds, so presumably EAT is in seconds.
    long meat = Long.MIN_VALUE;
    for (Host host : hosts.values()) {
      if (host.EAT > meat) {
        meat = host.EAT;
      }
    }
    System.out.println("Longest run should be: " + meat / 60 + "m" + meat % 60 + "s");

    actualStartTime = System.currentTimeMillis();

    /*send first job to each worker*/
    for (Host host : hosts.values()) {
      Job nextJob = handleJobRequest(host.from);

      nextJob.setNode(host.from.location().getLevel(0));

      /*begin for hpdc tests*/
      // `submitted` guards against scaling the same job twice.
      if (!(nextJob instanceof NoJob) && !nextJob.submitted) {
        applySlowClusterScaling(host.from, nextJob);
        nextJob.submitted = true;
      }
      /*end for hpdc tests*/

      try {
        sendJobToWorker(host.from, nextJob);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    boolean undone = true;

    while (undone) {
      try {
        ReadMessage rm = masterRP.receive(timeout);

        Object received = rm.readObject();
        IbisIdentifier from = rm.origin().ibisIdentifier();
        rm.finish();

        if (!(received instanceof JobResult)) {
          throw new RuntimeException("received an object which is not JobResult:" + received);
        }
        Job nextJob = handleJobResult((JobResult) received, from);

        nextJob.setNode(from.location().getLevel(0));

        /*begin for hpdc tests*/
        if (!(nextJob instanceof NoJob)) {
          applySlowClusterScaling(from, nextJob);
        }
        /*end for hpdc tests*/

        sendJobToWorker(from, nextJob);

        undone = !areWeDone();

      } catch (ReceiveTimedOutException rtoe) {
        System.err.println("I timed out!");
        undone = !areWeDone();

      } catch (ConnectionFailedException cfe) {
        String cluster = cfe.ibisIdentifier().location().getParent().toString();
        String node = cfe.ibisIdentifier().location().getLevel(0);

        // Find the job first, mutate schedJobs afterwards. Anchoring the comparison on `node`
        // skips jobs that have no node assigned yet instead of throwing NullPointerException.
        Job orphan = null;
        for (Job candidate : schedJobs.values()) {
          if (node.equals(candidate.getNode())) {
            orphan = candidate;
            break;
          }
        }

        if (orphan != null) {
          schedJobs.remove(orphan.getJobID());
          /*begin hpdc tests*/
          // Undo the 2/3 scaling before the job goes back into the bag.
          // NOTE(review): tests the node name, whereas the scaling is keyed on the cluster name.
          if (orphan.getNode().contains("slow")) {
            orphan.args[0] = Long.toString(3 * Long.parseLong(orphan.args[0]) / 2);
          }
          /*end hpdc tests*/
          bot.tasks.add(orphan);
          workers.get(cluster).get(orphan.getNode()).workerFinished(System.currentTimeMillis());

          System.err.println(
              "Node "
                  + cfe.ibisIdentifier().location().toString()
                  + " failed before receiving job "
                  + orphan.jobID);
        }
      } catch (IOException ioe) {
        ioe.printStackTrace();
        undone = !areWeDone();
      } catch (ClassNotFoundException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Scales the run time stored in {@code job.args[0]} to 2/3 when the worker's parent location is
   * exactly {@code "slow"} (hpdc tests). {@code args[0]} is parsed in every case.
   *
   * @param worker identifier of the worker that will receive the job
   * @param job job whose first argument holds the run time as a decimal string
   * @throws NumberFormatException if {@code job.args[0]} is not a parsable long
   */
  private void applySlowClusterScaling(IbisIdentifier worker, Job job) {
    long sleep = Long.parseLong(job.args[0]);
    if ("slow".equals(worker.location().getParent().toString())) {
      job.args[0] = Long.toString(2 * sleep / 3);
    }
  }

  /**
   * Sends {@code job} to the receive port named {@code "worker"} of {@code destination} over a
   * freshly created send port, and closes that port afterwards.
   *
   * <p>The port is also closed when connecting or writing fails, so that a failed delivery does
   * not leave the port open. A secondary failure of that cleanup close is ignored so the caller
   * still sees the original exception.
   *
   * @param destination worker to send the job to
   * @param job job (or {@code NoJob}) to deliver
   * @throws IOException if creating the port, connecting, writing or the regular close fails
   */
  private void sendJobToWorker(IbisIdentifier destination, Job job) throws IOException {
    SendPort workReplyPort = myIbis.createSendPort(masterReplyPortType);
    boolean delivered = false;
    try {
      workReplyPort.connect(destination, "worker");

      WriteMessage wm = workReplyPort.newMessage();
      wm.writeObject(job);
      wm.finish();
      delivered = true;
    } finally {
      if (!delivered) {
        try {
          workReplyPort.close();
        } catch (IOException ignored) {
          // Best-effort cleanup: the exception already in flight is the one worth reporting.
        }
      }
    }
    workReplyPort.close();
  }