/** * Take action when the job timer times out. * * @exception IOException Thrown if an I/O error occurred. */ private void jobTimeout() throws IOException { boolean doExit = false; synchronized (this) { if (myJobTimer.isTriggered()) { continueRun = false; if (myState == State.RUNNING) { myState = State.TERMINATE_CANCEL_JOB; myCancelMessage = "Job exceeded maximum running time"; System.err.println(myCancelMessage); doExit = true; } } } // Cannot hold the synchronization lock while calling System.exit(), // otherwise a deadlock can occur between this thread (the timer thread) // and the shutdown hook thread. if (doExit) System.exit(1); }
/** Shut down this Job Frontend. */ private void shutdown() { synchronized (this) { // Stop all lease timers. mySchedulerRenewTimer.stop(); mySchedulerExpireTimer.stop(); for (ProcessInfo processinfo : myProcessInfo) { processinfo.renewTimer.stop(); processinfo.expireTimer.stop(); } // If state is RUNNING but myCancelMessage is not null, it means the // user canceled the job (e.g., by hitting CTRL-C). if (myState == State.RUNNING && myCancelMessage != null) { myState = State.TERMINATE_CANCEL_JOB; } // Inform Job Scheduler and Job Backends. switch (myState) { case RUNNING: // Send "job finished" messages. for (ProcessInfo processinfo : myProcessInfo) { if (processinfo.backend != null) { try { processinfo.backend.jobFinished(this); } catch (IOException exc) { } } } if (myJobScheduler != null) { try { myJobScheduler.jobFinished(this); } catch (IOException exc) { } } break; case TERMINATE_CANCEL_JOB: // Send "cancel job" messages. for (ProcessInfo processinfo : myProcessInfo) { if (processinfo.backend != null && processinfo.state != ProcessInfo.State.FAILED) { try { processinfo.backend.cancelJob(this, myCancelMessage); } catch (IOException exc) { } } } if (myJobScheduler != null) { try { myJobScheduler.cancelJob(this, myCancelMessage); } catch (IOException exc) { } } break; case TERMINATING: // Send nothing. break; } // Record that we are terminating. myState = State.TERMINATING; } // All proxies, channels, and channel groups will close when the process // exits. }
/**
 * Handles expiry of the Job Scheduler lease renewal timer by asking the Job Scheduler to renew
 * the lease. Does nothing if the timer has not actually triggered.
 *
 * @exception IOException Thrown if an I/O error occurred.
 */
private synchronized void schedulerRenewTimeout() throws IOException {
  if (!mySchedulerRenewTimer.isTriggered()) {
    return;
  }
  myJobScheduler.renewLease(this);
}
/** * Report that a backend process is ready to commence executing the job. * * @param theJobBackend Job Backend that is calling this method. * @param rank Rank of the job backend process. * @param middlewareAddress Host/port to which the job backend process is listening for middleware * messages. * @param worldAddress Host/port to which the job backend process is listening for the world * communicator. * @param frontendAddress Host/port to which the job backend process is listening for the frontend * communicator, or null if the frontend communicator does not exist. * @exception IOException Thrown if an I/O error occurred. */ public synchronized void backendReady( JobBackendRef theJobBackend, int rank, InetSocketAddress middlewareAddress, InetSocketAddress worldAddress, InetSocketAddress frontendAddress) throws IOException { // Verify that rank is in range. if (0 > rank || rank >= Np) { terminateCancelJob("Illegal \"backend ready\" message, rank=" + rank); } // Verify that this backend has not started already. ProcessInfo processinfo = myProcessInfo[rank]; if (processinfo.state != ProcessInfo.State.NOT_STARTED) { terminateCancelJob("Unexpected \"backend ready\" message, rank=" + rank); } // Record information in job backend process info record. processinfo.state = ProcessInfo.State.RUNNING; processinfo.backend = theJobBackend; processinfo.middlewareAddress = middlewareAddress; processinfo.worldAddress = worldAddress; processinfo.frontendAddress = frontendAddress; myProcessMap.put(theJobBackend, processinfo); // Record channel group addresses. myMiddlewareAddress[rank] = middlewareAddress; myWorldAddress[rank] = worldAddress; if (hasFrontendComm) { myFrontendAddress[rank] = frontendAddress; } // Increase count of running processes. ++myRunningCount; // If all job backend processes have reported ready, commence job. if (myRunningCount == Np) { // Start job timer if necessary. 
int jobtime = PJProperties.getPjJobTime(); if (jobtime > 0) { myJobTimer.start(jobtime * 1000L); } // Get the system properties. Properties props = System.getProperties(); // Send "commence job" message to each job backend, with system // property "pj.nt" set to the proper number of CPUs. for (ProcessInfo info : myProcessMap.values()) { props.setProperty("pj.nt", "" + info.Nt); info.backend.commenceJob( /*theJobFrontend */ this, /*middlewareAddress*/ myMiddlewareAddress, /*worldAddress */ myWorldAddress, /*frontendAddress */ myFrontendAddress, /*properties */ props, /*mainClassName */ myMainClassName, /*args */ myArgs); } } }
/**
 * Renews the lease on the job by restarting the Job Scheduler lease expiration timer with the
 * standard lease expiration interval.
 *
 * @param theJobScheduler Job Scheduler that is calling this method (not otherwise used here).
 * @exception IOException Thrown if an I/O error occurred.
 */
public synchronized void renewLease(JobSchedulerRef theJobScheduler) throws IOException {
  // Restarting the expire timer pushes the lease deadline out again.
  mySchedulerExpireTimer.start(Constants.LEASE_EXPIRE_INTERVAL);
}
/**
 * Creates a job frontend and submits a job request to the Job Scheduler Daemon. The daemon is
 * located through the <TT>"pj.host"</TT> and <TT>"pj.port"</TT> Java system properties; see class
 * {@linkplain benchmarks.detinfer.pj.edu.ritpj.PJProperties} for further information.
 *
 * <p>Construction registers a JVM shutdown hook that shuts this frontend down, starts a daemon
 * lease timer thread, creates the scheduler, job, and per-backend timers, connects to the Job
 * Scheduler, starts the scheduler lease timers, and finally requests the job.
 *
 * @param username User name.
 * @param Nn Number of backend nodes (&gt;= 1).
 * @param Np Number of processes (&gt;= 1).
 * @param Nt Number of CPUs per process (&gt;= 0). 0 means "all CPUs."
 * @param hasFrontendComm True if the job has the frontend communicator, false if it doesn't.
 * @param mainClassName Main class name.
 * @param args Command line arguments.
 * @exception JobSchedulerException (subclass of IOException) Thrown if the job frontend object
 *     could not contact the Job Scheduler Daemon.
 * @exception IOException Thrown if an I/O error occurred.
 */
public JobFrontend(
    String username,
    int Nn,
    int Np,
    int Nt,
    boolean hasFrontendComm,
    String mainClassName,
    String[] args)
    throws IOException {
  // Remember the constructor arguments.
  this.username = username;
  this.Nn = Nn;
  this.Np = Np;
  this.Nt = Nt;
  this.hasFrontendComm = hasFrontendComm;
  this.myMainClassName = mainClassName;
  this.myArgs = args;

  // Register the shutdown hook.
  Thread shutdownHook =
      new Thread() {
        @Override
        public void run() {
          shutdown();
        }
      };
  Runtime.getRuntime().addShutdownHook(shutdownHook);

  // Create and start the (daemon) lease timer thread.
  myLeaseTimerThread = new TimerThread();
  myLeaseTimerThread.setDaemon(true);
  myLeaseTimerThread.start();

  // Create the Job Scheduler lease timers. Anything thrown by a timeout
  // handler is discarded inside the timer task.
  mySchedulerRenewTimer =
      myLeaseTimerThread.createTimer(
          new TimerTask() {
            public void action(Timer timer) {
              try {
                schedulerRenewTimeout();
              } catch (Throwable ignored) {
              }
            }
          });
  mySchedulerExpireTimer =
      myLeaseTimerThread.createTimer(
          new TimerTask() {
            public void action(Timer timer) {
              try {
                schedulerExpireTimeout();
              } catch (Throwable ignored) {
              }
            }
          });

  // Create the job timer.
  myJobTimer =
      myLeaseTimerThread.createTimer(
          new TimerTask() {
            public void action(Timer timer) {
              try {
                jobTimeout();
              } catch (Throwable ignored) {
              }
            }
          });

  // Create one process info record per job backend, each in the
  // NOT_STARTED state with its own renew and expire timers.
  myProcessInfo = new ProcessInfo[Np];
  for (int index = 0; index < Np; ++index) {
    // The anonymous timer tasks need a final copy of the rank.
    final int rank = index;
    myProcessInfo[index] =
        new ProcessInfo(
            /*state            */ ProcessInfo.State.NOT_STARTED,
            /*name             */ null,
            /*rank             */ rank,
            /*backend          */ null,
            /*middlewareAddress*/ null,
            /*worldAddress     */ null,
            /*frontendAddress  */ null,
            /*renewTimer       */ myLeaseTimerThread.createTimer(
                new TimerTask() {
                  public void action(Timer timer) {
                    try {
                      backendRenewTimeout(rank);
                    } catch (Throwable ignored) {
                    }
                  }
                }),
            /*expireTimer      */ myLeaseTimerThread.createTimer(
                new TimerTask() {
                  public void action(Timer timer) {
                    try {
                      backendExpireTimeout(rank);
                    } catch (Throwable ignored) {
                    }
                  }
                }),
            /*Nt               */ 0);
  }

  // Middleware channel group and its address array (Np + 1 slots).
  myMiddlewareChannelGroup = new ChannelGroup();
  myMiddlewareAddress = new InetSocketAddress[Np + 1];

  // World communicator address array (Np slots).
  myWorldAddress = new InetSocketAddress[Np];

  // Frontend communicator channel group and address array, only when the
  // job has a frontend communicator.
  if (hasFrontendComm) {
    myFrontendChannelGroup = new ChannelGroup();
    myFrontendAddress = new InetSocketAddress[Np + 1];
  }

  // Frontend file writer and reader.
  myFrontendFileWriter = new FrontendFileWriter(this);
  myFrontendFileReader = new FrontendFileReader(this);

  // Connect to the Job Scheduler Daemon and wrap the channel in a proxy.
  // The address is declared outside the try block so that it can appear
  // in the failure message.
  InetSocketAddress schedulerAddress = null;
  Channel schedulerChannel = null;
  try {
    schedulerAddress =
        new InetSocketAddress(PJProperties.getPjHost(), PJProperties.getPjPort());
    schedulerChannel = myMiddlewareChannelGroup.connect(schedulerAddress);
  } catch (IOException cause) {
    throw new JobSchedulerException(
        "JobFrontend(): Cannot contact Job Scheduler Daemon at " + schedulerAddress, cause);
  }
  myJobScheduler = new JobSchedulerProxy(myMiddlewareChannelGroup, schedulerChannel);

  // Start the Job Scheduler lease timers.
  mySchedulerRenewTimer.start(Constants.LEASE_RENEW_INTERVAL, Constants.LEASE_RENEW_INTERVAL);
  mySchedulerExpireTimer.start(Constants.LEASE_EXPIRE_INTERVAL);

  // Kick off the job!
  myJobScheduler.requestJob(this, username, Nn, Np, Nt);
}