/** * Take action when a backend process fails. * * @param processinfo Process info. * @return Error message. */ private String backendFailed(ProcessInfo processinfo) { // Mark the backend process as failed. processinfo.state = ProcessInfo.State.FAILED; // Tell the Job Scheduler that the backend process failed. try { myJobScheduler.backendFailed(this, processinfo.name); } catch (IOException exc) { } // Set up error message. return "Job backend process failed, node " + processinfo.name + ", rank " + processinfo.rank; }
/** Shut down this Job Frontend. */ private void shutdown() { synchronized (this) { // Stop all lease timers. mySchedulerRenewTimer.stop(); mySchedulerExpireTimer.stop(); for (ProcessInfo processinfo : myProcessInfo) { processinfo.renewTimer.stop(); processinfo.expireTimer.stop(); } // If state is RUNNING but myCancelMessage is not null, it means the // user canceled the job (e.g., by hitting CTRL-C). if (myState == State.RUNNING && myCancelMessage != null) { myState = State.TERMINATE_CANCEL_JOB; } // Inform Job Scheduler and Job Backends. switch (myState) { case RUNNING: // Send "job finished" messages. for (ProcessInfo processinfo : myProcessInfo) { if (processinfo.backend != null) { try { processinfo.backend.jobFinished(this); } catch (IOException exc) { } } } if (myJobScheduler != null) { try { myJobScheduler.jobFinished(this); } catch (IOException exc) { } } break; case TERMINATE_CANCEL_JOB: // Send "cancel job" messages. for (ProcessInfo processinfo : myProcessInfo) { if (processinfo.backend != null && processinfo.state != ProcessInfo.State.FAILED) { try { processinfo.backend.cancelJob(this, myCancelMessage); } catch (IOException exc) { } } } if (myJobScheduler != null) { try { myJobScheduler.cancelJob(this, myCancelMessage); } catch (IOException exc) { } } break; case TERMINATING: // Send nothing. break; } // Record that we are terminating. myState = State.TERMINATING; } // All proxies, channels, and channel groups will close when the process // exits. }
/**
 * React to a timeout of the Job Scheduler's lease renewal timer. If the timer is triggered,
 * this frontend's lease is renewed with the Job Scheduler; otherwise nothing happens.
 *
 * @exception IOException Thrown if an I/O error occurred.
 */
private synchronized void schedulerRenewTimeout() throws IOException {
  // Nothing to do unless the renewal timer is actually triggered.
  if (!mySchedulerRenewTimer.isTriggered()) {
    return;
  }
  myJobScheduler.renewLease(this);
}
/**
 * Construct a new job frontend object. The job frontend object will contact the Job Scheduler
 * Daemon specified by the <TT>"pj.host"</TT> and <TT>"pj.port"</TT> Java system properties. See
 * class {@linkplain benchmarks.detinfer.pj.edu.ritpj.PJProperties} for further information.
 *
 * <p>Construction registers a JVM shutdown hook, starts the lease timer thread, creates the
 * lease timers and per-process info records, connects to the Job Scheduler, and finally
 * requests the job.
 *
 * @param username User name.
 * @param Nn Number of backend nodes (>= 1).
 * @param Np Number of processes (>= 1).
 * @param Nt Number of CPUs per process (>= 0). 0 means "all CPUs."
 * @param hasFrontendComm True if the job has the frontend communicator, false if it doesn't.
 * @param mainClassName Main class name.
 * @param args Command line arguments.
 * @exception JobSchedulerException (subclass of IOException) Thrown if the job frontend object
 *     could not contact the Job Scheduler Daemon.
 * @exception IOException Thrown if an I/O error occurred.
 */
public JobFrontend(
    String username,
    int Nn,
    int Np,
    int Nt,
    boolean hasFrontendComm,
    String mainClassName,
    String[] args)
    throws IOException {
  // Remember the job parameters.
  this.username = username;
  this.Nn = Nn;
  this.Np = Np;
  this.Nt = Nt;
  this.hasFrontendComm = hasFrontendComm;
  this.myMainClassName = mainClassName;
  this.myArgs = args;

  // Register a hook that shuts this frontend down when the JVM exits.
  Thread shutdownHook =
      new Thread() {
        @Override
        public void run() {
          shutdown();
        }
      };
  Runtime.getRuntime().addShutdownHook(shutdownHook);

  // Start the daemon thread that drives every lease timer.
  myLeaseTimerThread = new TimerThread();
  myLeaseTimerThread.setDaemon(true);
  myLeaseTimerThread.start();

  // Job Scheduler lease timers. Anything thrown by a timeout handler is
  // deliberately swallowed inside the timer task.
  TimerTask schedulerRenewTask =
      new TimerTask() {
        public void action(Timer timer) {
          try {
            schedulerRenewTimeout();
          } catch (Throwable ignored) {
          }
        }
      };
  mySchedulerRenewTimer = myLeaseTimerThread.createTimer(schedulerRenewTask);

  TimerTask schedulerExpireTask =
      new TimerTask() {
        public void action(Timer timer) {
          try {
            schedulerExpireTimeout();
          } catch (Throwable ignored) {
          }
        }
      };
  mySchedulerExpireTimer = myLeaseTimerThread.createTimer(schedulerExpireTask);

  // Job timer.
  TimerTask jobTimeoutTask =
      new TimerTask() {
        public void action(Timer timer) {
          try {
            jobTimeout();
          } catch (Throwable ignored) {
          }
        }
      };
  myJobTimer = myLeaseTimerThread.createTimer(jobTimeoutTask);

  // One process info record per job backend process, each with its own
  // renew and expire lease timers bound to that process's rank.
  myProcessInfo = new ProcessInfo[Np];
  for (int index = 0; index < Np; ++index) {
    final int rank = index;
    TimerTask backendRenewTask =
        new TimerTask() {
          public void action(Timer timer) {
            try {
              backendRenewTimeout(rank);
            } catch (Throwable ignored) {
            }
          }
        };
    TimerTask backendExpireTask =
        new TimerTask() {
          public void action(Timer timer) {
            try {
              backendExpireTimeout(rank);
            } catch (Throwable ignored) {
            }
          }
        };
    myProcessInfo[rank] =
        new ProcessInfo(
            /*state            */ ProcessInfo.State.NOT_STARTED,
            /*name             */ null,
            /*rank             */ rank,
            /*backend          */ null,
            /*middlewareAddress*/ null,
            /*worldAddress     */ null,
            /*frontendAddress  */ null,
            /*renewTimer       */ myLeaseTimerThread.createTimer(backendRenewTask),
            /*expireTimer      */ myLeaseTimerThread.createTimer(backendExpireTask),
            /*Nt               */ 0);
  }

  // Middleware channel group and address array (Np + 1 entries).
  myMiddlewareChannelGroup = new ChannelGroup();
  myMiddlewareAddress = new InetSocketAddress[Np + 1];

  // World communicator address array (Np entries).
  myWorldAddress = new InetSocketAddress[Np];

  // Frontend communicator channel group and address array, only when the
  // job has a frontend communicator.
  if (hasFrontendComm) {
    myFrontendChannelGroup = new ChannelGroup();
    myFrontendAddress = new InetSocketAddress[Np + 1];
  }

  // Frontend file writer and reader.
  myFrontendFileWriter = new FrontendFileWriter(this);
  myFrontendFileReader = new FrontendFileReader(this);

  // Connect to the Job Scheduler Daemon and wrap the channel in a proxy.
  // The address starts out null so the error message can still be built
  // if the failure happens before the address is created.
  InetSocketAddress schedulerAddress = null;
  Channel schedulerChannel;
  try {
    schedulerAddress = new InetSocketAddress(PJProperties.getPjHost(), PJProperties.getPjPort());
    schedulerChannel = myMiddlewareChannelGroup.connect(schedulerAddress);
  } catch (IOException exc) {
    throw new JobSchedulerException(
        "JobFrontend(): Cannot contact Job Scheduler Daemon at " + schedulerAddress, exc);
  }
  myJobScheduler = new JobSchedulerProxy(myMiddlewareChannelGroup, schedulerChannel);

  // Start the Job Scheduler lease timers.
  mySchedulerRenewTimer.start(Constants.LEASE_RENEW_INTERVAL, Constants.LEASE_RENEW_INTERVAL);
  mySchedulerExpireTimer.start(Constants.LEASE_EXPIRE_INTERVAL);

  // Kick off the job!
  myJobScheduler.requestJob(this, username, Nn, Np, Nt);
}