@Override public void moveToFinetune() { log.info("Moving to finetune"); isPretrain.set(false); // new phase: resets the counter numBatches.set(0); }
@Override public boolean addJobToCurrent(Job j) throws Exception { IAtomicReference<Job> r = h.getAtomicReference("job-" + j.getWorkerId()); if (r.get() != null || !r.isNull()) { boolean sent = false; while (!sent) { // always update for (String s : workers()) { if (jobFor(s) == null) { log.info( "Redirecting worker " + j.getWorkerId() + " to " + s + " due to work already being allocated"); r = h.getAtomicReference("job-" + s); j.setWorkerId(s); sent = true; } } } } r.set(j); // iterate over jobs without the work/data j.setWork(null); jobs.add(j); return true; }
/**
 * Verifies that every hazelcast structure under test is present and, where a name is
 * checked, that it carries the expected name.
 */
@Test
public void testHazelcastInstances() {
  // Presence checks: maps, collections and messaging structures.
  assertNotNull(map1);
  assertNotNull(map2);
  assertNotNull(multiMap);
  assertNotNull(queue);
  assertNotNull(topic);
  assertNotNull(set);
  assertNotNull(list);

  // Presence checks: executor, id generator and concurrency primitives.
  assertNotNull(executorService);
  assertNotNull(idGenerator);
  assertNotNull(atomicLong);
  assertNotNull(atomicReference);
  assertNotNull(countDownLatch);
  assertNotNull(semaphore);
  assertNotNull(lock);

  // Name checks: maps, collections and messaging structures.
  assertEquals("map1", map1.getName());
  assertEquals("map2", map2.getName());
  assertEquals("testMultimap", multiMap.getName());
  assertEquals("testQ", queue.getName());
  assertEquals("testTopic", topic.getName());
  assertEquals("set", set.getName());
  assertEquals("list", list.getName());

  // Name checks: id generator and concurrency primitives.
  assertEquals("idGenerator", idGenerator.getName());
  assertEquals("atomicLong", atomicLong.getName());
  assertEquals("atomicReference", atomicReference.getName());
  assertEquals("countDownLatch", countDownLatch.getName());
  assertEquals("semaphore", semaphore.getName());
}
/**
 * Clears the job reference of the given worker and removes the matching entry from the job
 * list.
 *
 * <p>Does nothing when {@code id} is {@code null} or when the worker has no job reference set.
 *
 * @param id the id of the worker whose job should be cleared
 * @throws Exception declared by the interface; propagated from the calls made here
 */
@Override
public void clearJob(String id) throws Exception {
  if (id == null) {
    log.warn("No job to clear; was null, returning");
    return;
  }

  IAtomicReference<Job> jRef = h.getAtomicReference("job-" + id);
  if (jRef.isNull()) {
    return;
  }

  jRef.clear();
  log.info("Destroyed job ref " + id);

  Job remove = null;
  for (Job j : jobs) {
    // id is known to be non-null here, so compare from its side; this tolerates a job that
    // has no worker id.
    if (id.equals(j.getWorkerId())) {
      remove = j;
      break;
    }
  }

  // Only touch the list when a matching job was actually found, rather than passing null.
  if (remove != null) {
    jobs.remove(remove);
  }
}
@Override public boolean isDone() { // reason being that isDone() may getFromOrigin called and throw errors // this ensures a safe method call happens and just silently // returns true in case hazelcast is shutdown try { return done.get(); } catch (Exception e) { log.warn("Hazelcast already shutdown...returning true on isDone()"); return true; } }
@Override public void finish() { // reason being that isDone() may getFromOrigin called and throw errors // this ensures a safe method call happens and just silently // returns true in case hazelcast is shutdown try { done.set(true); updateSaver().cleanup(); } catch (Exception e) { log.warn( "Hazelcast already shutdown...done() being applyTransformToDestination is pointless"); } }
/**
 * Builds a training evaluator for the given neural network.
 *
 * <p>The evaluator is configured from the values currently held by this state tracker: best
 * loss, improvement threshold, patience, test set, validation epochs and patience increase.
 *
 * @param network the neural network to evaluate
 * @return a training evaluator based on the configuration of the state tracker and the given
 *     network
 */
@Override
public TrainingEvaluator create(BaseMultiLayerNetwork network) {
  return new OutputLayerTrainingEvaluator.Builder()
      .bestLoss(bestLoss())
      .improvementThreshold(improvementThreshold())
      .patience(patience())
      .testSet(testSet())
      .withNetwork(network)
      .validationEpochs(validationEpochs())
      .patienceIncrease(patienceIncrease.get())
      .build();
}
/**
 * Returns the value currently held by the master reference.
 *
 * @return the stored value, or {@code null} when nothing is stored
 * @throws Exception declared by the interface
 */
@Override
public E getCurrent() throws Exception {
  // A null reference is passed through unchanged, so no separate null branch is needed.
  return (E) master.get();
}
/**
 * Tells whether the cluster has begun training.
 *
 * @return {@code true} if the begun-training flag is set
 */
@Override
public boolean hasBegun() {
  final boolean started = begunTraining.get();
  return started;
}
/**
 * Adds one to the number of times pretraining has run.
 *
 * <p>The counter is read and then written back as two separate operations.
 */
@Override
public void incrementNumTimesPreTrainRan() {
  final int ranSoFar = numTimesPreTrainRun();
  numTimesPretrainRan.set(ranSoFar + 1);
}
/**
 * Returns the configured number of pretrain iterations to run.
 *
 * @return the value last stored via {@link #runPreTrainIterations(int)} or the default
 */
@Override
public int runPreTrainIterations() {
  final int iterations = numTimesPretrain.get();
  return iterations;
}
/**
 * Looks up the job stored for the given worker.
 *
 * @param id the worker id
 * @return the stored job, or {@code null} when no job reference is set for the worker or when
 *     {@code isCurrentlyJob(id)} reports {@code true}
 */
@Override
public Job jobFor(String id) {
  IAtomicReference<Job> ref = h.getAtomicReference("job-" + id);
  // isCurrentlyJob is only consulted when a job reference is actually set.
  boolean retrievable = !ref.isNull() && !isCurrentlyJob(id);
  return retrievable ? ref.get() : null;
}
/**
 * Returns the number of epochs to test on.
 *
 * @return the number of validation epochs currently stored
 */
@Override
public int validationEpochs() {
  final int epochs = validationEpochs.get();
  return epochs;
}
/**
 * Returns the best validation loss seen so far.
 *
 * @return the best validation loss currently stored
 */
@Override
public double bestLoss() {
  final double best = bestLoss.get();
  return best;
}
/**
 * Returns the improvement threshold used for early stopping.
 *
 * @return the improvement threshold currently stored
 */
@Override
public double improvmentThreshold() {
  final double threshold = improvementThreshold.get();
  return threshold;
}
/**
 * Returns the patience value, which controls early stopping.
 *
 * @return the patience for the trainer
 */
@Override
public double patience() {
  final double current = patience.get();
  return current;
}
/**
 * Stores a new improvement threshold.
 *
 * @param improvmentThreshold the improvement threshold to set
 */
@Override
public void setImprovmentThreshold(double improvmentThreshold) {
  // The field is spelled "improvement"; the parameter keeps the interface's spelling.
  this.improvementThreshold.set(improvmentThreshold);
}
/**
 * Tells whether the tracker is currently in the pretrain phase.
 *
 * @return the value of the pretrain flag
 */
@Override
public boolean isPretrain() {
  final boolean pretraining = isPretrain.get();
  return pretraining;
}
/**
 * Returns the current mini batch size.
 *
 * @return the mini batch size currently stored
 */
@Override
public int miniBatchSize() {
  final int size = miniBatchSize.get();
  return size;
}
/**
 * Tells whether to validate against a held out test set and test for validation error.
 *
 * @return the value of the early stop flag
 */
@Override
public boolean isEarlyStopTesting() {
  final boolean enabled = earlyStop.get();
  return enabled;
}
/**
 * Stores the number of pretrain iterations to run.
 *
 * @param numTimes the number of pretrain iterations
 */
@Override
public void runPreTrainIterations(int numTimes) {
  this.numTimesPretrain.set(numTimes);
}
/**
 * The input split to use. This means that each data set that is trained on and loaded will be
 * this batch size or lower per worker.
 *
 * <p>The value is the stored mini batch size. Earlier code multiplied it by the worker count
 * and divided by the worker count again, which yields the same number but fails with an
 * {@link ArithmeticException} when there are no workers and can overflow for large values.
 *
 * @return the input split to use
 */
@Override
public int inputSplit() {
  return miniBatchSize.get();
}
/**
 * Returns how many times pretraining has run so far.
 *
 * @return the pretrain run counter currently stored
 */
@Override
public int numTimesPreTrainRun() {
  final int timesRan = numTimesPretrainRan.get();
  return timesRan;
}
/**
 * Creates a state tracker backed by hazelcast, either hosting a hazelcast instance (master) or
 * connecting to an existing one as a client.
 *
 * <p>When {@code type} is {@code "master"} and the port is reported free, a new hazelcast
 * instance is started from the config returned by {@code hazelcast()} and a membership listener
 * that logs member events is registered. When {@code type} is {@code "master"} and the port is
 * reported taken, construction fails. For any other type a hazelcast client is connected to
 * {@code connectionString}, and construction then blocks until the shared worker list is
 * non-empty.
 *
 * <p>Afterwards the shared lists, maps and atomic references are looked up. Only the master
 * writes default values into them.
 *
 * @param connectionString for a client, the address to connect to; for a master, the literal
 *     {@code "master"} makes the tracker derive {@code host:port} from the local host name
 * @param type {@code "master"} to host the instance; any other value connects as a client
 * @param stateTrackerPort the port checked via {@code PortTaken} and stored as the hazelcast
 *     port in master mode
 * @throws IllegalStateException if {@code type} is {@code "master"} and the port is taken
 * @throws Exception anything else raised during setup, e.g. by the local host lookup or by an
 *     interrupted wait for the worker list
 */
public BaseHazelCastStateTracker(String connectionString, String type, int stateTrackerPort)
    throws Exception {
  log.info(
      "Setting up hazelcast with type "
          + type
          + " connection string "
          + connectionString
          + " and port "
          + stateTrackerPort);
  if (type.equals("master") && !PortTaken.portTaken(stateTrackerPort)) {
    // Build a host:port connection string that external actors can use to reach this master.
    if (connectionString.equals("master")) {
      String host = InetAddress.getLocalHost().getHostName();
      this.connectionString = host + ":" + stateTrackerPort;
    }
    this.hazelCastPort = stateTrackerPort;
    config = hazelcast();
    h = Hazelcast.newHazelcastInstance(config);
    // The listener only logs cluster membership events.
    h.getCluster()
        .addMembershipListener(
            new MembershipListener() {
              @Override
              public void memberAdded(MembershipEvent membershipEvent) {
                log.info("Member added " + membershipEvent.toString());
              }

              @Override
              public void memberRemoved(MembershipEvent membershipEvent) {
                log.info("Member removed " + membershipEvent.toString());
              }

              @Override
              public void memberAttributeChanged(MemberAttributeEvent memberAttributeEvent) {
                log.info("Member changed " + memberAttributeEvent.toString());
              }
            });
    // NOTE(review): the port is probed a second time below rather than reusing the first result.
  } else if (type.equals("master") && PortTaken.portTaken(stateTrackerPort))
    throw new IllegalStateException(
        "Specified type was master and the port specified was taken, please specify a different port");
  else {
    // Not a master: connect to the given address as a hazelcast client.
    setConnectionString(connectionString);
    log.info("Connecting to hazelcast on " + connectionString);
    ClientConfig client = new ClientConfig();
    client.getNetworkConfig().addAddress(connectionString);
    h = HazelcastClient.newHazelcastClient(client);
  }
  this.type = type;
  jobs = h.getList(JOBS);
  workers = h.getList(WORKERS);
  // Non-master nodes wait here, re-checking once per second, until the shared worker list is
  // non-empty. The original note assumes the master node by default comes with a set of
  // workers - TODO confirm. There is no timeout on this wait.
  if (!this.type.equals("master")) {
    while (workers.isEmpty()) {
      log.warn("Waiting for data sync...");
      Thread.sleep(1000);
    }
    log.info("Workers is " + workers.size());
  }
  // Look up the shared structures by name; this happens for master and client alike.
  begunTraining = h.getAtomicReference(BEGUN);
  miniBatchSize = h.getAtomicReference(INPUT_SPLIT);
  workerEnabled = h.getMap(WORKER_ENABLED);
  replicate = h.getList(REPLICATE_WEIGHTS);
  topics = h.getList(TOPICS);
  updates = h.getList(UPDATES);
  heartbeat = h.getMap(HEART_BEAT);
  master = h.getAtomicReference(RESULT);
  isPretrain = h.getAtomicReference(IS_PRETRAIN);
  numTimesPretrain = h.getAtomicReference(NUM_TIMES_RUN_PRETRAIN);
  numTimesPretrainRan = h.getAtomicReference(NUM_TIMES_PRETRAIN_RAN);
  done = h.getAtomicReference(DONE);
  validationEpochs = h.getAtomicReference(VALIDATION_EPOCHS);
  improvementThreshold = h.getAtomicReference(IMPROVEMENT_THRESHOLD);
  bestLoss = h.getAtomicReference(BEST_LOSS);
  earlyStop = h.getAtomicReference(EARLY_STOP);
  patience = h.getAtomicReference(PATIENCE);
  patienceIncrease = h.getAtomicReference(PATIENCE_INCREASE);
  numBatches = h.getAtomicReference(NUM_BATCHES_SO_FAR_RAN);
  // Set defaults only when master; a client writing them would override the previous values.
  if (type.equals("master")) {
    begunTraining.set(false);
    saver = createUpdateSaver();
    numTimesPretrainRan.set(0);
    numTimesPretrain.set(1);
    isPretrain.set(true);
    done.set(false);
    resource = new StateTrackerDropWizardResource(this);
    bestLoss.set(Double.POSITIVE_INFINITY);
    earlyStop.set(true);
    patience.set(40.0);
    patienceIncrease.set(2.0);
    improvementThreshold.set(0.995);
    // Validation epochs default to min(10, patience / 2), using the patience set just above.
    validationEpochs.set((int) Math.min(10, patience() / 2));
    numBatches.set(0);
  }
}
/**
 * Adds to the number of batches ran. This is purely a count and does not necessarily mean
 * progress.
 *
 * <p>The counter is read and then written back as two separate operations.
 *
 * @param numBatchesRan the number of batches ran to increment by
 */
@Override
public void incrementBatchesRan(int numBatchesRan) {
  final int updatedTotal = numBatchesRan + numBatches.get();
  numBatches.set(updatedTotal);
}
/**
 * Overwrites the job reference of the job's worker with the given job. The method does not
 * check whether a job was stored before.
 *
 * @param j the job to store; its worker id selects the reference that is written
 */
@Override
public void updateJob(Job j) {
  final String refName = "job-" + j.getWorkerId();
  IAtomicReference<Job> target = h.getAtomicReference(refName);
  target.set(j);
}
/**
 * Returns the number of batches ran so far.
 *
 * @return the batch counter currently stored
 */
@Override
public int numBatchesRan() {
  final int ranSoFar = numBatches.get();
  return ranSoFar;
}
/**
 * Marks training as begun by setting the begun-training flag to {@code true}.
 */
@Override
public void beginTraining() {
  this.begunTraining.set(true);
}