/**
 * Inserts the classifier in the population. First, it checks whether there is a classifier in
 * the population with the same action and condition (in that case, it increments its
 * numerosity). Then it checks that the number of micro classifiers is less than the maximum
 * population size. If it isn't, it deletes one classifier from the population by calling the
 * deleteClassifier function. It inserts the classifier in the population and in the action set
 * if it's not null.
 *
 * @param cl is the classifier that has to be inserted in the population.
 * @param ASet Population where the classifier will be inserted.
 */
public void insertInPopulation(Classifier cl, Population ASet) {
  boolean found = false;
  int i = 0;
  while (i < macroClSum && !found) {
    if (set[i].equals(cl)) {
      set[i].increaseNumerosity(cl.getNumerosity());
      microClSum += cl.getNumerosity();
      if (ASet != null) {
        if (ASet.isThereClassifier(set[i]) >= 0) ASet.microClSum += cl.getNumerosity();
      }
      found = true;
    }
    i++;
  }
  if (!found) {
    addClassifier(cl);
  }
  // Here, the classifier has been added to the population
  if (microClSum > Config.popSize) {
    // If we have inserted too many classifiers, we have to delete one.
    deleteClFromPopulation(ASet);
  }
} // end insertInPopulation
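// A minimal usage sketch, not from the original source: the Population and Classifier
// constructors shown here are hypothetical. Inserting a classifier whose condition and
// action already exist should grow microClSum (total numerosity) while macroClSum
// (distinct rules) stays unchanged.
static void insertInPopulationDemo() {
  Population pop = new Population(Config.popSize); // hypothetical constructor
  Classifier c1 = new Classifier("01##", 1); // hypothetical constructor: condition, action
  Classifier c2 = new Classifier("01##", 1); // same condition and action as c1
  pop.insertInPopulation(c1, null);
  pop.insertInPopulation(c2, null); // merged into c1: numerosity 2, still one macroclassifier
}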
@Override
public Map<LookupElement, StringBuilder> getRelevanceStrings() {
  final LinkedHashMap<LookupElement, StringBuilder> map =
      new LinkedHashMap<LookupElement, StringBuilder>();
  for (LookupElement item : myItems) {
    map.put(item, new StringBuilder());
  }
  final MultiMap<CompletionSorterImpl, LookupElement> inputBySorter =
      groupItemsBySorter(new ArrayList<LookupElement>(map.keySet()));
  if (inputBySorter.size() > 1) {
    for (LookupElement element : map.keySet()) {
      map.get(element).append(obtainSorter(element)).append(": ");
    }
  }
  for (CompletionSorterImpl sorter : inputBySorter.keySet()) {
    final LinkedHashMap<LookupElement, StringBuilder> subMap =
        new LinkedHashMap<LookupElement, StringBuilder>();
    for (LookupElement element : inputBySorter.get(sorter)) {
      subMap.put(element, map.get(element));
    }
    Classifier<LookupElement> classifier = myClassifiers.get(sorter);
    if (classifier != null) {
      classifier.describeItems(subMap, createContext(false));
    }
  }
  return map;
}
/**
 * Inserts the classifier into the population. First, it looks for a classifier in the
 * population that can subsume the new one (in that case, it increments the subsumer's
 * numerosity) or that is equal to it. Otherwise, the classifier itself is added. Note that,
 * unlike insertInPopulation, this method performs no deletion even if the maximum population
 * size is exceeded.
 *
 * @param cl is the classifier that has to be inserted in the population.
 * @param ASet Population where the classifier will be inserted.
 */
public void insertInPSubsumingCl(Classifier cl, Population ASet) {
  int i = 0;
  Classifier bestSubsumer = null;
  Classifier equalClassifier = null;
  // We look for the best subsumer or for an equal classifier.
  while (i < macroClSum) {
    if (set[i].couldSubsume() && set[i].isMoreGeneral(cl)) {
      if (bestSubsumer == null) bestSubsumer = set[i];
      else if (set[i].isMoreGeneral(bestSubsumer)) bestSubsumer = set[i];
    }
    if (set[i].equals(cl)) equalClassifier = set[i];
    i++;
  }
  // If there is a subsumer, its numerosity is increased.
  if (bestSubsumer != null) {
    bestSubsumer.increaseNumerosity(cl.getNumerosity());
    microClSum += cl.getNumerosity();
  } else if (equalClassifier != null) {
    equalClassifier.increaseNumerosity(cl.getNumerosity());
    microClSum += cl.getNumerosity();
  } else {
    addClassifier(cl);
  }
  // No classifier is deleted here, even if the maximum population size has been exceeded.
} // end insertInPSubsumingCl
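// A companion sketch (same hypothetical constructors as above), assuming the general rule
// already meets the couldSubsume() experience/accuracy criteria: a strictly more general
// condition absorbs the new classifier instead of adding a duplicate macroclassifier.
static void insertSubsumingDemo() {
  Population pop = new Population(Config.popSize); // hypothetical constructor
  pop.insertInPSubsumingCl(new Classifier("0###", 1), null); // general rule
  pop.insertInPSubsumingCl(new Classifier("01##", 1), null); // subsumed: "0###" numerosity -> 2
}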
/**
 * Calculates the class membership probabilities for the given test instance.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @exception Exception if distribution can't be computed successfully
 */
public double[] distributionForInstance(Instance instance) throws Exception {
  if (instance.classAttribute().isNumeric()) {
    throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
  }
  double[] sums = new double[instance.numClasses()], newProbs;
  Classifier curr;
  for (int i = 0; i < m_Committee.size(); i++) {
    curr = (Classifier) m_Committee.get(i);
    newProbs = curr.distributionForInstance(instance);
    for (int j = 0; j < newProbs.length; j++) sums[j] += newProbs[j];
  }
  if (Utils.eq(Utils.sum(sums), 0)) {
    return sums;
  } else {
    Utils.normalize(sums);
    return sums;
  }
}
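// A small illustration, not from the original source, of how the committee vote is
// combined: the per-member distributions are summed, then normalized to sum to 1 with
// the same weka.core.Utils helper the method above uses.
static void voteCombinationDemo() {
  double[] sums = {0.9 + 0.6, 0.1 + 0.4}; // two members voting over two classes
  weka.core.Utils.normalize(sums); // sums becomes {0.75, 0.25}
}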
/**
 * Deletes one classifier from the population. After that, if the population passed as a
 * parameter is not null, it looks for the deleted classifier there. If it is in the second
 * population, it will be deleted from it too.
 *
 * @param aSet is the population where the deleted classifier has to be searched.
 * @return a Classifier that contains the deleted classifier.
 */
public Classifier deleteClFromPopulation(Population aSet) {
  // A classifier has been deleted from the population
  Classifier clDeleted = deleteClassifier();
  if (aSet != null) {
    // Now, this classifier has to be deleted from the action set (if it exists there).
    int pos = aSet.isThereClassifier(clDeleted); // It is searched in the action set.
    if (pos >= 0) { // It has to be deleted from the action set too.
      aSet.microClSum--;
      // If the classifier has 0 numerosity, we remove it completely from the action set.
      if (clDeleted.getNumerosity() == 0) {
        aSet.macroClSum--; // Decrements the number of macroclassifiers
        aSet.set[pos] = aSet.set[aSet.macroClSum]; // Moves the last classifier into the hole
        aSet.set[aSet.macroClSum] = null; // Sets the vacated last slot to null.
      }
    }
  }
  return clDeleted;
} // end deleteClFromPopulation
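// The removal above relies on the classic unordered-array delete: overwrite the vacated
// slot with the last element and shrink the logical size. A standalone sketch of the
// idiom (names are illustrative, not from the original source):
static <T> int removeAt(T[] arr, int size, int pos) {
  arr[pos] = arr[size - 1]; // move the last element into the hole: O(1), order not preserved
  arr[size - 1] = null; // clear the vacated slot
  return size - 1; // the caller keeps the new logical size
}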
/** This method applies the action set subsumption */
public void doActionSetSubsumption() {
  int i, pos = 0;
  Classifier cl = null;
  for (i = 0; i < macroClSum; i++) {
    if (set[i].couldSubsume()) {
      if (cl == null
          || set[i].numberOfDontCareSymbols() > cl.numberOfDontCareSymbols()
          || (set[i].numberOfDontCareSymbols() == cl.numberOfDontCareSymbols()
              && Config.rand() < 0.5)) {
        cl = set[i];
        pos = i;
      }
    }
  }
  if (cl != null) {
    for (i = 0; i < macroClSum; i++) {
      if (cl != set[i] && cl.isMoreGeneral(set[i])) {
        cl.increaseNumerosity(set[i].getNumerosity());
        // Now, the classifier has to be removed from the action set and the population.
        // It's deleted from the action set.
        Classifier clDeleted = set[i];
        deleteClassifier(i);
        // And now, it's deleted from the population
        Population p = parentRef;
        while (p.parentRef != null) {
          p = p.parentRef;
        }
        // The classifier is searched in the initial population.
        pos = p.isThereClassifier(clDeleted);
        if (pos >= 0) p.deleteClassifier(pos);
      }
    }
  }
} // end doActionSetSubsumption
@Override
public void addElement(
    Lookup lookup, LookupElement element, LookupElementPresentation presentation) {
  StatisticsWeigher.clearBaseStatisticsInfo(element);
  final String invariant =
      presentation.getItemText()
          + "###"
          + getTailTextOrSpace(presentation)
          + "###"
          + presentation.getTypeText();
  element.putUserData(PRESENTATION_INVARIANT, invariant);
  CompletionSorterImpl sorter = obtainSorter(element);
  Classifier<LookupElement> classifier = myClassifiers.get(sorter);
  if (classifier == null) {
    myClassifiers.put(sorter, classifier = sorter.buildClassifier(new AlphaClassifier(lookup)));
  }
  classifier.addElement(element);
  super.addElement(lookup, element, presentation);
}
/**
 * Returns whether the classifier of the class subsumes the classifier passed as a parameter.
 *
 * @param cl is the classifier that may be subsumed.
 * @return a boolean indicating if it subsumes
 */
public boolean doesSubsume(Classifier cl) {
  int i;
  // First, check that the action is the same
  if (action != cl.getAction()) return false;
  // Then, check that this classifier is more general
  if (parameters.couldSubsume()) {
    for (i = 0; i < rep.length; i++) {
      if (!rep[i].isMoreGeneral(cl.rep[i])) return false;
    }
    return true;
  }
  return false;
} // end doesSubsume
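// A minimal sketch of the per-attribute generality test for a ternary representation
// (hypothetical helper, not from the original source): '#' (don't care) covers any value,
// while a specific symbol covers only itself. doesSubsume then requires every attribute
// of the subsumer to be at least as general as the candidate's.
static boolean isMoreGeneralTernary(char mine, char other) {
  return mine == '#' || mine == other;
}
// isMoreGeneralTernary('#', '0') -> true; isMoreGeneralTernary('0', '#') -> false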
/**
 * Adds a classifier to the population.
 *
 * @param cl is the new classifier to be added.
 */
public void addClassifier(Classifier cl) {
  try {
    set[macroClSum] = cl;
    microClSum += cl.getNumerosity();
    macroClSum++;
  } catch (Exception e) {
    System.out.println(
        "Exception in the insertion of a new classifier. The macroClSum is: "
            + macroClSum
            + " and the microClSum: "
            + microClSum);
    System.out.println(
        "And the maximum number of classifiers in the population is: " + Config.popSize);
    e.printStackTrace();
  }
} // end addClassifier
/**
 * Returns whether the classifier of the class is equal to the classifier given as a parameter.
 *
 * @param cl is a classifier.
 * @return a boolean indicating whether they are equal.
 */
public boolean equals(Classifier cl) {
  int i;
  try {
    // Checking the action
    if (action != cl.getAction()) return false;
    // Checking the condition
    for (i = 0; i < rep.length; i++) {
      if (!rep[i].equals(cl.rep[i])) return false;
    }
    return true;
  } catch (Exception e) {
    return false;
  }
} // end equals
/**
 * Applies crossover. It generates two children.
 *
 * @param parent1 is the first parent.
 * @param parent2 is the second parent.
 * @param child1 is the first child.
 * @param child2 is the second child.
 */
public void makeCrossover(
    Classifier parent1, Classifier parent2, Classifier child1, Classifier child2) {
  int i = 0;
  // cross1 is a number in [0 .. clLength-1]
  int cross1 = (int) (Config.rand() * (double) Config.clLength);
  // cross2 is a number in [1 .. clLength].
  int cross2 = (int) (Config.rand() * (double) Config.clLength) + 1;
  if (cross1 > cross2) {
    int aux = cross2;
    cross2 = cross1;
    cross1 = aux;
    // In the else-if branch it is not necessary to check that (cross2 < clLength) before
    // incrementing the point, because cross1 is in [0 .. clLength-1].
  } else if (cross1 == cross2) cross2++;
  // All the intervals (real representation) or genes (ternary representation) that
  // are not at the cross points are crossed.
  if (!Config.ternaryRep) {
    for (i = cross1 + 1; i < cross2 - 1; i++) {
      child2.setAllele(i, parent1);
      child1.setAllele(i, parent2);
    }
    // Now we have to cross the border alleles
    child1.crossAllele(cross1, parent1, parent2);
    child1.crossAllele(cross2 - 1, parent2, parent1);
    child2.crossAllele(cross1, parent2, parent1);
    child2.crossAllele(cross2 - 1, parent1, parent2);
  } else {
    for (i = cross1; i < cross2 - 1; i++) {
      child2.setAllele(i, parent1);
      child1.setAllele(i, parent2);
    }
  }
} // end makeCrossover
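// A self-contained sketch of the same two-point crossover on ternary condition strings
// (illustrative names, not the Classifier API above): the segment [cut1, cut2) is
// exchanged between the parents to produce the two children.
static char[][] twoPointCrossover(char[] p1, char[] p2, int cut1, int cut2) {
  char[] c1 = p1.clone(), c2 = p2.clone();
  for (int i = cut1; i < cut2; i++) { // swap the middle segment
    c1[i] = p2[i];
    c2[i] = p1[i];
  }
  return new char[][] {c1, c2};
}
// twoPointCrossover("000000".toCharArray(), "111111".toCharArray(), 2, 4)
// -> children "001100" and "110011"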
/**
 * Build Decorate classifier
 *
 * @param data the training data to be used for generating the classifier
 * @exception Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {
  if (m_Classifier == null) {
    throw new Exception("A base classifier has not been specified!");
  }
  if (data.checkForStringAttributes()) {
    throw new UnsupportedAttributeTypeException("Cannot handle string attributes!");
  }
  if (data.classAttribute().isNumeric()) {
    throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
  }
  if (m_NumIterations < m_DesiredSize)
    throw new Exception("Max number of iterations must be >= desired ensemble size!");

  // initialize random number generator
  if (m_Seed == -1) m_Random = new Random();
  else m_Random = new Random(m_Seed);

  int i = 1; // current committee size
  int numTrials = 1; // number of Decorate iterations
  Instances divData = new Instances(data); // local copy of data - diversity data
  divData.deleteWithMissingClass();
  Instances artData = null; // artificial data

  // compute number of artificial instances to add at each iteration
  int artSize = (int) (Math.abs(m_ArtSize) * divData.numInstances());
  if (artSize == 0) artSize = 1; // add at least one random example
  computeStats(data); // compute training-data stats for creating artificial examples

  // initialize new committee
  m_Committee = new Vector();
  Classifier newClassifier = m_Classifier;
  newClassifier.buildClassifier(divData);
  m_Committee.add(newClassifier);
  double eComm = computeError(divData); // compute ensemble error
  if (m_Debug)
    System.out.println(
        "Initialize:\tClassifier " + i + " added to ensemble. Ensemble error = " + eComm);

  // repeat until the desired committee size is reached OR the max number of iterations is
  // exceeded
  while (i < m_DesiredSize && numTrials < m_NumIterations) {
    // Generate artificial training examples
    artData = generateArtificialData(artSize, data);

    // Label artificial examples
    labelData(artData);
    addInstances(divData, artData); // Add new artificial data

    // Build new classifier
    Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1);
    newClassifier = tmp[0];
    newClassifier.buildClassifier(divData);

    // Remove all the artificial data
    removeInstances(divData, artSize);

    // Test if the new classifier should be added to the ensemble
    m_Committee.add(newClassifier); // add new classifier to current committee
    double currError = computeError(divData);
    if (currError <= eComm) { // adding the new member did not increase the error
      i++;
      eComm = currError;
      if (m_Debug)
        System.out.println(
            "Iteration: "
                + (1 + numTrials)
                + "\tClassifier "
                + i
                + " added to ensemble. Ensemble error = "
                + eComm);
    } else { // reject the current classifier because it increased the ensemble error
      m_Committee.removeElementAt(m_Committee.size() - 1); // pop the last member
    }
    numTrials++;
  }
}
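// A hedged usage sketch: Decorate and J48 follow the Weka 3 class names this snippet
// appears to come from (weka.classifiers.meta.Decorate, weka.classifiers.trees.J48);
// the data path and setter names are assumptions and may differ across Weka versions.
static void decorateDemo() throws Exception {
  weka.core.Instances data =
      new weka.core.Instances(new java.io.FileReader("iris.arff")); // hypothetical path
  data.setClassIndex(data.numAttributes() - 1);
  Decorate decorate = new Decorate();
  decorate.setClassifier(new weka.classifiers.trees.J48()); // base learner for the committee
  decorate.setDesiredSize(15); // desired ensemble size (m_DesiredSize)
  decorate.setNumIterations(50); // must be >= the desired size (m_NumIterations)
  decorate.buildClassifier(data);
  double[] dist = decorate.distributionForInstance(data.instance(0));
  System.out.println("P(class 0) = " + dist[0]);
}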
/**
 * The core implementation of the search.
 *
 * @param root The root word to search from. Traditionally, this is the root of the sentence.
 * @param candidateFragments The callback for the resulting sentence fragments. This is a
 *     predicate of a triple of values. The return value of the predicate determines whether we
 *     should continue searching. The triple is a triple of
 *     <ol>
 *       <li>The log probability of the sentence fragment, according to the featurizer and the
 *           weights
 *       <li>The features along the path to this fragment. The last element of this is the
 *           features from the most recent step.
 *       <li>The sentence fragment. Because it is relatively expensive to compute the resulting
 *           tree, this is returned as a lazy {@link Supplier}.
 *     </ol>
 *
 * @param classifier The classifier for whether an arc should be on the path to a clause split, a
 *     clause split itself, or neither.
 * @param hardCodedSplits A map from dependency relations that should always be split to the
 *     forced order of actions to apply to them.
 * @param featurizer The featurizer to use. Make sure this matches the weights!
 * @param actionSpace The action space we are allowed to take. Each action defines a means of
 *     splitting a clause on a dependency boundary.
 * @param maxTicks The maximum number of search ticks before the search gives up.
 */
protected void search(
    // The root to search from
    IndexedWord root,
    // The output specs
    final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
        candidateFragments,
    // The learning specs
    final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
    Map<String, ? extends List<String>> hardCodedSplits,
    final Function<Triple<State, Action, State>, Counter<String>> featurizer,
    final Collection<Action> actionSpace,
    final int maxTicks) {
  // (the fringe)
  PriorityQueue<Pair<State, List<Counter<String>>>> fringe = new FixedPrioritiesPriorityQueue<>();
  // (avoid duplicate work)
  Set<IndexedWord> seenWords = new HashSet<>();

  // First state is implicitly "done"
  State firstState = new State(null, null, -9000, null, x -> {}, true);
  fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
  int ticks = 0;

  while (!fringe.isEmpty()) {
    if (++ticks > maxTicks) {
      // System.err.println("WARNING! Timed out on search with " + ticks + " ticks");
      return;
    }

    // Useful variables
    double logProbSoFar = fringe.getPriority();
    assert logProbSoFar <= 0.0;
    Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
    State lastState = lastStatePair.first;
    List<Counter<String>> featuresSoFar = lastStatePair.second;
    IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

    // Register thunk
    if (lastState.isDone) {
      if (!candidateFragments.test(
          Triple.makeTriple(
              logProbSoFar,
              featuresSoFar,
              () -> {
                SemanticGraph copy = new SemanticGraph(tree);
                lastState
                    .thunk
                    .andThen(
                        x -> {
                          // Add the extra edges back in, if they don't break the tree-ness of
                          // the extraction
                          for (IndexedWord newTreeRoot : x.getRoots()) {
                            if (newTreeRoot != null) { // what a strange thing to have happen...
                              for (SemanticGraphEdge extraEdge :
                                  extraEdgesByGovernor.get(newTreeRoot)) {
                                assert Util.isTree(x);
                                //noinspection unchecked
                                addSubtree(
                                    x,
                                    newTreeRoot,
                                    extraEdge.getRelation().toString(),
                                    tree,
                                    extraEdge.getDependent(),
                                    tree.getIncomingEdgesSorted(newTreeRoot));
                                assert Util.isTree(x);
                              }
                            }
                          }
                        })
                    .accept(copy);
                return new SentenceFragment(copy, assumedTruth, false);
              }))) {
        break;
      }
    }

    // Find relevant auxiliary terms
    SemanticGraphEdge subjOrNull = null;
    SemanticGraphEdge objOrNull = null;
    for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
      String relString = auxEdge.getRelation().toString();
      if (relString.contains("obj")) {
        objOrNull = auxEdge;
      } else if (relString.contains("subj")) {
        subjOrNull = auxEdge;
      }
    }

    // Iterate over children
    // For each outgoing edge...
    for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
      // Prohibit indirect speech verbs from splitting off clauses
      // (e.g., 'said', 'think')
      // This fires if the governor is an indirect speech verb, and the outgoing edge is a ccomp
      if (outgoingEdge.getRelation().toString().equals("ccomp")
          && ((outgoingEdge.getGovernor().lemma() != null
                  && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
              || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
        continue;
      }
      // Get some variables
      String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
      List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
      if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
        forcedArcOrder =
            hardCodedSplits.get(
                outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
      }
      boolean doneForcedArc = false;
      // For each action...
      for (Action action :
          (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
        // Check the prerequisite
        if (!action.prerequisitesMet(tree, outgoingEdge)) {
          continue;
        }
        if (forcedArcOrder != null && doneForcedArc) {
          break;
        }
        // 1. Compute the child state
        Optional<State> candidate =
            action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
        if (candidate.isPresent()) {
          double logProbability;
          ClauseClassifierLabel bestLabel;
          Counter<String> features =
              featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
          if (forcedArcOrder != null && !doneForcedArc) {
            logProbability = 0.0;
            bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
            doneForcedArc = true;
          } else if (features.containsKey("__undocumented_junit_no_classifier")) {
            logProbability = Double.NEGATIVE_INFINITY;
            bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
          } else {
            Counter<ClauseClassifierLabel> scores = classifier.scoresOf(new RVFDatum<>(features));
            if (scores.size() > 0) {
              Counters.logNormalizeInPlace(scores);
            }
            String rel = outgoingEdge.getRelation().toString();
            if ("nsubj".equals(rel) || "dobj".equals(rel)) {
              // Always at least yield on nsubj and dobj
              scores.remove(ClauseClassifierLabel.NOT_A_CLAUSE);
            }
            logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
            bestLabel = Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
          }
          if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
            Pair<State, List<Counter<String>>> childState =
                Pair.makePair(
                    candidate.get().withIsDone(bestLabel),
                    new ArrayList<Counter<String>>(featuresSoFar) {
                      {
                        add(features);
                      }
                    });
            // 2. Register the child state
            if (!seenWords.contains(childState.first.edge.getDependent())) {
              // System.err.println("  pushing " + action.signature() + " with "
              //     + argmax.first.edge);
              fringe.add(childState, logProbability);
            }
          }
        }
      }
    }

    seenWords.add(rootWord);
  }
  // System.err.println("Search finished in " + ticks + " ticks and " + classifierEvals
  //     + " classifier evaluations.");
}
@Override
public void trainC(final ClassificationDataSet dataSet, final ExecutorService threadPool) {
  final PriorityQueue<ClassificationModelEvaluation> bestModels =
      new PriorityQueue<ClassificationModelEvaluation>(
          folds,
          new Comparator<ClassificationModelEvaluation>() {
            @Override
            public int compare(
                ClassificationModelEvaluation t, ClassificationModelEvaluation t1) {
              double v0 = t.getScoreStats(classificationTargetScore).getMean();
              double v1 = t1.getScoreStats(classificationTargetScore).getMean();
              int order = classificationTargetScore.lowerIsBetter() ? 1 : -1;
              return order * Double.compare(v0, v1);
            }
          });

  /**
   * Use this to keep track of which parameter we are altering. The index corresponds to the
   * parameter, and its value corresponds to which value has been used. Increment and carry
   * counts to iterate over all possible combinations.
   */
  int[] setTo = new int[searchParams.size()];

  /**
   * Each model is set to have a different combination of parameters. We then train each model
   * to determine the best one.
   */
  final List<Classifier> paramsToEval = new ArrayList<Classifier>();

  while (true) {
    setParameters(setTo);
    paramsToEval.add(baseClassifier.clone());
    if (incrementCombination(setTo)) break;
  }

  /*
   * This is the Executor used for training the models in parallel. If we
   * are not supposed to do that, it will be an executor that executes
   * them sequentially.
   */
  final ExecutorService modelService;
  if (trainModelsInParallel) modelService = threadPool;
  else modelService = new FakeExecutor();

  final CountDownLatch latch; // used for stopping in both cases

  // if we are doing our CV splits ahead of time, get them done now
  final List<ClassificationDataSet> preFolded;

  /** Pre-combine our training combinations so that any caching can be re-used */
  final List<ClassificationDataSet> trainCombinations;

  if (reuseSameCVFolds) {
    preFolded = dataSet.cvSet(folds);
    trainCombinations = new ArrayList<ClassificationDataSet>(preFolded.size());
    for (int i = 0; i < preFolded.size(); i++)
      trainCombinations.add(ClassificationDataSet.comineAllBut(preFolded, i));
  } else {
    preFolded = null;
    trainCombinations = null;
  }

  boolean considerWarm = useWarmStarts && baseClassifier instanceof WarmClassifier;

  /**
   * Make sure we don't do a warm start if it's only supported when trained on the same data but
   * we aren't reusing the same CV splits. So we get the truth table
   *
   * <p>a | b | (a&&b)||¬a
   *    T | T | T
   *    T | F | F
   *    F | T | T
   *    F | F | T
   *
   * <p>where a = warmFromSameDataOnly and b = reuseSameSplit. So we can instead use ¬a || b
   */
  if (considerWarm
      && (!((WarmClassifier) baseClassifier).warmFromSameDataOnly() || reuseSameCVFolds)) {
    /* we want all of the first parameter (which is the warm parameter,
     * taken care of for us) values done in a group. So we can get this
     * by just dividing up the larger list into sub lists; each sub list
     * is adjacent in the original and is the number of parameter values
     * we wanted to try
     */
    int stepSize = searchValues.get(0).size();
    int totalJobs = paramsToEval.size() / stepSize;
    latch = new CountDownLatch(totalJobs);
    for (int startPos = 0; startPos < paramsToEval.size(); startPos += stepSize) {
      final List<Classifier> subSet = paramsToEval.subList(startPos, startPos + stepSize);
      modelService.submit(
          new Runnable() {
            @Override
            public void run() {
              Classifier[] prevModels = null;
              for (Classifier c : subSet) {
                ClassificationModelEvaluation cme =
                    trainModelsInParallel
                        ? new ClassificationModelEvaluation(c, dataSet)
                        : new ClassificationModelEvaluation(c, dataSet, threadPool);
                cme.setKeepModels(true); // we need these to do warm starts!
                cme.setWarmModels(prevModels);
                cme.addScorer(classificationTargetScore.clone());
                if (reuseSameCVFolds) cme.evaluateCrossValidation(preFolded, trainCombinations);
                else cme.evaluateCrossValidation(folds);
                prevModels = cme.getKeptModels();
                synchronized (bestModels) {
                  bestModels.add(cme);
                }
              }
              latch.countDown();
            }
          });
    }
  } else // regular CV, train a new model from scratch at every step
  {
    latch = new CountDownLatch(paramsToEval.size());
    for (final Classifier toTrain : paramsToEval) {
      modelService.submit(
          new Runnable() {
            @Override
            public void run() {
              ClassificationModelEvaluation cme =
                  trainModelsInParallel
                      ? new ClassificationModelEvaluation(toTrain, dataSet)
                      : new ClassificationModelEvaluation(toTrain, dataSet, threadPool);
              cme.addScorer(classificationTargetScore.clone());
              if (reuseSameCVFolds) cme.evaluateCrossValidation(preFolded, trainCombinations);
              else cme.evaluateCrossValidation(folds);
              synchronized (bestModels) {
                bestModels.add(cme);
              }
              latch.countDown();
            }
          });
    }
  }

  // now wait for everyone to finish
  try {
    latch.await();
    // Now we know the best classifier, so we need to train one on the whole data set.
    Classifier bestClassifier = bestModels.peek().getClassifier();
    // Just re-train it on the whole set
    if (trainFinalModel) {
      // try and warm start the final model if we can; the warmFromSameDataOnly check is
      // needed to make sure we can do this warm train
      if (useWarmStarts
          && bestClassifier instanceof WarmClassifier
          && !((WarmClassifier) bestClassifier).warmFromSameDataOnly()) {
        WarmClassifier wc = (WarmClassifier) bestClassifier;
        if (threadPool instanceof FakeExecutor) wc.trainC(dataSet, wc.clone());
        else wc.trainC(dataSet, wc.clone(), threadPool);
      } else {
        if (threadPool instanceof FakeExecutor) bestClassifier.trainC(dataSet);
        else bestClassifier.trainC(dataSet, threadPool);
      }
    }
    trainedClassifier = bestClassifier;
  } catch (InterruptedException ex) {
    Logger.getLogger(GridSearch.class.getName()).log(Level.SEVERE, null, ex);
  }
}
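// A hedged usage sketch for the grid search above, assuming a JSAT-style API
// (constructor over a base classifier plus a fold count, candidate values registered per
// named parameter); the parameter name and setter names are assumptions and may differ
// across versions.
static void gridSearchDemo(ClassificationDataSet trainingSet) {
  GridSearch search = new GridSearch(new LogisticRegressionDCD(), 10); // 10 CV folds
  search.addParameter("Lambda", 1e-4, 1e-3, 1e-2, 1e-1); // hypothetical parameter name
  search.setUseWarmStarts(true); // reuse adjacent models when the learner supports it
  search.trainC(trainingSet);
  Classifier best = search.getTrainedClassifier();
  System.out.println("Best model: " + best);
}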