public SentenceStatistics mean() { double sumConfidence = 0; int countWithConfidence = 0; Counter<String> avePredictions = new ClassicCounter<>(MapFactory.<String, MutableDouble>linkedHashMapFactory()); // Sum for (SentenceStatistics stat : this.statisticsForClassifiers) { for (Double confidence : stat.confidence) { sumConfidence += confidence; countWithConfidence += 1; } assert Math.abs(stat.relationDistribution.totalCount() - 1.0) < 1e-5; for (Map.Entry<String, Double> entry : stat.relationDistribution.entrySet()) { assert entry.getValue() >= 0.0; assert entry.getValue() == stat.relationDistribution.getCount(entry.getKey()); avePredictions.incrementCount(entry.getKey(), entry.getValue()); assert stat.relationDistribution.getCount(entry.getKey()) == stat.relationDistribution.getCount(entry.getKey()); } } // Normalize double aveConfidence = sumConfidence / ((double) countWithConfidence); // Return if (this.statisticsForClassifiers.size() > 1) { Counters.divideInPlace(avePredictions, (double) this.statisticsForClassifiers.size()); } if (Math.abs(avePredictions.totalCount() - 1.0) > 1e-5) { throw new IllegalStateException("Mean relation distribution is not a distribution!"); } assert this.statisticsForClassifiers.size() > 1 || this.statisticsForClassifiers.size() == 0 || Counters.equals( avePredictions, statisticsForClassifiers.iterator().next().relationDistribution, 1e-5); return countWithConfidence > 0 ? new SentenceStatistics(avePredictions, aveConfidence) : new SentenceStatistics(avePredictions); }
public List<Pair<String, Double>> selectWeightedKeysWithSampling( ActiveLearningSelectionCriterion criterion, int numSamples, int seed) { List<Pair<String, Double>> result = new ArrayList<>(); forceTrack("Sampling Keys"); log("" + numSamples + " to collect"); // Get uncertainty forceTrack("Computing Uncertainties"); Counter<String> weightCounter = uncertainty(criterion); assert weightCounter.equals(uncertainty(criterion)); endTrack("Computing Uncertainties"); // Compute some statistics startTrack("Uncertainty Histogram"); // log(new Histogram(weightCounter, 50).toString()); // removed to make the release easier // (Histogram isn't in CoreNLP) endTrack("Uncertainty Histogram"); double totalCount = weightCounter.totalCount(); Random random = new Random(seed); // Flatten counter List<String> keys = new LinkedList<>(); List<Double> weights = new LinkedList<>(); List<String> zeroUncertaintyKeys = new LinkedList<>(); for (Pair<String, Double> elem : Counters.toSortedListWithCounts( weightCounter, (o1, o2) -> { int value = o1.compareTo(o2); if (value == 0) { return o1.first.compareTo(o2.first); } else { return value; } })) { if (elem.second != 0.0 || weightCounter.totalCount() == 0.0 || weightCounter.size() <= numSamples) { // ignore 0 probability weights keys.add(elem.first); weights.add(elem.second); } else { zeroUncertaintyKeys.add(elem.first); } } // Error check if (Utils.assertionsEnabled()) { for (Double elem : weights) { if (!(elem >= 0 && !Double.isInfinite(elem) && !Double.isNaN(elem))) { throw new IllegalArgumentException("Invalid weight: " + elem); } } } // Sample SAMPLE_ITER: for (int i = 1; i <= numSamples; ++i) { // For each sample if (i % 1000 == 0) { // Debug log log("sampled " + (i / 1000) + "k keys"); // Recompute total count to mitigate floating point errors totalCount = 0.0; for (double val : weights) { totalCount += val; } } if (weights.size() == 0) { continue; } assert totalCount >= 0.0; assert weights.size() == keys.size(); double target = random.nextDouble() * totalCount; Iterator<String> keyIter = keys.iterator(); Iterator<Double> weightIter = weights.iterator(); double runningTotal = 0.0; while (keyIter.hasNext()) { // For each candidate String key = keyIter.next(); double weight = weightIter.next(); runningTotal += weight; if (target <= runningTotal) { // Select that sample result.add(Pair.makePair(key, weight)); keyIter.remove(); weightIter.remove(); totalCount -= weight; continue SAMPLE_ITER; // continue sampling } } // We should get here only if the keys list is empty warn( "No more uncertain samples left to draw from! (target=" + target + " totalCount=" + totalCount + " size=" + keys.size()); assert keys.size() == 0; if (zeroUncertaintyKeys.size() > 0) { result.add(Pair.makePair(zeroUncertaintyKeys.remove(0), 0.0)); } else { break; } } endTrack("Sampling Keys"); return result; }
/** @param args */ public static void main(String[] args) { if (args.length != 3) { System.err.printf( "Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName()); System.exit(-1); } Language language = Language.valueOf(args[0]); TreebankLangParserParams tlpp = language.params; if (language.equals(Language.Arabic)) { String[] options = {"-arabicFactored"}; tlpp.setOptionFlag(options, 0); } else { String[] options = {"-frenchFactored"}; tlpp.setOptionFlag(options, 0); } Treebank tb = tlpp.diskTreebank(); tb.loadPath(args[1]); MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification(); String[] features = args[2].trim().split(","); for (String feature : features) { morphoSpec.activate(MorphoFeatureType.valueOf(feature)); } // Counters Counter<String> wordTagCounter = new ClassicCounter<>(30000); Counter<String> morphTagCounter = new ClassicCounter<>(500); // Counter<String> signatureTagCounter = new ClassicCounter<String>(); Counter<String> morphCounter = new ClassicCounter<>(500); Counter<String> wordCounter = new ClassicCounter<>(30000); Counter<String> tagCounter = new ClassicCounter<>(300); Counter<String> lemmaCounter = new ClassicCounter<>(25000); Counter<String> lemmaTagCounter = new ClassicCounter<>(25000); Counter<String> richTagCounter = new ClassicCounter<>(1000); Counter<String> reducedTagCounter = new ClassicCounter<>(500); Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500); Map<String, Set<String>> wordLemmaMap = Generics.newHashMap(); TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000); TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500); TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300); int numTrees = 0; for (Tree tree : tb) { for (Tree subTree : tree) { if (!subTree.isLeaf()) { tlpp.transformTree(subTree, tree); } } List<Label> pretermList = tree.preTerminalYield(); List<Label> yield = tree.yield(); assert yield.size() == pretermList.size(); int yieldLen = yield.size(); for (int i = 0; i < yieldLen; ++i) { String tag = pretermList.get(i).value(); String word = yield.get(i).value(); String morph = ((CoreLabel) yield.get(i)).originalText(); // Note: if there is no lemma, then we use the surface form. Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph); String lemma = lemmaTag.first(); String richTag = lemmaTag.second(); // WSGDEBUG if (tag.contains("MW")) lemma += "-MWE"; lemmaCounter.incrementCount(lemma); lemmaTagCounter.incrementCount(lemma + tag); richTagCounter.incrementCount(richTag); String reducedTag = morphoSpec.strToFeatures(richTag).toString(); reducedTagCounter.incrementCount(reducedTag); reducedTagLemmaCounter.incrementCount(reducedTag + lemma); wordTagCounter.incrementCount(word + tag); morphTagCounter.incrementCount(morph + tag); morphCounter.incrementCount(morph); wordCounter.incrementCount(word); tagCounter.incrementCount(tag); reducedTag = reducedTag.equals("") ? "NONE" : reducedTag; if (wordLemmaMap.containsKey(word)) { wordLemmaMap.get(word).add(lemma); } else { Set<String> lemmas = Generics.newHashSet(1); wordLemmaMap.put(word, lemmas); } lemmaReducedTagCounter.incrementCount(lemma, reducedTag); reducedTagTagCounter.incrementCount(lemma + reducedTag, tag); tagReducedTagCounter.incrementCount(tag, reducedTag); } ++numTrees; } // Barf... System.out.println("Language: " + language.toString()); System.out.printf("#trees:\t%d%n", numTrees); System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount()); System.out.printf("#words:\t%d%n", wordCounter.keySet().size()); System.out.printf("#tags:\t%d%n", tagCounter.keySet().size()); System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size()); System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size()); System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size()); System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size()); System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size()); System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size()); System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size()); System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size()); // Extra System.out.println("=================="); StringBuilder sbNoLemma = new StringBuilder(); StringBuilder sbMultLemmas = new StringBuilder(); for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) { String word = wordLemmas.getKey(); Set<String> lemmas = wordLemmas.getValue(); if (lemmas.size() == 0) { sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n"); continue; } if (lemmas.size() > 1) { sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n"); continue; } String lemma = lemmas.iterator().next(); Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet(); if (reducedTags.size() > 1) { System.out.printf("%s --> %s%n", word, lemma); for (String reducedTag : reducedTags) { int count = lemmaReducedTagCounter.getCount(lemma, reducedTag); String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet()); System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags); } System.out.println(); } } System.out.println("=================="); System.out.println(sbNoLemma.toString()); System.out.println(sbMultLemmas.toString()); System.out.println("=================="); List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet()); Collections.sort(tags); for (String tag : tags) { System.out.println(tag); Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet(); for (String reducedTag : reducedTags) { int count = tagReducedTagCounter.getCount(tag, reducedTag); // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag; System.out.printf("\t%s\t%d%n", reducedTag, count); } System.out.println(); } System.out.println("=================="); }