/**
 * Computes the Dice coefficient of a French/English word pair from the
 * sentence-level counters: {@code 2 * c(f,e) / (c(f) + c(e))}.
 *
 * @param f the French word
 * @param e the English word
 * @return the Dice coefficient, or {@code 0.0} when both individual counts
 *         are zero (the plain formula would evaluate 0/0 and yield
 *         {@code NaN}, which silently poisons later comparisons)
 */
private double getDiceCoefficient(String f, String e) {
    double intersection = collocationCountSentences.getCount(f, e);
    double cardinalityF = fCountSentences.getCount(f);
    double cardinalityE = eCountSentences.getCount(e);
    double denominator = cardinalityF + cardinalityE;
    // Neither word has been counted: report "no association" rather than NaN.
    if (denominator == 0.0) {
        return 0.0;
    }
    return 2 * intersection / denominator;
}
/** {@inheritDoc} */
@Override
public synchronized void incrAllCounters(AbstractCounters<Counter, CounterGroup> other) {
    // Walk every counter of every group in `other` and add its value to the
    // counter that findCounter returns here for the same (group, counter) names.
    for (CounterGroup sourceGroup : other) {
        for (Counter sourceCounter : sourceGroup) {
            String groupName = sourceGroup.getName();
            String counterName = sourceCounter.getName();
            findCounter(groupName, counterName).increment(sourceCounter.getValue());
        }
    }
}
public Alignment alignSentencePair(SentencePair sentencePair) { Alignment alignment = new Alignment(); List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); int englishMaxPosition = frenchPosition; if (englishMaxPosition >= numEnglishWords) englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words double maxConditionalProb = 0; for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e)); if (conditionalGivenEnglish > maxConditionalProb) { maxConditionalProb = conditionalGivenEnglish; englishMaxPosition = englishPosition; } } alignment.addAlignment(englishMaxPosition, frenchPosition, true); } return alignment; }
/**
 * Collects sentence-level statistics from the training pairs: each distinct
 * French word, each distinct English word, and each distinct
 * (French, English) combination is counted once per sentence pair in which
 * it occurs.
 */
private void trainCounters() {
    for (SentencePair pair : trainingSentencePairs) {
        // De-duplicate so that repeated words count once per sentence.
        Set<String> distinctFrench = new HashSet<String>(pair.getFrenchWords());
        Set<String> distinctEnglish = new HashSet<String>(pair.getEnglishWords());

        fCountSentences.incrementAll(distinctFrench, 1.0);
        eCountSentences.incrementAll(distinctEnglish, 1.0);

        for (String frenchWord : distinctFrench) {
            for (String englishWord : distinctEnglish) {
                collocationCountSentences.incrementCount(frenchWord, englishWord, 1.0);
            }
        }
    }
    System.out.println("Trained!");
}
protected void addTag() { // add tag for each stmt Iterator it = iterator(); // int count = 0; while (it.hasNext()) { JPegStmt stmt = (JPegStmt) it.next(); int count = Counter.getTagNo(); // count++; StringTag t = new StringTag(Integer.toString(count)); stmt.addTag(t); } }
private void trainCounters() { for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); //fCounts.incrementAll(frenchWords, 1.0); // won't affect the argMax eCounts.incrementAll(englishWords, 1.0); for (String f: frenchWords) { for (String e: englishWords) collocationCounts.incrementCount(f, e, 1.0); } } System.out.println("Trained!"); }
private Duple<CrownOperations.Reason, ISynset> getEstimatedSynonym( String targetLemma, Set<String> synonyms, POS pos, String gloss) { Counter<ISynset> synsetCounts = new ObjectCounter<ISynset>(); List<String> lemmasInWn = new ArrayList<String>(); for (String lemma : synonyms) { // Get the WordNet sysnet if it exists Set<ISynset> senses = WordNetUtils.getSynsets(dict, lemma, pos); if (senses.isEmpty()) continue; lemmasInWn.add(lemma); synsetCounts.countAll(senses); // Get the hypernyms of the synset and count their occurrence too for (ISynset synset : senses) { // Do a sanity check that avoids attaching this Entry if its // lemma appears anywhere near the synonoyms. This check // potentially has some false positives since we might avoid // putting the lemma somewhere valid (in which case it would // have more than would valid location) but is used to avoid // noisy integration if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, synset)) { return null; } for (ISynsetID hyper : synset.getRelatedSynsets(Pointer.HYPERNYM)) { ISynset hyperSyn = dict.getSynset(hyper); if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, hyperSyn)) { return null; } synsetCounts.count(hyperSyn); } } } // Return null if we couldn't find any of the lemma's synonyms or // hyponyms in WordNet if (synsetCounts.items().isEmpty()) return null; // If there was only one lemma in this list in WordNet, try comparing // the glosses for just that word to find a match if (lemmasInWn.size() == 1) { double maxScore = 0; ISynset best = null; String bestGloss = null; Set<ISynset> candidateSynonymSynsets = WordNetUtils.getSynsets(dict, lemmasInWn.get(0), pos); for (ISynset candidate : candidateSynonymSynsets) { String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate); double score = simFunc.compare(gloss, wnExtendedGloss); if (maxScore < score) { maxScore = score; best = candidate; bestGloss = wnExtendedGloss; } } CrownOperations.Reason r = new 
CrownOperations.Reason(getClass()); r.set("relation_type", "synonym"); r.set("heuristic", "single-synonym"); r.set("max_score", maxScore); return new Duple<CrownOperations.Reason, ISynset>(r, best); } else { // Check for whether there were ties in the max ISynset mostFreq = synsetCounts.max(); int mostFreqCount = synsetCounts.getCount(mostFreq); List<ISynset> ties = new ArrayList<ISynset>(); for (ISynset syn : synsetCounts.items()) { int c = synsetCounts.getCount(syn); if (c == mostFreqCount) ties.add(syn); } // If there was only one synset that had the maximum count, then we // report this if (ties.size() == 1) { CrownOperations.Reason r = new CrownOperations.Reason(getClass()); r.set("relation_type", "synonym"); r.set("heuristic", "unambiguous-max"); r.set("count", mostFreqCount); return new Duple<CrownOperations.Reason, ISynset>(r, mostFreq); } // Otherwise, we try breaking ties between the synsets using gloss // similarity else { double maxScore = 0; ISynset best = null; String bestGloss = null; for (ISynset candidate : ties) { String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate); double score = simFunc.compare(gloss, wnExtendedGloss); if (maxScore < score) { maxScore = score; best = candidate; bestGloss = wnExtendedGloss; } } CrownOperations.Reason r = new CrownOperations.Reason(getClass()); r.set("relation_type", "synonym"); r.set("heuristic", "tied-synonyms"); r.set("max_score", maxScore); return new Duple<CrownOperations.Reason, ISynset>(r, best); } } }
private CounterMap<String,String> trainEM(int maxIterations) { Set<String> englishVocab = new HashSet<String>(); Set<String> frenchVocab = new HashSet<String>(); CounterMap<String,String> translations = new CounterMap<String,String>(); englishVocab.add(NULL); int iteration = 0; final double thresholdProb = 0.0001; for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); // add words from list to vocabulary sets englishVocab.addAll(englishWords); frenchVocab.addAll(frenchWords); } System.out.println("Ready"); // We need to initialize translations.getCount(f,e) uniformly // t(f|e) summed over all e in {E + NULL} = 1 final double initialCount = 1.0 / englishVocab.size(); while(iteration < maxIterations) { CounterMap<String,String> counts = new CounterMap<String,String>(); // set count(f|e) to 0 for all e,f Counter<String> totalEnglish = new Counter<String>(); // set total(e) to 0 for all e // E-step: loop over all sentences and update counts for (SentencePair sentencePair : trainingSentencePairs) { List<String> frenchWords = sentencePair.getFrenchWords(); List<String> englishWords = sentencePair.getEnglishWords(); int numFrenchWords = frenchWords.size(); int numEnglishWords = englishWords.size(); Counter<String> sTotalF = new Counter<String>(); // compute normalization constant sTotalF for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // initialize and compute for English = NULL if (!translations.containsKey(f) && initialize) translations.setCount(f, NULL, initialCount); else if (!translations.containsKey(f)) translations.setCount(f, NULL, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, NULL)); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); if 
(!(translations.getCounter(f)).containsKey(e) && initialize) translations.setCount(f, e, initialCount); else if (!(translations.getCounter(f)).containsKey(e)) translations.setCount(f, e, thresholdProb); sTotalF.incrementCount(f, translations.getCount(f, e)); } } // collect counts in counts and totalEnglish for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) { String f = frenchWords.get(frenchPosition); // collect counts for English = NULL double count = translations.getCount(f, NULL) / sTotalF.getCount(f); counts.incrementCount(NULL, f, count); totalEnglish.incrementCount(NULL, count); for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) { String e = englishWords.get(englishPosition); count = translations.getCount(f, e) / sTotalF.getCount(f); counts.incrementCount(e, f, count); totalEnglish.incrementCount(e, count); } } } // end of E-step System.out.println("Completed E-step"); // M-step: update probabilities with counts from E-step and check for convergence iteration++; for (String e : counts.keySet()) {//englishVocab) { double normalizer = totalEnglish.getCount(e); for (String f : (counts.getCounter(e)).keySet()) {//frenchVocab) { // To speed implementation, we want to update translations only when count / normalizer > threshold double prob = counts.getCount(e, f) / normalizer; if (!initialize) { if (prob > thresholdProb) translations.setCount(f, e, prob); else (translations.getCounter(f)).removeKey(e); } else { translations.setCount(f, e, prob); } } } System.out.println("Completed iteration " + iteration); } // end of M-step System.out.println("Trained!"); return translations; }