private double getDiceCoefficient(String f, String e) {
		  double intersection = collocationCountSentences.getCount(f,e);
		  double cardinalityF = fCountSentences.getCount(f);
		  double cardinalityE = eCountSentences.getCount(e);
		  
		  double dice = 2*intersection / (cardinalityF + cardinalityE);
		  return dice;
	  }
 /** {@inheritDoc} */
 @Override
 public synchronized void incrAllCounters(AbstractCounters<Counter, CounterGroup> other) {
   for (CounterGroup group : other) {
     for (Counter counter : group) {
       findCounter(group.getName(), counter.getName()).increment(counter.getValue());
     }
   }
 }
	  public Alignment alignSentencePair(SentencePair sentencePair) {
		  Alignment alignment = new Alignment();
	      List<String> frenchWords = sentencePair.getFrenchWords();
	      List<String> englishWords = sentencePair.getEnglishWords();     
	      int numFrenchWords = frenchWords.size();
	      int numEnglishWords = englishWords.size();
	      
	      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
	    	  String f = frenchWords.get(frenchPosition);
	    	  int englishMaxPosition = frenchPosition;
	    	  if (englishMaxPosition >= numEnglishWords)
	    		  englishMaxPosition = -1; // map French word to BASELINE if c(f,e) = 0 for all English words
	    	  double maxConditionalProb = 0;
	    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
	    		  String e = englishWords.get(englishPosition);
	    		  double conditionalGivenEnglish = collocationCounts.getCount(f, e) / (eCounts.getCount(e));
	    		  if (conditionalGivenEnglish > maxConditionalProb) {
	    			  maxConditionalProb = conditionalGivenEnglish;
	    			  englishMaxPosition = englishPosition;
	    		  }
	    	  }	
	    	  alignment.addAlignment(englishMaxPosition, frenchPosition, true);
	      }
		  return alignment;
	  }
	  private void trainCounters() {
		  for (SentencePair sentencePair : trainingSentencePairs) {
			  List<String> frenchWords = sentencePair.getFrenchWords();
		      List<String> englishWords = sentencePair.getEnglishWords();
		      Set<String> frenchSet = new HashSet<String>(frenchWords);
		      Set<String> englishSet = new HashSet<String>(englishWords);
		      
		      fCountSentences.incrementAll(frenchSet, 1.0); 
		      eCountSentences.incrementAll(englishSet, 1.0);
		      
		      for (String f: frenchSet) {
		    	  for (String e: englishSet)
		    		  collocationCountSentences.incrementCount(f, e, 1.0);
		      }
		  }
		  System.out.println("Trained!");
	  }
Exemplo n.º 5
0
  protected void addTag() {
    // add tag for each stmt
    Iterator it = iterator();
    //		int count = 0;

    while (it.hasNext()) {
      JPegStmt stmt = (JPegStmt) it.next();
      int count = Counter.getTagNo();
      //			count++;
      StringTag t = new StringTag(Integer.toString(count));
      stmt.addTag(t);
    }
  }
	  private void trainCounters() {
		  for (SentencePair sentencePair : trainingSentencePairs) {
			  List<String> frenchWords = sentencePair.getFrenchWords();
		      List<String> englishWords = sentencePair.getEnglishWords();
		      
		      //fCounts.incrementAll(frenchWords, 1.0); // won't affect the argMax
		      eCounts.incrementAll(englishWords, 1.0);
		      
		      for (String f: frenchWords) {
		    	  for (String e: englishWords)
		    		  collocationCounts.incrementCount(f, e, 1.0);
		      }
		  }
		  System.out.println("Trained!");
	  }
  private Duple<CrownOperations.Reason, ISynset> getEstimatedSynonym(
      String targetLemma, Set<String> synonyms, POS pos, String gloss) {

    Counter<ISynset> synsetCounts = new ObjectCounter<ISynset>();

    List<String> lemmasInWn = new ArrayList<String>();
    for (String lemma : synonyms) {
      // Get the WordNet sysnet if it exists
      Set<ISynset> senses = WordNetUtils.getSynsets(dict, lemma, pos);
      if (senses.isEmpty()) continue;

      lemmasInWn.add(lemma);
      synsetCounts.countAll(senses);

      // Get the hypernyms of the synset and count their occurrence too
      for (ISynset synset : senses) {
        // Do a sanity check that avoids attaching this Entry if its
        // lemma appears anywhere near the synonoyms.  This check
        // potentially has some false positives since we might avoid
        // putting the lemma somewhere valid (in which case it would
        // have more than would valid location) but is used to avoid
        // noisy integration
        if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, synset)) {
          return null;
        }

        for (ISynsetID hyper : synset.getRelatedSynsets(Pointer.HYPERNYM)) {
          ISynset hyperSyn = dict.getSynset(hyper);
          if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, hyperSyn)) {
            return null;
          }
          synsetCounts.count(hyperSyn);
        }
      }
    }

    // Return null if we couldn't find any of the lemma's synonyms or
    // hyponyms in WordNet
    if (synsetCounts.items().isEmpty()) return null;

    // If there was only one lemma in this list in WordNet, try comparing
    // the glosses for just that word to find a match
    if (lemmasInWn.size() == 1) {
      double maxScore = 0;
      ISynset best = null;
      String bestGloss = null;
      Set<ISynset> candidateSynonymSynsets = WordNetUtils.getSynsets(dict, lemmasInWn.get(0), pos);
      for (ISynset candidate : candidateSynonymSynsets) {

        String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate);
        double score = simFunc.compare(gloss, wnExtendedGloss);
        if (maxScore < score) {
          maxScore = score;
          best = candidate;
          bestGloss = wnExtendedGloss;
        }
      }

      CrownOperations.Reason r = new CrownOperations.Reason(getClass());
      r.set("relation_type", "synonym");
      r.set("heuristic", "single-synonym");
      r.set("max_score", maxScore);
      return new Duple<CrownOperations.Reason, ISynset>(r, best);
    } else {
      // Check for whether there were ties in the max
      ISynset mostFreq = synsetCounts.max();
      int mostFreqCount = synsetCounts.getCount(mostFreq);
      List<ISynset> ties = new ArrayList<ISynset>();
      for (ISynset syn : synsetCounts.items()) {
        int c = synsetCounts.getCount(syn);
        if (c == mostFreqCount) ties.add(syn);
      }

      // If there was only one synset that had the maximum count, then we
      // report this
      if (ties.size() == 1) {

        CrownOperations.Reason r = new CrownOperations.Reason(getClass());
        r.set("relation_type", "synonym");
        r.set("heuristic", "unambiguous-max");
        r.set("count", mostFreqCount);
        return new Duple<CrownOperations.Reason, ISynset>(r, mostFreq);
      }
      // Otherwise, we try breaking ties between the synsets using gloss
      // similarity
      else {

        double maxScore = 0;
        ISynset best = null;
        String bestGloss = null;
        for (ISynset candidate : ties) {
          String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate);
          double score = simFunc.compare(gloss, wnExtendedGloss);
          if (maxScore < score) {
            maxScore = score;
            best = candidate;
            bestGloss = wnExtendedGloss;
          }
        }

        CrownOperations.Reason r = new CrownOperations.Reason(getClass());
        r.set("relation_type", "synonym");
        r.set("heuristic", "tied-synonyms");
        r.set("max_score", maxScore);
        return new Duple<CrownOperations.Reason, ISynset>(r, best);
      }
    }
  }
	  private CounterMap<String,String> trainEM(int maxIterations) {
		  Set<String> englishVocab = new HashSet<String>();
		  Set<String> frenchVocab = new HashSet<String>();
		  
		  CounterMap<String,String> translations = new CounterMap<String,String>();
		  englishVocab.add(NULL);
		  int iteration = 0;
		  final double thresholdProb = 0.0001;
		  
		  for (SentencePair sentencePair : trainingSentencePairs) {
			  List<String> frenchWords = sentencePair.getFrenchWords();
			  List<String> englishWords = sentencePair.getEnglishWords();
			  // add words from list to vocabulary sets
			  englishVocab.addAll(englishWords);
			  frenchVocab.addAll(frenchWords);
		  }
		  System.out.println("Ready");
		  
		  // We need to initialize translations.getCount(f,e) uniformly
		  // t(f|e) summed over all e in {E + NULL} = 1
		  final double initialCount = 1.0 / englishVocab.size();
		  
		  while(iteration < maxIterations) {
			  CounterMap<String,String> counts = new CounterMap<String,String>(); // set count(f|e) to 0 for all e,f
			  Counter<String> totalEnglish = new Counter<String>(); // set total(e) to 0 for all e
			  
			  // E-step: loop over all sentences and update counts
			  for (SentencePair sentencePair : trainingSentencePairs) {
				  List<String> frenchWords = sentencePair.getFrenchWords();
				  List<String> englishWords = sentencePair.getEnglishWords();
				  
			      int numFrenchWords = frenchWords.size();
			      int numEnglishWords = englishWords.size();
			      Counter<String> sTotalF = new Counter<String>(); 
			      
			      // compute normalization constant sTotalF
			      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
			    	  String f = frenchWords.get(frenchPosition);
			    	  // initialize and compute for English = NULL
			    	  if (!translations.containsKey(f) && initialize)
			    		  translations.setCount(f, NULL, initialCount);
			    	  else if (!translations.containsKey(f))
			    		  translations.setCount(f, NULL, thresholdProb);
			    	  sTotalF.incrementCount(f, translations.getCount(f, NULL)); 
			    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
			    		  String e = englishWords.get(englishPosition);
			    		  if (!(translations.getCounter(f)).containsKey(e) && initialize)
			    			  translations.setCount(f, e, initialCount);
			    		  else if (!(translations.getCounter(f)).containsKey(e))
			    			  translations.setCount(f, e, thresholdProb);
			    		  sTotalF.incrementCount(f, translations.getCount(f, e));
			    	  }
			      }
			      
			      // collect counts in counts and totalEnglish
			      for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
			    	  String f = frenchWords.get(frenchPosition);
			    	  
			    	  // collect counts for English = NULL
			    	  double count = translations.getCount(f, NULL) / sTotalF.getCount(f);
			    	  counts.incrementCount(NULL, f, count);
			    	  totalEnglish.incrementCount(NULL, count);
			    	  for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
			    		  String e = englishWords.get(englishPosition);
			    		  count = translations.getCount(f, e) / sTotalF.getCount(f);
			    		  counts.incrementCount(e, f, count);
			    		  totalEnglish.incrementCount(e, count);
			    	  }
			      }
			  } // end of E-step
			  System.out.println("Completed E-step");
			  
			  // M-step: update probabilities with counts from E-step and check for convergence
			  iteration++;
			  for (String e : counts.keySet()) {//englishVocab) {
				  double normalizer = totalEnglish.getCount(e);
				  for (String f : (counts.getCounter(e)).keySet()) {//frenchVocab) {
					  
					  // To speed implementation, we want to update translations only when count / normalizer > threshold
					  double prob = counts.getCount(e, f) / normalizer;
					  if (!initialize) {					  
						  if (prob > thresholdProb)
							  translations.setCount(f, e, prob);
						  else
							  (translations.getCounter(f)).removeKey(e);
					  }
					  else {
						  translations.setCount(f, e, prob);
					  }
				  }
			  }
			  System.out.println("Completed iteration " + iteration);
		  } // end of M-step
		  
		  System.out.println("Trained!");
		  return translations;
	  }