Example no. 1
  /**
   * GT smoothing with least squares interpolation. This follows the procedure in Jurafsky and
   * Martin sect. 4.5.3.
   */
  public void smoothAndNormalize() {
    Counter<Integer> cntCounter = new Counter<Integer>();
    for (K tok : lm.keySet()) {
      int cnt = (int) lm.getCount(tok);
      cntCounter.incrementCount(cnt);
    }

    final double[] coeffs = runLogSpaceRegression(cntCounter);

    UNK_PROB = cntCounter.getCount(1) / lm.totalCount();

    for (K tok : lm.keySet()) {
      double tokCnt = lm.getCount(tok);
      if (tokCnt <= unkCutoff) { // Treat as unknown
        unkTokens.add(tok);
      }
      if (tokCnt <= kCutoff) { // Smooth
        double cSmooth = katzEstimate(cntCounter, tokCnt, coeffs);
        lm.setCount(tok, cSmooth);
      }
    }

    // Normalization is intentionally skipped here: the counter keeps itself
    // normalized, so Counters.normalize(lm) is unnecessary.
  }
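
A note on the unseen mass assigned above: setting UNK_PROB to N_1 / N (the number of tokens seen exactly once over the total token count) is the standard Good-Turing estimate of the total probability of unseen events,

  P(unseen) = N_1 / N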
Example no. 2

  /* Dice coefficient of f and e over sentence-level co-occurrence counts. */
  private double getDiceCoefficient(String f, String e) {
    double intersection = collocationCountSentences.getCount(f, e);
    double cardinalityF = fCountSentences.getCount(f);
    double cardinalityE = eCountSentences.getCount(e);

    double dice = 2 * intersection / (cardinalityF + cardinalityE);
    return dice;
  }
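
Written out, the quantity computed above is the Dice coefficient:

  Dice(f, e) = 2 * c(f, e) / (c(f) + c(e))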
Example no. 3

 /* Returns a smoothed estimate of P(word|tag) */
 public double scoreTagging(String word, String tag) {
   double p_tag = tagCounter.getCount(tag) / totalTokens;
   double c_word = wordCounter.getCount(word);
   double c_tag_and_word = wordToTagCounters.getCount(word, tag);
   if (c_word < 10) { // rare or unknown
     c_word += 1.0;
     c_tag_and_word += typeTagCounter.getCount(tag) / totalWordTypes;
   }
   double p_word = (1.0 + c_word) / (totalTokens + totalWordTypes); // add-one smoothed P(word)
   double p_tag_given_word = c_tag_and_word / c_word;
   return p_tag_given_word / p_tag * p_word; // Bayes: P(word|tag) = P(tag|word) * P(word) / P(tag)
 }
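
The return value is just Bayes' rule applied to the smoothed estimates,

  P(word | tag) = P(tag | word) * P(word) / P(tag)

with add-one smoothing on P(word) and the rare-word adjustment above filling in mass for unseen (word, tag) pairs.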
Example no. 4
 /**
  * Builds a Trellis over a sentence, starting at the start state and advancing through all
  * legal extensions of each state already in the trellis. You should not have to modify this
  * code (or even read it, really).
  */
 private Trellis<State> buildTrellis(List<String> sentence) {
   Trellis<State> trellis = new Trellis<State>();
   trellis.setStartState(State.getStartState());
   State stopState = State.getStopState(sentence.size() + 2);
   trellis.setStopState(stopState);
   Set<State> states = Collections.singleton(State.getStartState());
   for (int position = 0; position <= sentence.size() + 1; position++) {
     Set<State> nextStates = new HashSet<State>();
     for (State state : states) {
       if (state.equals(stopState)) continue;
       LocalTrigramContext localTrigramContext =
           new LocalTrigramContext(
               sentence, position, state.getPreviousPreviousTag(), state.getPreviousTag());
       Counter<String> tagScores = localTrigramScorer.getLogScoreCounter(localTrigramContext);
       for (String tag : tagScores.keySet()) {
         double score = tagScores.getCount(tag);
         State nextState = state.getNextState(tag);
         trellis.setTransitionCount(state, nextState, score);
         nextStates.add(nextState);
       }
     }
     states = nextStates;
   }
   return trellis;
 }
Example no. 5

  public Alignment alignSentencePair(SentencePair sentencePair) {
    Alignment alignment = new Alignment();
    List<String> frenchWords = sentencePair.getFrenchWords();
    List<String> englishWords = sentencePair.getEnglishWords();
    int numFrenchWords = frenchWords.size();
    int numEnglishWords = englishWords.size();

    for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
      String f = frenchWords.get(frenchPosition);
      int englishMaxPosition = frenchPosition;
      if (englishMaxPosition >= numEnglishWords)
        englishMaxPosition = -1; // map the French word to the baseline (NULL) if c(f,e) = 0 for all English words
      double maxConditionalProb = 0;
      for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
        String e = englishWords.get(englishPosition);
        double conditionalGivenEnglish = collocationCounts.getCount(f, e) / eCounts.getCount(e);
        if (conditionalGivenEnglish > maxConditionalProb) {
          maxConditionalProb = conditionalGivenEnglish;
          englishMaxPosition = englishPosition;
        }
      }
      alignment.addAlignment(englishMaxPosition, frenchPosition, true);
    }
    return alignment;
  }
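
Each French word f_j is aligned greedily to the English position with the highest estimated conditional probability,

  a_j = argmax_i  c(f_j, e_i) / c(e_i)

defaulting to the diagonal position (or to -1, i.e. NULL, when the French position is past the end of the English sentence) if c(f_j, e) = 0 for every English word e.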
Example no. 6
  /* Katz smoothed count c* (Good-Turing with cutoff k), using counts-of-counts
   * from cnt and the log-linear fit in coeffs for empty buckets. */
  private double katzEstimate(Counter<Integer> cnt, double c, double[] coeffs) {
    double nC = cnt.getCount((int) c);
    double nC1 = cnt.getCount(((int) c) + 1);
    if (nC1 == 0.0) nC1 = Math.exp(coeffs[0] + (coeffs[1] * (c + 1.0)));

    double n1 = cnt.getCount(1);
    double nK1 = cnt.getCount(((int) kCutoff) + 1);
    if (nK1 == 0.0) nK1 = Math.exp(coeffs[0] + (coeffs[1] * (kCutoff + 1.0)));

    double kTerm = (kCutoff + 1.0) * (nK1 / n1);
    double cTerm = (c + 1.0) * (nC1 / nC);

    double cSmooth = (cTerm - (c * kTerm)) / (1.0 - kTerm);

    return cSmooth;
  }
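
This is Katz's discounted count (the Jurafsky and Martin sect. 4.5.3 formula), with counts-of-counts N_c and cutoff k:

  c* = [ (c+1) * N_{c+1}/N_c  -  c * (k+1) * N_{k+1}/N_1 ] / [ 1 - (k+1) * N_{k+1}/N_1 ]

Any empty N_c bucket is filled in from the log-linear fit exp(a + b*c) produced by runLogSpaceRegression.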
Example no. 7
 public double getCount(K token) {
   if (!lm.containsKey(token)) {
     throw new RuntimeException("token not in keyset (keyset size: " + lm.keySet().size() + ")");
   }
   return lm.getCount(token);
 }
Example no. 8

 /* Builds a PCFG, using the observed counts of binary and unary
  * productions in the training trees to estimate the probabilities
  * for those rules.
  */
 public Grammar(List<Tree<String>> trainTrees) {
   Counter<UnaryRule> unaryRuleCounter = new Counter<UnaryRule>();
   Counter<BinaryRule> binaryRuleCounter = new Counter<BinaryRule>();
   Counter<String> symbolCounter = new Counter<String>();
   for (Tree<String> trainTree : trainTrees) {
     tallyTree(trainTree, symbolCounter, unaryRuleCounter, binaryRuleCounter);
   }
   for (UnaryRule unaryRule : unaryRuleCounter.keySet()) {
     double unaryProbability =
         unaryRuleCounter.getCount(unaryRule) / symbolCounter.getCount(unaryRule.getParent());
     unaryRule.setScore(unaryProbability);
     addUnary(unaryRule);
   }
   for (BinaryRule binaryRule : binaryRuleCounter.keySet()) {
     double binaryProbability =
         binaryRuleCounter.getCount(binaryRule) / symbolCounter.getCount(binaryRule.getParent());
     binaryRule.setScore(binaryProbability);
     addBinary(binaryRule);
   }
 }
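
Both loops apply the maximum-likelihood estimate for a rule's probability:

  P(A -> beta) = count(A -> beta) / count(A)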
Example no. 9
  private double[] runLogSpaceRegression(Counter<Integer> cntCounter) {
    SimpleRegression reg = new SimpleRegression();

    for (int cnt : cntCounter.keySet()) {
      reg.addData(cnt, Math.log(cntCounter.getCount(cnt)));
    }


    double[] coeffs = new double[] {reg.getIntercept(), reg.getSlope()};

    return coeffs;
  }
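
The regression fits the counts-of-counts curve in log space,

  log N_c ≈ a + b*c,  so  N_c ≈ exp(a + b*c)

which is exactly the fallback katzEstimate uses when a counts-of-counts bucket is empty.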
Example no. 10
 /**
  * Scores a tagging for a sentence. Note that a tag sequence not accepted by the Markov process
  * should receive a log score of Double.NEGATIVE_INFINITY.
  */
 public double scoreTagging(TaggedSentence taggedSentence) {
   double logScore = 0.0;
   List<LabeledLocalTrigramContext> labeledLocalTrigramContexts =
       extractLabeledLocalTrigramContexts(taggedSentence);
   for (LabeledLocalTrigramContext labeledLocalTrigramContext : labeledLocalTrigramContexts) {
     Counter<String> logScoreCounter =
         localTrigramScorer.getLogScoreCounter(labeledLocalTrigramContext);
     String currentTag = labeledLocalTrigramContext.getCurrentTag();
     if (logScoreCounter.containsKey(currentTag)) {
       logScore += logScoreCounter.getCount(currentTag);
     } else {
       logScore += Double.NEGATIVE_INFINITY;
     }
   }
   return logScore;
 }
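
The score of a tagging is therefore the sum of the local trigram log scores,

  log P(t_1..t_n | w_1..w_n) = sum_i logScore(t_i | t_{i-2}, t_{i-1}, sentence)

with Double.NEGATIVE_INFINITY contributed whenever the local scorer does not allow the observed tag.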
Example no. 11
 public Counter<String> getLogScoreCounter(LocalTrigramContext localTrigramContext) {
   int position = localTrigramContext.getPosition();
   String word = localTrigramContext.getWords().get(position);
   Counter<String> tagCounter = unknownWordTags;
    if (wordsToTags.containsKey(word)) {
     tagCounter = wordsToTags.getCounter(word);
   }
   Set<String> allowedFollowingTags =
       allowedFollowingTags(
           tagCounter.keySet(),
           localTrigramContext.getPreviousPreviousTag(),
           localTrigramContext.getPreviousTag());
   Counter<String> logScoreCounter = new Counter<String>();
   for (String tag : tagCounter.keySet()) {
     double logScore = Math.log(tagCounter.getCount(tag));
     if (!restrictTrigrams
         || allowedFollowingTags.isEmpty()
         || allowedFollowingTags.contains(tag)) logScoreCounter.setCount(tag, logScore);
   }
   return logScoreCounter;
 }
Example no. 12

  private Duple<CrownOperations.Reason, ISynset> getEstimatedSynonym(
      String targetLemma, Set<String> synonyms, POS pos, String gloss) {

    Counter<ISynset> synsetCounts = new ObjectCounter<ISynset>();

    List<String> lemmasInWn = new ArrayList<String>();
    for (String lemma : synonyms) {
      // Get the WordNet synsets, if any exist
      Set<ISynset> senses = WordNetUtils.getSynsets(dict, lemma, pos);
      if (senses.isEmpty()) continue;

      lemmasInWn.add(lemma);
      synsetCounts.countAll(senses);

      // Get the hypernyms of the synset and count their occurrence too
      for (ISynset synset : senses) {
        // Sanity check: avoid attaching this entry if its lemma already
        // appears anywhere near the synonyms.  The check may yield false
        // positives, since we might decline to put the lemma somewhere
        // valid (in which case it would have more than one valid
        // location), but it avoids noisy integration.
        if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, synset)) {
          return null;
        }

        for (ISynsetID hyper : synset.getRelatedSynsets(Pointer.HYPERNYM)) {
          ISynset hyperSyn = dict.getSynset(hyper);
          if (WordNetUtils.isAlreadyInWordNet(dict, targetLemma, pos, hyperSyn)) {
            return null;
          }
          synsetCounts.count(hyperSyn);
        }
      }
    }

    // Return null if we couldn't find any of the lemma's synonyms or
    // their hypernyms in WordNet
    if (synsetCounts.items().isEmpty()) return null;

    // If there was only one lemma in this list in WordNet, try comparing
    // the glosses for just that word to find a match
    if (lemmasInWn.size() == 1) {
      double maxScore = 0;
      ISynset best = null;
      String bestGloss = null;
      Set<ISynset> candidateSynonymSynsets = WordNetUtils.getSynsets(dict, lemmasInWn.get(0), pos);
      for (ISynset candidate : candidateSynonymSynsets) {

        String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate);
        double score = simFunc.compare(gloss, wnExtendedGloss);
        if (maxScore < score) {
          maxScore = score;
          best = candidate;
          bestGloss = wnExtendedGloss;
        }
      }

      CrownOperations.Reason r = new CrownOperations.Reason(getClass());
      r.set("relation_type", "synonym");
      r.set("heuristic", "single-synonym");
      r.set("max_score", maxScore);
      return new Duple<CrownOperations.Reason, ISynset>(r, best);
    } else {
      // Check whether there were ties for the maximum count
      ISynset mostFreq = synsetCounts.max();
      int mostFreqCount = synsetCounts.getCount(mostFreq);
      List<ISynset> ties = new ArrayList<ISynset>();
      for (ISynset syn : synsetCounts.items()) {
        int c = synsetCounts.getCount(syn);
        if (c == mostFreqCount) ties.add(syn);
      }

      // If there was only one synset that had the maximum count, then we
      // report this
      if (ties.size() == 1) {

        CrownOperations.Reason r = new CrownOperations.Reason(getClass());
        r.set("relation_type", "synonym");
        r.set("heuristic", "unambiguous-max");
        r.set("count", mostFreqCount);
        return new Duple<CrownOperations.Reason, ISynset>(r, mostFreq);
      }
      // Otherwise, we try breaking ties between the synsets using gloss
      // similarity
      else {

        double maxScore = 0;
        ISynset best = null;
        String bestGloss = null;
        for (ISynset candidate : ties) {
          String wnExtendedGloss = WordNetUtils.getGlossWithoutExamples(candidate);
          double score = simFunc.compare(gloss, wnExtendedGloss);
          if (maxScore < score) {
            maxScore = score;
            best = candidate;
            bestGloss = wnExtendedGloss;
          }
        }

        CrownOperations.Reason r = new CrownOperations.Reason(getClass());
        r.set("relation_type", "synonym");
        r.set("heuristic", "tied-synonyms");
        r.set("max_score", maxScore);
        return new Duple<CrownOperations.Reason, ISynset>(r, best);
      }
    }
  }
Example no. 13

  private CounterMap<String, String> trainEM(int maxIterations) {
    Set<String> englishVocab = new HashSet<String>();
    Set<String> frenchVocab = new HashSet<String>();

    CounterMap<String, String> translations = new CounterMap<String, String>();
    englishVocab.add(NULL);
    int iteration = 0;
    final double thresholdProb = 0.0001;

    // add words from each sentence pair to the vocabulary sets
    for (SentencePair sentencePair : trainingSentencePairs) {
      List<String> frenchWords = sentencePair.getFrenchWords();
      List<String> englishWords = sentencePair.getEnglishWords();
      englishVocab.addAll(englishWords);
      frenchVocab.addAll(frenchWords);
    }
    System.out.println("Vocabularies built");

    // Initialize translations.getCount(f,e) uniformly, so that
    // t(f|e) summed over all e in {E + NULL} equals 1
    final double initialCount = 1.0 / englishVocab.size();

    while (iteration < maxIterations) {
      CounterMap<String, String> counts = new CounterMap<String, String>(); // count(f|e) = 0 for all e,f
      Counter<String> totalEnglish = new Counter<String>(); // total(e) = 0 for all e

      // E-step: loop over all sentences and update expected counts
      for (SentencePair sentencePair : trainingSentencePairs) {
        List<String> frenchWords = sentencePair.getFrenchWords();
        List<String> englishWords = sentencePair.getEnglishWords();

        int numFrenchWords = frenchWords.size();
        int numEnglishWords = englishWords.size();
        Counter<String> sTotalF = new Counter<String>();

        // compute the normalization constant sTotalF
        for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
          String f = frenchWords.get(frenchPosition);
          // initialize and accumulate for English = NULL
          if (!translations.containsKey(f) && initialize)
            translations.setCount(f, NULL, initialCount);
          else if (!translations.containsKey(f))
            translations.setCount(f, NULL, thresholdProb);
          sTotalF.incrementCount(f, translations.getCount(f, NULL));
          for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            String e = englishWords.get(englishPosition);
            if (!(translations.getCounter(f)).containsKey(e) && initialize)
              translations.setCount(f, e, initialCount);
            else if (!(translations.getCounter(f)).containsKey(e))
              translations.setCount(f, e, thresholdProb);
            sTotalF.incrementCount(f, translations.getCount(f, e));
          }
        }

        // collect expected counts in counts and totalEnglish
        for (int frenchPosition = 0; frenchPosition < numFrenchWords; frenchPosition++) {
          String f = frenchWords.get(frenchPosition);

          // collect counts for English = NULL
          double count = translations.getCount(f, NULL) / sTotalF.getCount(f);
          counts.incrementCount(NULL, f, count);
          totalEnglish.incrementCount(NULL, count);
          for (int englishPosition = 0; englishPosition < numEnglishWords; englishPosition++) {
            String e = englishWords.get(englishPosition);
            count = translations.getCount(f, e) / sTotalF.getCount(f);
            counts.incrementCount(e, f, count);
            totalEnglish.incrementCount(e, count);
          }
        }
      } // end of E-step
      System.out.println("Completed E-step");

      // M-step: update translation probabilities with the counts from the E-step
      iteration++;
      for (String e : counts.keySet()) {
        double normalizer = totalEnglish.getCount(e);
        for (String f : (counts.getCounter(e)).keySet()) {
          // To speed up the implementation, update translations only when
          // count / normalizer exceeds the threshold
          double prob = counts.getCount(e, f) / normalizer;
          if (!initialize) {
            if (prob > thresholdProb)
              translations.setCount(f, e, prob);
            else
              (translations.getCounter(f)).removeKey(e);
          } else {
            translations.setCount(f, e, prob);
          }
        }
      }
      System.out.println("Completed iteration " + iteration);
    } // end of EM loop

    System.out.println("Trained!");
    return translations;
  }
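
This is the IBM Model 1 EM loop, with NULL included among the English candidates. Per French token the E-step collects the expected counts

  delta(f_j, e_i) = t(f_j | e_i) / sum_{i'} t(f_j | e_{i'})

and the M-step renormalizes them,

  t(f | e) = count(e, f) / total(e)

The thresholdProb pruning only keeps the translation table small and is not part of the model.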
Example no. 14
 public double getProb(K token) {
   if (unkTokens.contains(token) || !lm.containsKey(token)) return UNK_PROB;
   return lm.getCount(token);
 }