/**
 * Return the probability (as a real number between 0 and 1) of stopping rather than
 * generating another argument at this position.
 *
 * @param dependency The dependency used as the basis for stopping on. Tags are assumed
 *     to be in the TagProjection space.
 * @return The probability of generating a stop at this position
 */
protected double getStopProb(IntDependency dependency) {
  short binDistance = distanceBin(dependency.distance);
  IntTaggedWord unknownHead = new IntTaggedWord(-1, dependency.head.tag);
  IntTaggedWord anyHead = new IntTaggedWord(ANY_WORD_INT, dependency.head.tag);

  IntDependency temp = new IntDependency(dependency.head, stopTW, dependency.leftHeaded, binDistance);
  double c_stop_hTWds = stopCounter.getCount(temp);
  temp = new IntDependency(unknownHead, stopTW, dependency.leftHeaded, binDistance);
  double c_stop_hTds = stopCounter.getCount(temp);
  temp = new IntDependency(dependency.head, wildTW, dependency.leftHeaded, binDistance);
  double c_hTWds = stopCounter.getCount(temp);
  temp = new IntDependency(anyHead, wildTW, dependency.leftHeaded, binDistance);
  double c_hTds = stopCounter.getCount(temp);

  double p_stop_hTds = (c_hTds > 0.0 ? c_stop_hTds / c_hTds : 1.0);
  double pb_stop_hTWds = (c_stop_hTWds + smooth_stop * p_stop_hTds) / (c_hTWds + smooth_stop);

  if (verbose) {
    System.out.println("  c_stop_hTWds: " + c_stop_hTWds + "; c_hTWds: " + c_hTWds
        + "; c_stop_hTds: " + c_stop_hTds + "; c_hTds: " + c_hTds);
    System.out.println("  Generate STOP prob: " + pb_stop_hTWds);
  }
  return pb_stop_hTWds;
}
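/*
 * Minimal sketch of the smoothing used above (hypothetical counts, not from any real
 * run): the word-specific stop estimate is Bayes-smoothed toward the tag-level backoff,
 * which itself defaults to 1.0 (always stop) when the tag context is unseen. With
 * c_stop_hTWds = 3, c_hTWds = 4, c_stop_hTds = 300, c_hTds = 1000, smooth_stop = 8:
 *   p_stop_hTds   = 300 / 1000 = 0.3
 *   pb_stop_hTWds = (3 + 8 * 0.3) / (4 + 8) = 5.4 / 12 = 0.45
 */
static double smoothedStopProb(double cStopHeadWord, double cHeadWord,
    double cStopHeadTag, double cHeadTag, double smoothStop) {
  // back off to the tag-only MLE; default to 1.0 (always stop) when unseen
  double backoff = (cHeadTag > 0.0) ? cStopHeadTag / cHeadTag : 1.0;
  return (cStopHeadWord + smoothStop * backoff) / (cHeadWord + smoothStop);
}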
private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
  stream.defaultReadObject();
  // System.err.println("Before decompression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());

  ClassicCounter<IntDependency> compressedArgC = argCounter;
  argCounter = new ClassicCounter<IntDependency>();
  ClassicCounter<IntDependency> compressedStopC = stopCounter;
  stopCounter = new ClassicCounter<IntDependency>();

  for (IntDependency d : compressedArgC.keySet()) {
    double count = compressedArgC.getCount(d);
    expandArg(d, d.distance, count);
  }
  for (IntDependency d : compressedStopC.keySet()) {
    double count = compressedStopC.getCount(d);
    expandStop(d, d.distance, count, false);
  }

  // System.err.println("After decompression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
  expandDependencyMap = null;
}
/** Writes out data from this object to the PrintWriter {@code out}, one rule per line. */
@Override
public void writeData(PrintWriter out) throws IOException {
  for (IntDependency dependency : argCounter.keySet()) {
    if (dependency.head != wildTW && dependency.arg != wildTW
        && dependency.head.word != -1 && dependency.arg.word != -1) {
      double count = argCounter.getCount(dependency);
      out.println(dependency.toString(wordIndex, tagIndex) + " " + count);
    }
  }
  out.println("BEGIN_STOP");
  for (IntDependency dependency : stopCounter.keySet()) {
    if (dependency.head.word != -1) {
      double count = stopCounter.getCount(dependency);
      out.println(dependency.toString(wordIndex, tagIndex) + " " + count);
    }
  }
  out.flush();
}
/**
 * Evaluates how many words (= terminals) in a collection of trees are covered by the lexicon.
 * The first argument is the collection of trees; the second through fourth arguments collect
 * the results. Currently unused; this probably only works when training and testing in the
 * same run, so that the {@code tags} and {@code words} variables are initialized.
 */
public double evaluateCoverage(Collection<Tree> trees, Set<String> missingWords,
    Set<String> missingTags, Set<IntTaggedWord> missingTW) {
  List<IntTaggedWord> iTW1 = new ArrayList<IntTaggedWord>();
  for (Tree t : trees) {
    iTW1.addAll(treeToEvents(t));
  }
  int total = 0;
  int unseen = 0;
  for (IntTaggedWord itw : iTW1) {
    total++;
    if (!words.contains(new IntTaggedWord(itw.word(), nullTag))) {
      missingWords.add(wordIndex.get(itw.word()));
    }
    if (!tags.contains(new IntTaggedWord(nullWord, itw.tag()))) {
      missingTags.add(tagIndex.get(itw.tag()));
    }
    // if (!rules.contains(itw)) {
    if (seenCounter.getCount(itw) == 0.0) {
      unseen++;
      missingTW.add(itw);
    }
  }
  return (double) unseen / total;
}
private static <T> void display(ClassicCounter<T> c, PrintWriter pw) {
  List<T> cats = new ArrayList<>(c.keySet());
  Collections.sort(cats, Counters.toComparatorDescending(c));
  for (T ob : cats) {
    pw.println(ob + " " + c.getCount(ob));
  }
}
/**
 * Records how likely it is for a word with one tag to also have another tag.
 * This won't work after serialization/deserialization, but that is how it is
 * currently called.
 */
void buildPT_T() {
  int numTags = tagIndex.size();
  m_TT = new double[numTags][numTags];
  m_T = new double[numTags];
  double[] tmp = new double[numTags];
  for (IntTaggedWord word : words) {
    double tot = 0.0;
    for (int t = 0; t < numTags; t++) {
      IntTaggedWord iTW = new IntTaggedWord(word.word, t);
      tmp[t] = seenCounter.getCount(iTW);
      tot += tmp[t];
    }
    if (tot < 10) {
      // only use words seen often enough to give a reliable tag distribution
      continue;
    }
    for (int t = 0; t < numTags; t++) {
      for (int t2 = 0; t2 < numTags; t2++) {
        if (tmp[t2] > 0.0) {
          double c = tmp[t] / tot;
          m_T[t] += c;
          m_TT[t2][t] += c;
        }
      }
    }
  }
}
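/*
 * Minimal sketch (hypothetical helper, not in the original class): this is the ratio
 * that score() inlines when smartMutation is on. Given the m_TT / m_T arrays built
 * above, it approximates the chance that a word bearing tag `source` also bears tag
 * `target`, i.e. the tag "mutation" rate used to spread probability mass across tags.
 */
double mutationRate(int target, int source) {
  // guard against tags never accumulated in buildPT_T()
  return (m_T[source] > 0.0) ? m_TT[target][source] / m_T[source] : 0.0;
}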
private static <T> void display(ClassicCounter<T> c, int num, PrintWriter pw) {
  List<T> rules = new ArrayList<>(c.keySet());
  Collections.sort(rules, Counters.toComparatorDescending(c));
  int rSize = rules.size();
  if (num > rSize) {
    num = rSize;
  }
  for (int i = 0; i < num; i++) {
    pw.println(rules.get(i) + " " + c.getCount(rules.get(i)));
  }
}
public double countHistory(IntDependency dependency) {
  IntDependency temp = new IntDependency(
      dependency.head.word,
      tagBin(dependency.head.tag),
      wildTW.word,
      wildTW.tag,
      dependency.leftHeaded,
      valenceBin(dependency.distance));
  return argCounter.getCount(temp);
}
/**
 * Combines feature values that share the same name by summing their values,
 * returning the result sorted by feature name.
 *
 * @param <T> The type of the feature name
 */
public static <T> List<FeatureValue<T>> combine(Collection<FeatureValue<T>> featureValues) {
  ClassicCounter<T> counter = new ClassicCounter<T>();
  for (FeatureValue<T> fv : featureValues) {
    counter.incrementCount(fv.name, fv.value);
  }
  Set<T> keys = new TreeSet<T>(counter.keySet());
  List<FeatureValue<T>> featureList = new ArrayList<FeatureValue<T>>(keys.size());
  for (T key : keys) {
    featureList.add(new FeatureValue<T>(key, counter.getCount(key)));
  }
  return featureList;
}
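/*
 * Hypothetical usage sketch for combine(): duplicate feature names are summed and the
 * result comes back sorted by name (TreeSet natural ordering). The FeatureValue(name,
 * value) constructor assumed here mirrors the fields the method reads; it is an
 * assumption, not confirmed API.
 */
static void combineDemo() {
  List<FeatureValue<String>> raw = Arrays.asList(
      new FeatureValue<String>("lm", 0.5),
      new FeatureValue<String>("tm", 1.0),
      new FeatureValue<String>("lm", 0.25));
  List<FeatureValue<String>> merged = combine(raw);
  // merged now holds [lm=0.75, tm=1.0]
  System.out.println(merged);
}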
/**
 * Generate the possible taggings for a word at a sentence position. This may either be based
 * on a strict lexicon or an expanded generous set of possible taggings.
 *
 * <p><i>Implementation note:</i> Expanded sets of possible taggings are calculated dynamically
 * at runtime, so as to reduce the memory used by the lexicon (a space/time tradeoff).
 *
 * @param word The word (as an int)
 * @param loc Its index in the sentence (usually only relevant for unknown words)
 * @param featureSpec Feature specification; not used in this implementation
 * @return A list of possible taggings
 */
public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {
  // if (rulesWithWord == null) { // tested in isKnown already
  //   initRulesWithWord();
  // }
  List<IntTaggedWord> wordTaggings;
  if (isKnown(word)) {
    if (!flexiTag) {
      // strict lexical tagging for seen items
      wordTaggings = rulesWithWord[word];
    } else {
      /* Allow all tags with the same basicCategory */
      /* Allow all scored taggings, unless the word is very common */
      IntTaggedWord iW = new IntTaggedWord(word, nullTag);
      if (seenCounter.getCount(iW) > smoothInUnknownsThreshold) {
        return rulesWithWord[word].iterator();
      } else {
        // give it flexible tagging, not just the lexicon's
        wordTaggings = new ArrayList<IntTaggedWord>(40);
        for (IntTaggedWord iTW2 : tags) {
          IntTaggedWord iTW = new IntTaggedWord(word, iTW2.tag);
          if (score(iTW, loc, wordIndex.get(word)) > Float.NEGATIVE_INFINITY) {
            wordTaggings.add(iTW);
          }
        }
      }
    }
  } else {
    // we copy the list so we can insert the correct word in each item
    wordTaggings = new ArrayList<IntTaggedWord>(40);
    for (IntTaggedWord iTW : rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)]) {
      wordTaggings.add(new IntTaggedWord(word, iTW.tag));
    }
  }
  if (DEBUG_LEXICON) {
    EncodingPrintWriter.err.println(
        "Lexicon: " + wordIndex.get(word)
            + " (" + (isKnown(word) ? "known" : "unknown")
            + ", loc=" + loc
            + ", n=" + (isKnown(word) ? word : wordIndex.indexOf(UNKNOWN_WORD)) + ") "
            + (flexiTag ? "flexi" : "lexicon")
            + " taggings: " + wordTaggings,
        "UTF-8");
  }
  return wordTaggings.iterator();
}
private void writeObject(ObjectOutputStream stream) throws IOException {
  // System.err.println("\nBefore compression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());

  ClassicCounter<IntDependency> fullArgCounter = argCounter;
  argCounter = new ClassicCounter<IntDependency>();
  for (IntDependency dependency : fullArgCounter.keySet()) {
    if (dependency.head != wildTW && dependency.arg != wildTW
        && dependency.head.word != -1 && dependency.arg.word != -1) {
      argCounter.incrementCount(dependency, fullArgCounter.getCount(dependency));
    }
  }

  ClassicCounter<IntDependency> fullStopCounter = stopCounter;
  stopCounter = new ClassicCounter<IntDependency>();
  for (IntDependency dependency : fullStopCounter.keySet()) {
    if (dependency.head.word != -1) {
      stopCounter.incrementCount(dependency, fullStopCounter.getCount(dependency));
    }
  }

  // System.err.println("After compression:");
  // System.err.println("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
  // System.err.println("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());

  stream.defaultWriteObject();

  argCounter = fullArgCounter;
  stopCounter = fullStopCounter;
}
/**
 * Writes out data from this object to the Writer {@code w}. Entries are written one per
 * line, with fields separated by spaces.
 */
public void writeData(Writer w) throws IOException {
  PrintWriter out = new PrintWriter(w);
  for (IntTaggedWord itw : seenCounter.keySet()) {
    out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " SEEN " + seenCounter.getCount(itw));
  }
  for (IntTaggedWord itw : getUnknownWordModel().unSeenCounter().keySet()) {
    out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " UNSEEN "
        + getUnknownWordModel().unSeenCounter().getCount(itw));
  }
  for (int i = 0; i < smooth.length; i++) {
    out.println("smooth[" + i + "] = " + smooth[i]);
  }
  out.flush();
}
@Override
public DependencyGrammar formResult() {
  wordIndex.indexOf(Lexicon.UNKNOWN_WORD, true);
  MLEDependencyGrammar dg = new MLEDependencyGrammar(
      tlpParams, directional, useDistance, useCoarseDistance,
      basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
  for (IntDependency dependency : dependencyCounter.keySet()) {
    dg.addRule(dependency, dependencyCounter.getCount(dependency));
  }
  return dg;
}
/** Trains this unknown word model (UWM) on a single TaggedWord. */
public void train(TaggedWord tw, int loc, double weight) {
  IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
  IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
  IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
  seenCounter.incrementCount(iW, weight);
  IntTaggedWord i = NULL_ITW;

  if (treesRead > indexToStartUnkCounting) {
    // start doing this once some way through the trees; treesRead is 1-based counting
    if (seenCounter.getCount(iW) < 1.5) {
      // it's an entirely unknown word (this is its first occurrence)
      int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word));
      if (DOCUMENT_UNKNOWNS) {
        String wStr = wordIndex.get(iTW.word);
        String tStr = tagIndex.get(iTW.tag);
        String sStr = wordIndex.get(s);
        EncodingPrintWriter.err.println(
            "Unknown word/tag/sig:\t" + wStr + '\t' + tStr + '\t' + sStr, "UTF-8");
      }
      IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
      IntTaggedWord iS = new IntTaggedWord(s, nullTag);
      unSeenCounter.incrementCount(iTS, weight);
      unSeenCounter.incrementCount(iT, weight);
      unSeenCounter.incrementCount(iS, weight);
      unSeenCounter.incrementCount(i, weight);
      // rules.add(iTS);
      // sigs.add(iS);
    }
    // else if (seenCounter.getCount(iTW) < 2) {
    //   it's a new tag for a known word; do nothing for now
    // }
  }
}
/**
 * Checks whether a word is in the lexicon. This version works even while compiling the
 * lexicon with current counters (rather than using the compiled rulesWithWord array).
 *
 * <p>TODO: The previous version would insert rules into the wordNumberer. Is that the
 * desired behavior? Why not test in some way that doesn't affect the index? For example,
 * start by testing wordIndex.contains(word).
 *
 * @param word The word as a String
 * @return Whether the word is in the lexicon
 */
public boolean isKnown(String word) {
  if (!wordIndex.contains(word)) {
    return false;
  }
  IntTaggedWord iW = new IntTaggedWord(wordIndex.indexOf(word), nullTag);
  return seenCounter.getCount(iW) > 0.0;
}
/**
 * Get the score of this word with this tag (as an IntTaggedWord) at this location.
 * (Presumably an estimate of P(word | tag).)
 *
 * <p><i>Implementation documentation:</i>
 * Seen:
 *   c_W = count(W)
 *   c_TW = count(T,W)
 *   c_T = count(T)
 *   c_Tunseen = count(T) among new words in 2nd half
 *   total = count(seen words)
 *   totalUnseen = count("unseen" words)
 *   p_T_U = Pmle(T|"unseen")
 *   pb_T_W = P(T|W). If (c_W > smoothInUnknownsThreshold), = c_TW/c_W;
 *            else (if not smart mutation) pb_T_W = Bayes prior smooth[1] with p_T_U
 *   p_T = Pmle(T)
 *   p_W = Pmle(W)
 *   pb_W_T = log(pb_T_W * p_W / p_T) [Bayes rule]
 * Note that this doesn't really properly reserve mass to unknowns.
 *
 * <p>Unseen:
 *   c_TS = count(T,Sig|Unseen)
 *   c_S = count(Sig)
 *   c_T = count(T|Unseen)
 *   c_U = totalUnseen above
 *   p_T_U = Pmle(T|Unseen)
 *   pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]]
 *   pb_W_T = log(P(W|T)) inverted
 *
 * @param iTW An IntTaggedWord pairing a word and POS tag
 * @param loc The position in the sentence. <i>In the default implementation this is used
 *     only for unknown words to change their probability distribution when sentence initial.</i>
 * @return A float score, usually log P(word|tag)
 */
public float score(IntTaggedWord iTW, int loc, String word) {
  // both actual
  double c_TW = seenCounter.getCount(iTW);
  // double x_TW = xferCounter.getCount(iTW);

  IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
  // word counts
  double c_W = seenCounter.getCount(temp);
  // double x_W = xferCounter.getCount(temp);

  // totals
  double total = seenCounter.getCount(NULL_ITW);
  double totalUnseen = uwModel.unSeenCounter().getCount(NULL_ITW);

  temp = new IntTaggedWord(nullWord, iTW.tag);
  // tag counts
  double c_T = seenCounter.getCount(temp);
  double c_Tunseen = uwModel.unSeenCounter().getCount(temp);

  double pb_W_T; // always set below

  if (DEBUG_LEXICON) {
    // dump info about the last word
    if (iTW.word != debugLastWord) {
      if (debugLastWord >= 0 && debugPrefix != null) {
        // the 2nd conjunct in the test above handles older serialized files
        EncodingPrintWriter.err.println(debugPrefix + debugProbs + debugNoProbs, "UTF-8");
      }
    }
  }

  boolean seen = (c_W > 0.0);

  if (seen) {
    // known word model for P(T|W)
    if (DEBUG_LEXICON_SCORE) {
      System.err.println("Lexicon.score " + wordIndex.get(iTW.word) + "/"
          + tagIndex.get(iTW.tag) + " as known word.");
    }

    // c_TW = Math.sqrt(c_TW); [cdm: funny math scaling? dunno who played with this]
    // c_TW += 0.5;

    double p_T_U;
    if (useSignatureForKnownSmoothing) { // only works for English currently
      p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
      if (DEBUG_LEXICON_SCORE) {
        System.err.println("With useSignatureForKnownSmoothing, P(T|U) is " + p_T_U
            + " rather than " + (c_Tunseen / totalUnseen));
      }
    } else {
      p_T_U = c_Tunseen / totalUnseen;
    }
    double pb_T_W; // always set below

    if (DEBUG_LEXICON_SCORE) {
      System.err.println("c_W is " + c_W + " mle = " + (c_TW / c_W)
          + " smoothInUnknownsThresh is " + smoothInUnknownsThreshold
          + " base p_T_U is " + c_Tunseen + "/" + totalUnseen + " = " + p_T_U);
    }
    if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0) {
      // we've seen the word enough times to have confidence in its tagging
      pb_T_W = c_TW / c_W;
    } else {
      // we haven't seen the word enough times to have confidence in its tagging
      if (smartMutation) {
        int numTags = tagIndex.size();
        if (m_TT == null || numTags != m_T.length) {
          buildPT_T();
        }
        p_T_U *= 0.1;
        // System.out.println("Checking " + iTW);
        for (int t = 0; t < numTags; t++) {
          IntTaggedWord iTW2 = new IntTaggedWord(iTW.word, t);
          double p_T_W2 = seenCounter.getCount(iTW2) / c_W;
          if (p_T_W2 > 0) {
            // System.out.println(" Observation of " + tagIndex.get(t) + " ("
            //     + seenCounter.getCount(iTW2) + ") mutated to "
            //     + tagIndex.get(iTW.tag) + " at rate " + (m_TT[tag][t] / m_T[t]));
            p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
          }
        }
      }
      if (DEBUG_LEXICON_SCORE) {
        System.err.println("c_TW = " + c_TW + " c_W = " + c_W + " p_T_U = " + p_T_U);
      }
      // double pb_T_W = (c_TW + smooth[1] * x_TW) / (c_W + smooth[1] * x_W);
      pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
    }
    double p_T = (c_T / total);
    double p_W = (c_W / total);
    pb_W_T = Math.log(pb_T_W * p_W / p_T);

    if (DEBUG_LEXICON) {
      if (iTW.word != debugLastWord) {
        debugLastWord = iTW.word;
        debugLoc = loc;
        debugProbs = new StringBuilder();
        debugNoProbs = new StringBuilder("impossible: ");
        debugPrefix = "Lexicon: " + wordIndex.get(debugLastWord) + " (known): ";
      }
      if (pb_W_T > Double.NEGATIVE_INFINITY) {
        NumberFormat nf = NumberFormat.getNumberInstance();
        nf.setMaximumFractionDigits(3);
        debugProbs.append(tagIndex.get(iTW.tag) + ": cTW=" + c_TW + " c_T=" + c_T
            + " pb_T_W=" + nf.format(pb_T_W) + " log pb_W_T=" + nf.format(pb_W_T) + ", ");
        // debugProbs.append("\n" + "smartMutation=" + smartMutation
        //     + " smoothInUnknownsThreshold=" + smoothInUnknownsThreshold
        //     + " smooth0=" + smooth[0] + " smooth1=" + smooth[1]
        //     + " p_T_U=" + p_T_U + " c_W=" + c_W);
      } else {
        debugNoProbs.append(tagIndex.get(iTW.tag)).append(' ');
      }
    } // end if (DEBUG_LEXICON)

  } else { // when unseen
    if (loc >= 0) {
      pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0], word);
    } else {
      // for negative loc we now do a weighted average for the dependency grammar :-)
      double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0], word);
      double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0], word);
      pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T)) / 3);
    }
  }

  // categorical cutoff if the score is too low
  if (pb_W_T > -100.0) {
    return (float) pb_W_T;
  }
  return Float.NEGATIVE_INFINITY;
} // end score()
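/*
 * Minimal sketch (hypothetical counts, not from any real run) of the Bayes inversion
 * used above for seen words: P(W|T) = P(T|W) * P(W) / P(T). With c_TW = 8, c_W = 10,
 * c_T = 100, total = 1000, and c_W above the smoothing threshold:
 *   pb_T_W = 8 / 10 = 0.8
 *   p_W    = 10 / 1000 = 0.01
 *   p_T    = 100 / 1000 = 0.1
 *   pb_W_T = log(0.8 * 0.01 / 0.1) = log(0.08) ~= -2.526
 */
static double bayesInvert(double cTW, double cW, double cT, double total) {
  double pTW = cTW / cW;          // P(T|W), MLE when the word is frequent enough
  double pW = cW / total;         // P(W)
  double pT = cT / total;         // P(T)
  return Math.log(pTW * pW / pT); // log P(W|T) by Bayes' rule
}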
/**
 * Calculate the probability of a dependency as a real probability between 0 and 1 inclusive.
 *
 * @param dependency The dependency for which the probability is to be calculated. The tags in
 *     this dependency are in the reduced TagProjection space.
 * @return The probability of the dependency
 */
protected double probTB(IntDependency dependency) {
  if (verbose) {
    // System.out.println("tagIndex: " + tagIndex);
    System.err.println("Generating " + dependency);
  }

  boolean leftHeaded = dependency.leftHeaded && directional;
  int hW = dependency.head.word;
  int aW = dependency.arg.word;
  short hT = dependency.head.tag;
  short aT = dependency.arg.tag;

  IntTaggedWord aTW = dependency.arg;
  IntTaggedWord hTW = dependency.head;

  boolean isRoot = rootTW(dependency.head);
  double pb_stop_hTWds;
  if (isRoot) {
    pb_stop_hTWds = 0.0;
  } else {
    pb_stop_hTWds = getStopProb(dependency);
  }

  if (dependency.arg.word == STOP_WORD_INT) {
    // did we generate stop?
    return pb_stop_hTWds;
  }

  double pb_go_hTWds = 1.0 - pb_stop_hTWds;

  // generate the argument
  short binDistance = valenceBin(dependency.distance);

  // KEY:
  // c_  count of (read as joint count of first and second)
  // p_  MLE prob of (or MAP if useSmoothTagProjection)
  // pb_ MAP prob of (read as prob of first given second thing)
  // a   arg
  // h   head
  // T   tag
  // PT  projected tag
  // W   word
  // d   direction
  // ds  distance (implicit: there when direction is mentioned!)

  IntTaggedWord anyHead = new IntTaggedWord(ANY_WORD_INT, dependency.head.tag);
  IntTaggedWord anyArg = new IntTaggedWord(ANY_WORD_INT, dependency.arg.tag);
  IntTaggedWord anyTagArg = new IntTaggedWord(dependency.arg.word, ANY_TAG_INT);

  IntDependency temp = new IntDependency(dependency.head, dependency.arg, leftHeaded, binDistance);
  double c_aTW_hTWd = argCounter.getCount(temp);
  temp = new IntDependency(dependency.head, anyArg, leftHeaded, binDistance);
  double c_aT_hTWd = argCounter.getCount(temp);
  temp = new IntDependency(dependency.head, wildTW, leftHeaded, binDistance);
  double c_hTWd = argCounter.getCount(temp);

  temp = new IntDependency(anyHead, dependency.arg, leftHeaded, binDistance);
  double c_aTW_hTd = argCounter.getCount(temp);
  temp = new IntDependency(anyHead, anyArg, leftHeaded, binDistance);
  double c_aT_hTd = argCounter.getCount(temp);
  temp = new IntDependency(anyHead, wildTW, leftHeaded, binDistance);
  double c_hTd = argCounter.getCount(temp);

  // for smooth tag projection
  short aPT = Short.MIN_VALUE;
  double c_aPTW_hPTd = Double.NaN;
  double c_aPT_hPTd = Double.NaN;
  double c_hPTd = Double.NaN;
  double c_aPTW_aPT = Double.NaN;
  double c_aPT = Double.NaN;

  if (useSmoothTagProjection) {
    aPT = tagProject(dependency.arg.tag);
    short hPT = tagProject(dependency.head.tag);

    IntTaggedWord projectedArg = new IntTaggedWord(dependency.arg.word, aPT);
    IntTaggedWord projectedAnyHead = new IntTaggedWord(ANY_WORD_INT, hPT);
    IntTaggedWord projectedAnyArg = new IntTaggedWord(ANY_WORD_INT, aPT);

    temp = new IntDependency(projectedAnyHead, projectedArg, leftHeaded, binDistance);
    c_aPTW_hPTd = argCounter.getCount(temp);
    temp = new IntDependency(projectedAnyHead, projectedAnyArg, leftHeaded, binDistance);
    c_aPT_hPTd = argCounter.getCount(temp);
    temp = new IntDependency(projectedAnyHead, wildTW, leftHeaded, binDistance);
    c_hPTd = argCounter.getCount(temp);

    temp = new IntDependency(wildTW, projectedArg, false, ANY_DISTANCE_INT);
    c_aPTW_aPT = argCounter.getCount(temp);
    temp = new IntDependency(wildTW, projectedAnyArg, false, ANY_DISTANCE_INT);
    c_aPT = argCounter.getCount(temp);
  }

  // a wild head is always directionless and doesn't use distance
  temp = new IntDependency(wildTW, dependency.arg, false, ANY_DISTANCE_INT);
  double c_aTW = argCounter.getCount(temp);
  temp = new IntDependency(wildTW, anyArg, false, ANY_DISTANCE_INT);
  double c_aT = argCounter.getCount(temp);
  temp = new IntDependency(wildTW, anyTagArg, false, ANY_DISTANCE_INT);
  double c_aW = argCounter.getCount(temp);

  // do the Bayesian magic
  // MLE probs
  double p_aTW_hTd;
  double p_aT_hTd;
  double p_aTW_aT;
  double p_aW;
  double p_aPTW_aPT;
  double p_aPTW_hPTd;
  double p_aPT_hPTd;

  // the backoffs are either MLE or themselves Bayesian-smoothed, depending on useSmoothTagProjection
  if (useSmoothTagProjection) {
    if (useUnigramWordSmoothing) {
      p_aW = c_aW > 0.0 ? (c_aW / numWordTokens) : 1.0; // NEED this 1.0 for unknown words!!!
      p_aPTW_aPT = (c_aPTW_aPT + smooth_aPTW_aPT * p_aW) / (c_aPT + smooth_aPTW_aPT);
    } else {
      p_aPTW_aPT = c_aPTW_aPT > 0.0 ? (c_aPTW_aPT / c_aPT) : 1.0; // NEED this 1.0 for unknown words!!!
    }
    p_aTW_aT = (c_aTW + smooth_aTW_aT * p_aPTW_aPT) / (c_aT + smooth_aTW_aT);

    p_aPTW_hPTd = c_hPTd > 0.0 ? (c_aPTW_hPTd / c_hPTd) : 0.0;
    p_aTW_hTd = (c_aTW_hTd + smooth_aTW_hTd * p_aPTW_hPTd) / (c_hTd + smooth_aTW_hTd);

    p_aPT_hPTd = c_hPTd > 0.0 ? (c_aPT_hPTd / c_hPTd) : 0.0;
    p_aT_hTd = (c_aT_hTd + smooth_aT_hTd * p_aPT_hPTd) / (c_hTd + smooth_aT_hTd);
  } else {
    // here word generation isn't smoothed - can't get a previously unseen word with a tag. Ugh.
    if (op.testOptions.useLexiconToScoreDependencyPwGt) {
      // We don't know the position. Now -1 means average over 0 and 1.
      p_aTW_aT = dependency.leftHeaded
          ? Math.exp(lex.score(dependency.arg, 1, wordIndex.get(dependency.arg.word)))
          : Math.exp(lex.score(dependency.arg, -1, wordIndex.get(dependency.arg.word)));
      // double oldScore = c_aTW > 0.0 ? (c_aTW / c_aT) : 1.0;
      // if (oldScore == 1.0) {
      //   System.err.println("#### arg=" + dependency.arg + " score=" + p_aTW_aT
      //       + " oldScore=" + oldScore + " c_aTW=" + c_aTW + " c_aW=" + c_aW);
      // }
    } else {
      p_aTW_aT = c_aTW > 0.0 ? (c_aTW / c_aT) : 1.0;
    }

    p_aTW_hTd = c_hTd > 0.0 ? (c_aTW_hTd / c_hTd) : 0.0;
    p_aT_hTd = c_hTd > 0.0 ? (c_aT_hTd / c_hTd) : 0.0;
  }

  double pb_aTW_hTWd = (c_aTW_hTWd + smooth_aTW_hTWd * p_aTW_hTd) / (c_hTWd + smooth_aTW_hTWd);
  double pb_aT_hTWd = (c_aT_hTWd + smooth_aT_hTWd * p_aT_hTd) / (c_hTWd + smooth_aT_hTWd);

  double score = (interp * pb_aTW_hTWd + (1.0 - interp) * p_aTW_aT * pb_aT_hTWd) * pb_go_hTWds;

  if (verbose) {
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(2);
    if (useSmoothTagProjection) {
      if (useUnigramWordSmoothing) {
        System.err.println("  c_aW=" + c_aW + ", numWordTokens=" + numWordTokens
            + ", p(aW)=" + nf.format(p_aW));
      }
      System.err.println("  c_aPTW_aPT=" + c_aPTW_aPT + ", c_aPT=" + c_aPT
          + ", smooth_aPTW_aPT=" + smooth_aPTW_aPT + ", p(aPTW|aPT)=" + nf.format(p_aPTW_aPT));
    }
    System.err.println("  c_aTW=" + c_aTW + ", c_aT=" + c_aT
        + ", smooth_aTW_aT=" + smooth_aTW_aT + ", ## p(aTW|aT)=" + nf.format(p_aTW_aT));
    if (useSmoothTagProjection) {
      System.err.println("  c_aPTW_hPTd=" + c_aPTW_hPTd + ", c_hPTd=" + c_hPTd
          + ", p(aPTW|hPTd)=" + nf.format(p_aPTW_hPTd));
    }
    System.err.println("  c_aTW_hTd=" + c_aTW_hTd + ", c_hTd=" + c_hTd
        + ", smooth_aTW_hTd=" + smooth_aTW_hTd + ", p(aTW|hTd)=" + nf.format(p_aTW_hTd));
    if (useSmoothTagProjection) {
      System.err.println("  c_aPT_hPTd=" + c_aPT_hPTd + ", c_hPTd=" + c_hPTd
          + ", p(aPT|hPTd)=" + nf.format(p_aPT_hPTd));
    }
    System.err.println("  c_aT_hTd=" + c_aT_hTd + ", c_hTd=" + c_hTd
        + ", smooth_aT_hTd=" + smooth_aT_hTd + ", p(aT|hTd)=" + nf.format(p_aT_hTd));
    System.err.println("  c_aTW_hTWd=" + c_aTW_hTWd + ", c_hTWd=" + c_hTWd
        + ", smooth_aTW_hTWd=" + smooth_aTW_hTWd + ", ## p(aTW|hTWd)=" + nf.format(pb_aTW_hTWd));
    System.err.println("  c_aT_hTWd=" + c_aT_hTWd + ", c_hTWd=" + c_hTWd
        + ", smooth_aT_hTWd=" + smooth_aT_hTWd + ", ## p(aT|hTWd)=" + nf.format(pb_aT_hTWd));
    System.err.println("  interp=" + interp + ", prescore="
        + nf.format(interp * pb_aTW_hTWd + (1.0 - interp) * p_aTW_aT * pb_aT_hTWd)
        + ", P(go|hTWds)=" + nf.format(pb_go_hTWds) + ", score=" + nf.format(score));
  }

  if (op.testOptions.prunePunc && pruneTW(aTW)) {
    return 1.0;
  }

  if (Double.isNaN(score)) {
    score = 0.0;
  }
  // if (op.testOptions.rightBonus && !dependency.leftHeaded) {
  //   score -= 0.2;
  // }

  if (score < MIN_PROBABILITY) {
    score = 0.0;
  }

  return score;
}
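/*
 * Minimal sketch of the final interpolation in probTB (hypothetical values, not a
 * definitive restatement): the dependency probability mixes the fully lexicalized
 * estimate P(aTW|hTWd) with the factored backoff P(aTW|aT) * P(aT|hTWd), then
 * multiplies by the probability of not stopping. With interp = 0.33,
 * pb_aTW_hTWd = 0.02, p_aTW_aT = 0.1, pb_aT_hTWd = 0.15, pb_go_hTWds = 0.55:
 *   score = (0.33 * 0.02 + 0.67 * 0.1 * 0.15) * 0.55 ~= 0.0092
 */
static double interpolatedDependencyScore(double interp, double pbFull,
    double pWordGivenTag, double pbTagBackoff, double pGo) {
  return (interp * pbFull + (1.0 - interp) * pWordGivenTag * pbTagBackoff) * pGo;
}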