Ejemplo n.º 1
0
  /**
   * Collects base words that can be inferred from the unmodeled words carrying
   * the second affix of the given transform.
   *
   * <p>For every such word a base is hypothesized. A hypothesized base that is
   * missing from the lexicon is only turned into a new {@code Word} if it is
   * already recorded in {@code inferredBases}; the first time a base is
   * hypothesized it is merely recorded there and nothing is returned for it.
   *
   * @param lex the lexicon used to check whether a hypothesized base exists
   * @param transform the transform whose second affix selects the candidate words
   * @return the newly created base words; empty if none qualified
   */
  private Collection<Word> inferBases(Lexicon lex, Transform transform) {
    Set<Word> promoted = new THashSet<Word>();

    for (Word candidate : transform.getAffix2().getWordSet()) {
      // Only words that are still unmodeled take part in inference
      if (candidate.getSet() != WordSet.UNMODELED) {
        continue;
      }

      String hypothesizedBase = Transform.inferBase(candidate, transform);

      // A base the lexicon already knows needs no inference
      if (lex.getWord(hypothesizedBase) != null) {
        continue;
      }

      if (!inferredBases.contains(hypothesizedBase)) {
        // First sighting: just remember it for a later encounter
        inferredBases.add(hypothesizedBase);
      } else {
        // Seen before: promote it, taking the count from the word that
        // triggered the promotion
        promoted.add(new Word(hypothesizedBase, candidate.getCount(), false));
      }
    }

    return promoted;
  }
Ejemplo n.º 2
0
  /**
   * Runs base inference for the most recently learned transform and integrates
   * the resulting bases into the lexicon.
   *
   * <p>In order, this method: infers bases from the last element of
   * {@code learnedTransforms}; adds each one to the lexicon and moves it to
   * {@code WordSet.BASE}; scores it against every learned transform whose first
   * affix it carries, counting successful scores as new pairs; updates lexicon
   * frequencies if any base was added; moves transform pairs for every learned
   * transform if any pair was found; optionally scores the new bases against the
   * hypothesized transforms; and finally reports the results.
   *
   * @param lex the lexicon
   * @param learnedTransforms the learned transforms; the last element is used as
   *     the newest transform
   * @param hypTransforms the hypothesized transforms
   * @param reEval as used by scoreWord
   * @param doubling as used by scoreWord
   * @param optimization as used by moveTransformPairs; when true the new bases
   *     are also scored against the hypothesized transforms
   * @param out the destination for log output, or {@code null} to skip logging
   *     the new bases (the pair count is always printed to standard output)
   */
  public void conservInference(
      Lexicon lex,
      List<Transform> learnedTransforms,
      List<Transform> hypTransforms,
      boolean reEval,
      boolean doubling,
      boolean optimization,
      PrintWriter out) {
    // The newest transform is the last one in the learned list
    int lastIndex = learnedTransforms.size() - 1;
    Transform latest = learnedTransforms.get(lastIndex);
    Collection<Word> inferred = inferBases(lex, latest);

    int pairsFound = 0;
    for (Word base : inferred) {
      // Register the base with the lexicon before scoring against it
      lex.addWord(base);
      lex.moveWord(base, WordSet.BASE);

      // Count every learned transform that applies to, and scores, this base
      for (Transform learned : learnedTransforms) {
        if (!base.hasAffix(learned.getAffix1())) {
          continue;
        }
        if (Transform.scoreWord(learned, base, lex, reEval, doubling)) {
          pairsFound++;
        }
      }
    }

    // Frequencies only need refreshing when the lexicon actually grew
    if (!inferred.isEmpty()) {
      lex.updateFrequencies();
    }

    // With at least one new pair, move the words for each learned transform
    if (pairsFound > 0) {
      for (Transform learned : learnedTransforms) {
        lex.moveTransformPairs(learned, hypTransforms, optimization, reEval, doubling);
      }
    }

    // Under optimization, also score the new bases against every hypothesized
    // transform except the one that was just learned
    if (optimization) {
      for (Word base : inferred) {
        for (Transform hypothesized : hypTransforms) {
          if (hypothesized == latest) {
            continue;
          }
          if (base.hasAffix(hypothesized.getAffix1())) {
            Transform.scoreWord(hypothesized, base, lex, reEval, doubling);
          }
        }
      }
    }

    // Log each new base when a log destination was supplied
    if (out != null) {
      for (Word base : inferred) {
        out.println(base.toDerivedWordsString());
      }
    }

    // Report the number of new pairs
    System.out.println(pairsFound + " new pairs inferred by conservative inference.");
  }