Example #1
 /**
  * Reads parse trees either from standard input or a specified file, converting them to sentences
  * and printing those sentences on standard output.
  *
  * <pre>
  * usage: [-v|-help|-usage] [-tags] [filename]
  *         -v|-help|-usage: prints out this message
  *         -tags: indicates to spit out one S-expression per word, of the form
  *                 (word (tag))
  *         filename is the file to be processed (standard input is assumed if
  *                 this argument is "-" or is not present)
  * </pre>
  */
 public static void main(String[] args) {
   InputStream inStream = System.in;
   boolean tags = false;
   String inFile = null;
   for (int i = 0; i < args.length; i++) {
     if (args[i].equals("-help") || args[i].equals("-usage") || args[i].equals("-v")) {
       usage();
       return;
     } else if (args[i].equals("-tags")) tags = true;
     else if (!args[i].equals("-")) inFile = args[i];
   }
   if (inFile != null) {
     try {
       inStream = new FileInputStream(inFile);
     } catch (FileNotFoundException fnfe) {
       System.err.println(fnfe);
       System.exit(-1);
     }
   }
   try {
     SexpTokenizer tok = new SexpTokenizer(inStream, Language.encoding(), bufSize);
     OutputStream os = System.out;
     Writer writer = new BufferedWriter(new OutputStreamWriter(os, Language.encoding()));
     PrintWriter pw = new PrintWriter(writer);
     Sexp curr = null;
     while ((curr = Sexp.read(tok)) != null)
       pw.println(tags ? Util.collectTaggedWords(curr) : Util.collectLeaves(curr));
     pw.flush();
     pw.close();
    } catch (Exception e) {
      // Report errors on stderr so they do not corrupt the sentence output
      // being written to stdout.
      System.err.println(e);
    }
 }
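
For illustration, here is a minimal, self-contained sketch of the same conversion run on an in-memory tree instead of a file. The class name TreeToSentenceDemo and the 8192 buffer size are hypothetical stand-ins (the original uses a bufSize field), the imports for Sexp, SexpTokenizer, and Util are assumed to come from this codebase, and the expected outputs in the comments follow the (word (tag)) format described in the Javadoc above.

import java.io.ByteArrayInputStream;
import java.io.InputStream;

// Assumes Sexp, SexpTokenizer, and Util are importable from this codebase.
public class TreeToSentenceDemo {
  public static void main(String[] args) throws Exception {
    // A single parse tree as an S-expression string.
    String tree = "(S (NP (DT the) (NN dog)) (VP (VBZ barks)))";
    InputStream in = new ByteArrayInputStream(tree.getBytes("UTF-8"));
    // Same constructor shape as in the example above; 8192 is an assumed buffer size.
    SexpTokenizer tok = new SexpTokenizer(in, "UTF-8", 8192);
    Sexp curr;
    while ((curr = Sexp.read(tok)) != null) {
      System.out.println(Util.collectLeaves(curr));      // expected: (the dog barks)
      System.out.println(Util.collectTaggedWords(curr)); // expected: ((the (DT)) (dog (NN)) (barks (VBZ)))
    }
  }
}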
Example #2

  public Event getHistory(TrainerEvent trainerEvent, int backOffLevel) {
    ModifierEvent modEvent = (ModifierEvent) trainerEvent;

    if (Language.treebank().isBaseNP(modEvent.parent()))
      return getBaseNPHistory(modEvent, backOffLevel);

    Symbol side = Constants.sideToSym(modEvent.side());

    MutableEvent hist = historiesWithSubcats[backOffLevel];

    hist.clear();
    Symbol verbInterveningSym = Constants.booleanToSym(modEvent.verbIntervening());
    Symbol mappedPrevModSym = NTMapper.map(modEvent.previousMods().symbolAt(0));
    Symbol parent = Language.training().removeArgAugmentation(modEvent.parent());

    switch (backOffLevel) {
      case 0:
        // for p(M(t)_i | P, H, w, t, verbIntervening, map(M_i-1), subcat, side)
        hist.add(0, parent);
        hist.add(0, Language.training().removeGapAugmentation(modEvent.head()));
        hist.add(0, modEvent.headWord().word());
        hist.add(0, modEvent.headWord().tag());
        hist.add(0, verbInterveningSym);
        hist.add(0, mappedPrevModSym);
        hist.add(1, modEvent.subcat());
        hist.add(0, side);
        break;
      case 1:
        // for p(M(t)_i | P, H, t, verbIntervening, map(M_i-1), subcat, side)
        hist.add(0, parent);
        hist.add(0, Language.training().removeGapAugmentation(modEvent.head()));
        hist.add(0, modEvent.headWord().tag());
        hist.add(0, verbInterveningSym);
        hist.add(0, mappedPrevModSym);
        hist.add(1, modEvent.subcat());
        hist.add(0, side);
        break;
      case 2:
        // for p(M(t)_i | P, H, verbIntervening, map(M_i-1), subcat, side)
        hist.add(0, parent);
        hist.add(0, Language.training().removeGapAugmentation(modEvent.head()));
        hist.add(0, verbInterveningSym);
        hist.add(0, mappedPrevModSym);
        hist.add(1, modEvent.subcat());
        hist.add(0, side);
        break;
      case 3:
        // for p(M(t)_i | P, subcat)
        hist.add(0, parent);
        hist.add(1, modEvent.subcat());
        break;
    }
    return hist;
  }
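
Spelled out, the four history contexts assembled by the switch above form a nested back-off chain, with each level dropping conditioning information from the previous one:

\begin{align*}
\text{level 0:}\quad & p\bigl(M(t)_i \mid P,\, H,\, w,\, t,\, \mathit{verbIntervening},\, \mathrm{map}(M_{i-1}),\, \mathit{subcat},\, \mathit{side}\bigr)\\
\text{level 1:}\quad & p\bigl(M(t)_i \mid P,\, H,\, t,\, \mathit{verbIntervening},\, \mathrm{map}(M_{i-1}),\, \mathit{subcat},\, \mathit{side}\bigr)\\
\text{level 2:}\quad & p\bigl(M(t)_i \mid P,\, H,\, \mathit{verbIntervening},\, \mathrm{map}(M_{i-1}),\, \mathit{subcat},\, \mathit{side}\bigr)\\
\text{level 3:}\quad & p\bigl(M(t)_i \mid P,\, \mathit{subcat}\bigr)
\end{align*}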
Example #3
  private Event getBaseNPHistory(ModifierEvent modEvent, int backOffLevel) {
    MutableEvent hist = histories[backOffLevel];

    Symbol side = Constants.sideToSym(modEvent.side());

    // If no modifier has been generated yet (the previous modifier is the
    // start symbol), fall back to the head label and head word.
    Symbol prevModLabel =
        (modEvent.previousMods().get(0) == startSym
            ? modEvent.head()
            : modEvent.previousMods().symbolAt(0));
    Word prevModWord =
        (modEvent.previousWords().getWord(0).equals(startWord)
            ? modEvent.headWord()
            : modEvent.previousWords().getWord(0));
    hist.clear();
    switch (backOffLevel) {
      case 0:
        // for p(t_i | M_i, P, M(w,t)_i-1, side)
        hist.add(modEvent.modifier());
        hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
        hist.add(prevModLabel);
        hist.add(prevModWord.word());
        hist.add(prevModWord.tag());
        hist.add(side);
        break;
      case 1:
        // for p(t_i | M_i, P, M(t)_i-1, side)
        hist.add(modEvent.modifier());
        hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
        hist.add(prevModLabel);
        hist.add(prevModWord.tag());
        hist.add(side);
        break;
      case 2:
        // for p(t_i | M_i, P, M_i-1, side)
        hist.add(modEvent.modifier());
        hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
        hist.add(prevModLabel);
        hist.add(side);
        break;
        /*
        case 3:
          // for p(t_i | M_i, P)
          hist.add(modEvent.modifier());
          hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
          break;
          */
    }
    return hist;
  }
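
The base-NP branch conditions on the previous modifier rather than the head and subcat, giving a shorter chain (its level-3 context exists in the source but is commented out):

\begin{align*}
\text{level 0:}\quad & p\bigl(t_i \mid M_i,\, P,\, M(w,t)_{i-1},\, \mathit{side}\bigr)\\
\text{level 1:}\quad & p\bigl(t_i \mid M_i,\, P,\, M(t)_{i-1},\, \mathit{side}\bigr)\\
\text{level 2:}\quad & p\bigl(t_i \mid M_i,\, P,\, M_{i-1},\, \mathit{side}\bigr)
\end{align*}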
Example #4
/**
 * Representation of the complete back-off structure of the generation model for modifying
 * part-of-speech tags (the modifying nonterminals are partially lexicalized with the parts of
 * speech of their respective head words, and this model generates the part of speech component of
 * these partially-lexicalized nonterminals).
 *
 * <p><b>It is a horrendous bug that all of these <code>ProbabilityStructure</code> classes do not
 * copy various lists from the <code>TrainerEvent</code> objects before removing gap augmentations
 * from their elements.</b>
 *
 */
public class TagModelStructure2 extends ProbabilityStructure {
  // data members
  private static Symbol startSym = Language.training().startSym();
  private static Word startWord = Language.training().startWord();
  private Symbol topSym = Language.training().topSym();

  public TagModelStructure2() {
    super();
  }

  /**
   * The widest history assembled by {@link #getHistory} (back-off level 0) has
   * nine components: modifier, parent, head, head word, head tag,
   * verb-intervening flag, mapped previous modifier, subcat, and side.
   */
  public int maxEventComponents() {
    return 9;
  }

  /** Three active back-off levels (cases 0-2 below); the level-3 contexts are commented out. */
  public int numLevels() {
    return 3;
  }

  public Event getHistory(TrainerEvent trainerEvent, int backOffLevel) {
    ModifierEvent modEvent = (ModifierEvent) trainerEvent;

    if (Language.treebank().isBaseNP(modEvent.parent()))
      return getBaseNPHistory(modEvent, backOffLevel);

    Symbol side = Constants.sideToSym(modEvent.side());

    MutableEvent hist = historiesWithSubcats[backOffLevel];

    hist.clear();
    Symbol verbInterveningSym = Constants.booleanToSym(modEvent.verbIntervening());
    Symbol mappedPrevModSym = NTMapper.map(modEvent.previousMods().symbolAt(0));
    Symbol parent = Language.training().removeArgAugmentation(modEvent.parent());

    switch (backOffLevel) {
      case 0:
        // for p(t_i | M_i, P, H, w, t, verbIntervening, map(M_i-1),
        //             subcat, side)
        hist.add(0, modEvent.modifier());
        hist.add(0, parent);
        hist.add(0, Language.training().removeGapAugmentation(modEvent.head()));
        hist.add(0, modEvent.headWord().word());
        hist.add(0, modEvent.headWord().tag());
        hist.add(0, verbInterveningSym);
        hist.add(0, mappedPrevModSym);
        hist.add(1, modEvent.subcat());
        hist.add(0, side);
        break;
      case 1:
        // for p(t_i | M_i, P, H, t, verbIntervening, map(M_i-1), subcat, side)
        hist.add(0, modEvent.modifier());
        hist.add(0, parent);
        hist.add(0, Language.training().removeGapAugmentation(modEvent.head()));
        hist.add(0, modEvent.headWord().tag());
        hist.add(0, verbInterveningSym);
        hist.add(0, mappedPrevModSym);
        hist.add(1, modEvent.subcat());
        hist.add(0, side);
        break;
      case 2:
        // for p(t_i | M_i, P, H, verbIntervening, map(M_i-1), subcat, side)
        hist.add(0, modEvent.modifier());
        hist.add(0, parent);
        hist.add(0, Language.training().removeGapAugmentation(modEvent.head()));
        hist.add(0, verbInterveningSym);
        hist.add(0, mappedPrevModSym);
        hist.add(1, modEvent.subcat());
        hist.add(0, side);
        break;
        /*
        case 3:
          // for p(t_i | M_i, P, subcat)
          hist.add(0, modEvent.modifier());
          hist.add(0, parent);
          hist.add(1, modEvent.subcat());
          break;
          */
    }
    return hist;
  }

  private Event getBaseNPHistory(ModifierEvent modEvent, int backOffLevel) {
    MutableEvent hist = histories[backOffLevel];

    Symbol side = Constants.sideToSym(modEvent.side());

    // When the previous modifier is the start symbol (nothing generated yet),
    // fall back to the head label and head word.
    Symbol prevModLabel =
        (modEvent.previousMods().get(0) == startSym
            ? modEvent.head()
            : modEvent.previousMods().symbolAt(0));
    Word prevModWord =
        (modEvent.previousWords().getWord(0).equals(startWord)
            ? modEvent.headWord()
            : modEvent.previousWords().getWord(0));
    hist.clear();
    switch (backOffLevel) {
      case 0:
        // for p(t_i | M_i, P, M(w,t)_i-1, side)
        hist.add(modEvent.modifier());
        hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
        hist.add(prevModLabel);
        hist.add(prevModWord.word());
        hist.add(prevModWord.tag());
        hist.add(side);
        break;
      case 1:
        // for p(t_i | M_i, P, M(t)_i-1, side)
        hist.add(modEvent.modifier());
        hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
        hist.add(prevModLabel);
        hist.add(prevModWord.tag());
        hist.add(side);
        break;
      case 2:
        // for p(t_i | M_i, P, M_i-1, side)
        hist.add(modEvent.modifier());
        hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
        hist.add(prevModLabel);
        hist.add(side);
        break;
        /*
        case 3:
          // for p(t_i | M_i, P)
          hist.add(modEvent.modifier());
          hist.add(Language.training().removeGapAugmentation(modEvent.parent()));
          break;
          */
    }
    return hist;
  }

  public Event getFuture(TrainerEvent trainerEvent, int backOffLevel) {
    ModifierEvent modEvent = (ModifierEvent) trainerEvent;
    MutableEvent future = futures[backOffLevel];
    future.clear();
    // The generated future is the part-of-speech tag of the modifier's head
    // word; it is the same at every back-off level.
    future.add(modEvent.modHeadWord().tag());
    return future;
  }

  public boolean doCleanup() {
    return true;
  }

  /**
   * In order to gather statistics for words that appear as the head of the entire sentence when
   * estimating p(w | t), the trainer "fakes" a modifier event, as though the root node of the
   * observed tree was seen to modify the magical +TOP+ node. We will never use the derived counts
   * whose history contexts contain +TOP+. This method allows for the removal of these "unnecessary"
   * counts, which will never be used when decoding.
   */
  public boolean removeHistory(int backOffLevel, Event history) {
    // history.get(0, 1) is the second component of every context built above,
    // i.e., the parent nonterminal; discard histories whose parent is +TOP+.
    return history.get(0, 1) == topSym;
  }

  public ProbabilityStructure copy() {
    return new TagModelStructure2();
  }
}
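
For context on how a decoder consumes these levels: back-off chains like the ones above are typically combined by deleted interpolation, smoothing each maximum-likelihood estimate with the estimate from the next-coarser context. As background (a standard formulation, not code from this file; the interpolation weights \lambda_i are derived from training counts):

\[
\tilde{p}_i(f \mid h_i) \;=\; \lambda_i\,\hat{p}_i(f \mid h_i) \;+\; (1 - \lambda_i)\,\tilde{p}_{i+1}(f \mid h_{i+1}),
\qquad \tilde{p}_{n-1}(f \mid h_{n-1}) = \hat{p}_{n-1}(f \mid h_{n-1})
\]

where $f$ is the future returned by getFuture (here, the modifier head word's tag), $h_i$ is the level-$i$ history returned by getHistory, and $n$ is numLevels().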