示例#1
0
  /**
   * Trains this lexicon on the Collection of trees. Also trains the unknown word model pointed to
   * by this lexicon.
   */
  public void train(Collection<Tree> trees, double weight, boolean keepTagsAsLabels) {
    getUnknownWordModel().train(trees);

    // Scan the data, tallying each (word, tag) event together with its
    // tag-only, word-only, and grand-total marginals.
    for (Tree tree : trees) {
      for (IntTaggedWord event : treeToEvents(tree, keepTagsAsLabels)) {
        // Joint (word, tag) count.
        seenCounter.incrementCount(event, weight);
        // Tag-only marginal.
        IntTaggedWord tagOnly = new IntTaggedWord(nullWord, event.tag);
        seenCounter.incrementCount(tagOnly, weight);
        // Word-only marginal.
        IntTaggedWord wordOnly = new IntTaggedWord(event.word, nullTag);
        seenCounter.incrementCount(wordOnly, weight);
        // Grand total over all events.
        seenCounter.incrementCount(new IntTaggedWord(nullWord, nullTag), weight);
        tags.add(tagOnly);
        words.add(wordOnly);
      }
    }

    tune(trees);

    // Index the possible tags for each word.
    initRulesWithWord();

    if (DEBUG_LEXICON) {
      printLexStats();
    }
  }
示例#2
0
 @Override
 public void evaluate(Tree t1, Tree t2, PrintWriter pw) {
   // Items present in t1 but not t2 are tallied in `over`;
   // items present in t2 but not t1 are tallied in `under`.
   Set<String> items1 = makeObjects(t1);
   Set<String> items2 = makeObjects(t2);
   for (String item : items1) {
     if (!items2.contains(item)) {
       over.incrementCount(item);
     }
   }
   for (String item : items2) {
     if (!items1.contains(item)) {
       under.incrementCount(item);
     }
   }
 }
示例#3
0
 /** Sums the values of the given feature-value pairs into a counter keyed by feature name. */
 public static <T> Counter<T> toCounter(List<FeatureValue<T>> featureValues) {
   ClassicCounter<T> result = new ClassicCounter<T>();
   for (FeatureValue<T> featureValue : featureValues) {
     result.incrementCount(featureValue.name, featureValue.value);
   }
   return result;
 }
 @Override
 protected void tallyRoot(Tree lt, double weight) {
   // The extracted dependency list is in full (not reduced) tag space.
   for (IntDependency dep : MLEDependencyGrammar.treeToDependencyList(lt, wordIndex, tagIndex)) {
     dependencyCounter.incrementCount(dep, weight);
   }
 }
示例#5
0
 @Override
 public void evaluate(Tree t1, Tree t2, PrintWriter pw) {
   List<String> items1 = myMakeObjects(t1);
   List<String> items2 = myMakeObjects(t2);
   // Consume a copy of items2 so that duplicates are matched one-for-one:
   // a category with cardinality k in t2 can only cover k occurrences in t1.
   List<String> unmatched2 = new LinkedList<>(items2);
   for (String item : items1) {
     if (!unmatched2.remove(item)) {
       over.incrementCount(item);
     }
   }
   // items1 is a local list no longer needed intact, so consume it directly.
   for (String item : items2) {
     if (!items1.remove(item)) {
       under.incrementCount(item);
     }
   }
 }
 /** Make a copy of the array of counters. */
 public ClassicCounter<Integer>[] cloneCounter(ClassicCounter<Integer>[] counter) {
   ClassicCounter<Integer>[] copy =
       ErasureUtils.<ClassicCounter<Integer>>mkTArray(ClassicCounter.class, counter.length);
   for (int i = 0; i < counter.length; i++) {
     // Deep-copy each counter so mutations to the copy never touch the original.
     ClassicCounter<Integer> fresh = new ClassicCounter<Integer>();
     for (Integer key : counter[i].keySet()) {
       fresh.incrementCount(key, counter[i].getCount(key));
     }
     copy[i] = fresh;
   }
   return copy;
 }
  /**
   * Collect counts for a STOP / continue decision at a dependent position.
   *
   * @param dependency the dependency being tallied
   * @param distBinDist the binned distance
   * @param count the weight with which to add this event
   * @param wildForStop whether STOP events also contribute to the wildcard counts
   */
  private void expandStop(
      IntDependency dependency, short distBinDist, double count, boolean wildForStop) {
    // Tag-only (wordless) head, plus head/arg with tags mapped into the binned space.
    IntTaggedWord headTagOnly = getCachedITW(dependency.head.tag);
    IntTaggedWord binnedHead =
        new IntTaggedWord(dependency.head.word, tagBin(dependency.head.tag));
    IntTaggedWord binnedArg =
        new IntTaggedWord(dependency.arg.word, tagBin(dependency.arg.tag));
    boolean leftHeaded = dependency.leftHeaded;

    if (binnedArg.word == STOP_WORD_INT) {
      // Record the STOP event for both the lexicalized and the tag-only head.
      stopCounter.incrementCount(intern(binnedHead, binnedArg, leftHeaded, distBinDist), count);
      stopCounter.incrementCount(intern(headTagOnly, binnedArg, leftHeaded, distBinDist), count);
    }
    if (wildForStop || binnedArg.word != STOP_WORD_INT) {
      // Record the wildcard (any-argument) event for both head forms.
      stopCounter.incrementCount(intern(binnedHead, wildTW, leftHeaded, distBinDist), count);
      stopCounter.incrementCount(intern(headTagOnly, wildTW, leftHeaded, distBinDist), count);
    }
  }
示例#8
0
 /**
  * Merges duplicate features by summing their values, returning one combined
  * {@code FeatureValue} per distinct feature name, sorted by the names'
  * natural ordering (keys go through a TreeSet, so {@code T} must be Comparable).
  *
  * @param <T> the feature-name type
  * @param featureValues the feature-value pairs to combine
  * @return a sorted list with one entry per distinct feature
  */
 public static <T> List<FeatureValue<T>> combine(Collection<FeatureValue<T>> featureValues) {
   ClassicCounter<T> totals = new ClassicCounter<T>();
   for (FeatureValue<T> featureValue : featureValues) {
     totals.incrementCount(featureValue.name, featureValue.value);
   }
   Set<T> sortedNames = new TreeSet<T>(totals.keySet());
   List<FeatureValue<T>> combined = new ArrayList<FeatureValue<T>>(sortedNames.size());
   for (T name : sortedNames) {
     combined.add(new FeatureValue<T>(name, totals.getCount(name)));
   }
   return combined;
 }
  /**
   * Finishes training: if no unseen-word events were ever counted, seeds the
   * unseen counter with a uniform count over all non-boundary tags so that it
   * is never empty. Returns the trained model.
   */
  public UnknownWordModel finishTraining() {
    if (unSeenCounter.isEmpty()) {
      int numTags = tagIndex.size();
      for (int t = 0; t < numTags; t++) {
        if (!Lexicon.BOUNDARY_TAG.equals(tagIndex.get(t))) {
          // One count for this tag, and one for the overall total.
          unSeenCounter.incrementCount(new IntTaggedWord(nullWord, t));
          unSeenCounter.incrementCount(NULL_ITW);
        }
      }
    }
    return model;
  }
  /**
   * Serializes this object after temporarily swapping in "compressed" copies of
   * {@code argCounter} and {@code stopCounter} that drop entries involving the
   * wildcard tagged word or a word index of -1.
   *
   * <p>Fix: the original code restored the full counters only after
   * {@code defaultWriteObject()} returned, so an {@link IOException} thrown
   * mid-write left this object permanently holding the compressed counters.
   * Restoration now happens in a {@code finally} block.
   *
   * @param stream the stream to serialize to
   * @throws IOException if writing to the stream fails
   */
  private void writeObject(ObjectOutputStream stream) throws IOException {
    ClassicCounter<IntDependency> fullArgCounter = argCounter;
    ClassicCounter<IntDependency> fullStopCounter = stopCounter;
    try {
      // Keep only fully lexicalized, non-wildcard arg dependencies.
      argCounter = new ClassicCounter<IntDependency>();
      for (IntDependency dependency : fullArgCounter.keySet()) {
        if (dependency.head != wildTW
            && dependency.arg != wildTW
            && dependency.head.word != -1
            && dependency.arg.word != -1) {
          argCounter.incrementCount(dependency, fullArgCounter.getCount(dependency));
        }
      }

      // Keep only stop dependencies with a real head word.
      stopCounter = new ClassicCounter<IntDependency>();
      for (IntDependency dependency : fullStopCounter.keySet()) {
        if (dependency.head.word != -1) {
          stopCounter.incrementCount(dependency, fullStopCounter.getCount(dependency));
        }
      }

      stream.defaultWriteObject();
    } finally {
      // Always restore the uncompressed counters, even if serialization failed,
      // so this object stays usable after an IOException.
      argCounter = fullArgCounter;
      stopCounter = fullStopCounter;
    }
  }
  /**
   * Trains this unknown word model on a single tagged word. (The original
   * comment said "Collection of trees", which did not match this signature.)
   *
   * @param tw the observed tagged word
   * @param loc the word's position in its sentence, passed to signature extraction
   * @param weight the count weight to add for this observation
   */
  public void train(TaggedWord tw, int loc, double weight) {
    IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
    IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
    IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
    seenCounter.incrementCount(iW, weight);
    IntTaggedWord i = NULL_ITW;

    if (treesRead > indexToStartUnkCounting) {
      // start doing this once some way through trees;
      // treesRead is 1 based counting
      if (seenCounter.getCount(iW) < 1.5) {
        // it's an entirely unknown word: count below 1.5 including the
        // increment just made above (i.e. effectively seen at most once
        // with unit weights)
        int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word));
        if (DOCUMENT_UNKNOWNS) {
          String wStr = wordIndex.get(iTW.word);
          String tStr = tagIndex.get(iTW.tag);
          String sStr = wordIndex.get(s);
          EncodingPrintWriter.err.println(
              "Unknown word/tag/sig:\t" + wStr + '\t' + tStr + '\t' + sStr, "UTF-8");
        }
        // Tally the signature with and without the tag, the bare tag,
        // and the overall total into the unseen counter.
        IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
        IntTaggedWord iS = new IntTaggedWord(s, nullTag);
        unSeenCounter.incrementCount(iTS, weight);
        unSeenCounter.incrementCount(iT, weight);
        unSeenCounter.incrementCount(iS, weight);
        unSeenCounter.incrementCount(i, weight);
        // rules.add(iTS);
        // sigs.add(iS);
      } // else {
      // if (seenCounter.getCount(iTW) < 2) {
      // it's a new tag for a known word
      // do nothing for now
      // }
      // }
    }
  }
  @Override
  public void train(List<TaggedWord> sentence) {
    lex.train(sentence, 1.0);

    // Tally sentence-initial tag counts and tag-bigram transition counts.
    String previousTag = null;
    for (TaggedWord taggedWord : sentence) {
      String currentTag = taggedWord.tag();
      tagIndex.add(currentTag);
      if (previousTag == null) {
        initial.incrementCount(currentTag);
      } else {
        ruleCounter.incrementCount2D(previousTag, currentTag);
      }
      previousTag = currentTag;
    }
  }
示例#13
0
 /** Adds the tagging with count to the data structures in this Lexicon. */
 protected void addTagging(boolean seen, IntTaggedWord itw, double count) {
   if (!seen) {
     // Unseen taggings are delegated entirely to the unknown word model.
     uwModel.addTagging(seen, itw, count);
     return;
   }
   seenCounter.incrementCount(itw, count);
   if (itw.tag() == nullTag) {
     // Word-only entry (no tag).
     words.add(itw);
   } else if (itw.word() == nullWord) {
     // Tag-only entry (no word).
     tags.add(itw);
   }
   // Full (word, tag) pairs are only counted, not indexed here.
 }
 /**
  * Computes the empirical distribution of word lengths this segmenter produces
  * over the given treebank: each gold sentence's characters are concatenated
  * and re-segmented, and the lengths of the resulting words are counted.
  *
  * <p>Cleanup: replaced raw-typed {@code Iterator}/{@code ArrayList} loops and
  * the indexed loop with typed enhanced-for loops.
  *
  * @param tb the treebank supplying gold sentences
  * @return the distribution of segmented-word lengths
  */
 private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) {
   ClassicCounter<Integer> lengthCounts = new ClassicCounter<Integer>();
   for (Tree gold : tb) {
     // Rebuild the raw character sequence from the gold yield.
     StringBuilder goldChars = new StringBuilder();
     for (Object label : gold.yield()) {
       goldChars.append((Word) label);
     }
     // Re-segment the characters and tally each produced word's length.
     for (HasWord ourWord : segment(goldChars.toString())) {
       lengthCounts.incrementCount(ourWord.word().length());
     }
   }
   return Distribution.getDistribution(lengthCounts);
 }
示例#15
0
 /**
  * Scores each label for the given example: the label's prior plus the summed
  * weights of the example's (feature, integer value) pairs, optionally
  * corrected by subtracting the zero-valued feature weights.
  *
  * <p>Fix: {@code example.asFeaturesCounter()} was being re-fetched inside the
  * loop over labels although it does not depend on the label; it is now
  * fetched once before the loop.
  *
  * @param example the datum to score
  * @return a counter mapping each label to its score
  */
 public ClassicCounter<L> scoresOf(RVFDatum<L, F> example) {
   ClassicCounter<L> scores = new ClassicCounter<>();
   Counters.addInPlace(scores, priors);
   if (addZeroValued) {
     Counters.addInPlace(scores, priorZero);
   }
   // Loop-invariant: the example's features are the same for every label.
   Counter<F> features = example.asFeaturesCounter();
   for (L l : labels) {
     double score = 0.0;
     for (F f : features.keySet()) {
       int value = (int) features.getCount(f);
       score += weight(l, f, Integer.valueOf(value));
       if (addZeroValued) {
         score -= weight(l, f, zero);
       }
     }
     scores.incrementCount(l, score);
   }
   return scores;
 }
  /**
   * Collect counts for a non-STOP dependent. The dependency arg is still in the full tag space.
   *
   * @param dependency A non-stop dependency
   * @param valBinDist A binned distance
   * @param count The weight with which to add this dependency
   */
  private void expandArg(IntDependency dependency, short valBinDist, double count) {
    // Tag-only (wordless) versions of head and arg, via the cache.
    IntTaggedWord headT = getCachedITW(dependency.head.tag);
    IntTaggedWord argT = getCachedITW(dependency.arg.tag);
    // Head and arg with their tags mapped into the binned tag space.
    IntTaggedWord head =
        new IntTaggedWord(dependency.head.word, tagBin(dependency.head.tag)); // dependency.head;
    IntTaggedWord arg =
        new IntTaggedWord(dependency.arg.word, tagBin(dependency.arg.tag)); // dependency.arg;
    boolean leftHeaded = dependency.leftHeaded;

    // argCounter stores stuff in both the original and the reduced tag space???
    // All four lexicalized/tag-only combinations of head and arg.
    argCounter.incrementCount(intern(head, arg, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headT, arg, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(head, argT, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headT, argT, leftHeaded, valBinDist), count);

    // Wildcard-argument events for both head forms.
    argCounter.incrementCount(intern(head, wildTW, leftHeaded, valBinDist), count);
    argCounter.incrementCount(intern(headT, wildTW, leftHeaded, valBinDist), count);

    // the WILD head stats are always directionless and not useDistance!
    argCounter.incrementCount(intern(wildTW, arg, false, (short) -1), count);
    argCounter.incrementCount(intern(wildTW, argT, false, (short) -1), count);

    if (useSmoothTagProjection) {
      // added stuff to do more smoothing.  CDM Jan 2007
      // Same combinations again, but with tags mapped through tagProject()
      // (and ANY_WORD in place of the word for the "TP" variants).
      IntTaggedWord headP =
          new IntTaggedWord(dependency.head.word, tagProject(dependency.head.tag));
      IntTaggedWord headTP = new IntTaggedWord(ANY_WORD_INT, tagProject(dependency.head.tag));
      IntTaggedWord argP = new IntTaggedWord(dependency.arg.word, tagProject(dependency.arg.tag));
      IntTaggedWord argTP = new IntTaggedWord(ANY_WORD_INT, tagProject(dependency.arg.tag));

      argCounter.incrementCount(intern(headP, argP, leftHeaded, valBinDist), count);
      argCounter.incrementCount(intern(headTP, argP, leftHeaded, valBinDist), count);
      argCounter.incrementCount(intern(headP, argTP, leftHeaded, valBinDist), count);
      argCounter.incrementCount(intern(headTP, argTP, leftHeaded, valBinDist), count);

      argCounter.incrementCount(intern(headP, wildTW, leftHeaded, valBinDist), count);
      argCounter.incrementCount(intern(headTP, wildTW, leftHeaded, valBinDist), count);

      // the WILD head stats are always directionless and not useDistance!
      argCounter.incrementCount(intern(wildTW, argP, false, (short) -1), count);
      argCounter.incrementCount(intern(wildTW, argTP, false, (short) -1), count);
      argCounter.incrementCount(
          intern(wildTW, new IntTaggedWord(dependency.head.word, ANY_TAG_INT), false, (short) -1),
          count);
    }
    numWordTokens++;
  }