Beispiel #1
0
 private Pair<Integer, Integer> updateTargetLimit(
     Alignment a, int sourceStartIndex, int sourceEndIndex) {
   Pair<Integer, Integer> res = new Pair<Integer, Integer>();
   int minTargetIndex = a.getMinTargetIndex(sourceStartIndex);
   int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex);
   for (int sourceIndex = sourceStartIndex; sourceIndex <= sourceEndIndex; sourceIndex++) {
     int minTargetIndexCandidate = a.getMinTargetIndex(sourceIndex);
     int maxTargetIndexCandidate = a.getMaxTargetIndex(sourceIndex);
     if (minTargetIndexCandidate < minTargetIndex) minTargetIndex = minTargetIndexCandidate;
     if (maxTargetIndexCandidate > maxTargetIndex) maxTargetIndex = maxTargetIndexCandidate;
   }
   res.setFirst(minTargetIndex);
   res.setSecond(maxTargetIndex);
   return res;
 }
Beispiel #2
0
  /**
   * This method extract phrase pairs from a Viterbi alignment and a sentence pair. Protected for
   * testing.
   *
   * @param a
   * @param sp
   * @return
   */
  private List<Rule> extractPhrasePairs(Alignment a, SentencePair sp) {

    List<Rule> res = new ArrayList<Rule>();

    // loop over source index (beginning of phrase)
    for (int sourceStartIndex = 0;
        sourceStartIndex < sp.getSource().getWords().length;
        sourceStartIndex++) {
      // source phrase built on the fly
      List<Integer> sourcePhrase = new ArrayList<Integer>();
      // maintain the minimum and maximum target index aligned to the
      // source phrase
      int minTargetIndex = a.getMinTargetIndex(sourceStartIndex);
      int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex);
      // loop over source index (end of phrase)
      for (int sourceEndIndex = sourceStartIndex;
          sourceEndIndex
              < Math.min(sourceStartIndex + MAX_SOURCE_PHRASE, sp.getSource().getWords().length);
          sourceEndIndex++) {
        // update the sourcePhrase
        sourcePhrase.add(sp.getSource().getWords()[sourceEndIndex]);
        // update minimum and maximum target index aligned to the source
        // phrase
        int minTargetIndexCandidate = a.getMinTargetIndex(sourceEndIndex);
        int maxTargetIndexCandidate = a.getMaxTargetIndex(sourceEndIndex);
        if (minTargetIndexCandidate < minTargetIndex) minTargetIndex = minTargetIndexCandidate;
        if (maxTargetIndexCandidate > maxTargetIndex) maxTargetIndex = maxTargetIndexCandidate;
        if (minTargetIndex > maxTargetIndex) // occurs when haven't
          // found any aligned
          // word
          // yet
          continue;

        // check if the target phrase between positions minTargetIndex
        // and maxTargetIndex and the source phrase are consistent with
        // the alignment
        boolean consistent = true;
        List<Integer> targetPhrase = new ArrayList<Integer>();
        for (int targetIndex = minTargetIndex; targetIndex <= maxTargetIndex; targetIndex++) {
          targetPhrase.add(sp.getTarget().getWords()[targetIndex]);
          if (a.isTargetAligned(targetIndex)
              && (a.getMinSourceIndex(targetIndex) < sourceStartIndex
                  || a.getMaxSourceIndex(targetIndex) > sourceEndIndex)) {
            consistent = false;
            break;
          }
        }

        // we found a phrase pair
        if (consistent) {
          // TODO the rule may be constructed on the fly
          // to avoid duplicated logic as well as duplicated loops.
          Rule r =
              new Rule(sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sp, a);
          res.add(r);
          List<Rule> extendedUnaligned =
              extendUnalignedBoundaryWord(
                  sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, a, sp);
          res.addAll(extendedUnaligned);
        }
      }
    }
    return res;
  }