예제 #1
0
  /**
   * Produces the variants of an already extracted phrase pair whose target span is widened over
   * unaligned target words directly adjacent to it.
   *
   * <p>Rules are appended in this order: spans widened only to the left (one word further on each
   * step), spans widened only to the right, and finally every combination that takes at least one
   * unaligned word on each side. The unextended phrase pair itself is not part of the result.
   *
   * @param sourceStartIndex first source position of the phrase pair (passed through to each rule)
   * @param sourceEndIndex last source position of the phrase pair (passed through to each rule)
   * @param targetStartIndex first target position of the phrase pair
   * @param targetEndIndex last target position of the phrase pair
   * @param a alignment used to test whether a target position is aligned
   * @param sp sentence pair; its target word count bounds the rightward extension
   * @return the extended rules, empty when neither neighbouring target word is unaligned
   */
  private List<Rule> extendUnalignedBoundaryWord(
      int sourceStartIndex,
      int sourceEndIndex,
      int targetStartIndex,
      int targetEndIndex,
      Alignment a,
      SentencePair sp) {
    List<Rule> extended = new ArrayList<Rule>();

    // Walk left from the phrase while the target words are unaligned; position 0 is a regular
    // word here, so the walk may reach it.
    int unalignedBefore = 0;
    for (int start = targetStartIndex - 1; start >= 0 && !a.isTargetAligned(start); start--) {
      extended.add(new Rule(sourceStartIndex, sourceEndIndex, start, targetEndIndex, sp, a));
      unalignedBefore++;
    }

    // Walk right from the phrase while the target words are unaligned.
    int unalignedAfter = 0;
    for (int end = targetEndIndex + 1;
        end < sp.getTarget().getWords().length && !a.isTargetAligned(end);
        end++) {
      extended.add(new Rule(sourceStartIndex, sourceEndIndex, targetStartIndex, end, sp, a));
      unalignedAfter++;
    }

    // Combine both directions: each start that includes at least one preceding unaligned word
    // with each end that includes at least one following unaligned word. When either count is
    // zero the corresponding loop simply does not run.
    for (int back = 1; back <= unalignedBefore; back++) {
      for (int forward = 1; forward <= unalignedAfter; forward++) {
        extended.add(
            new Rule(
                sourceStartIndex,
                sourceEndIndex,
                targetStartIndex - back,
                targetEndIndex + forward,
                sp,
                a));
      }
    }
    return extended;
  }
예제 #2
0
 /**
  * Grows the end indices of a candidate block until no alignment link from inside the block
  * points past its current source or target end.
  *
  * <p>The block is returned unchanged when either start index lies beyond its sentence or when
  * the word at either start index is unaligned. Only the end indices are ever moved; the start
  * indices are returned as given.
  *
  * @param sourceStartIndex first source position of the candidate block
  * @param sourceEndIndex initial last source position of the candidate block
  * @param targetStartIndex first target position of the candidate block
  * @param targetEndIndex initial last target position of the candidate block
  * @param a alignment queried for aligned positions and maximum aligned indices
  * @param sp sentence pair providing the source and target lengths
  * @return a block with the given start indices and the (possibly enlarged) end indices
  */
 private Block findNextBlock(
     int sourceStartIndex,
     int sourceEndIndex,
     int targetStartIndex,
     int targetEndIndex,
     Alignment a,
     SentencePair sp) {
   // Evaluated left to right: the range checks run before the alignment lookups.
   boolean expandable =
       targetStartIndex < sp.getTarget().getWords().length
           && sourceStartIndex < sp.getSource().getWords().length
           && a.isTargetAligned(targetStartIndex)
           && a.isSourceAligned(sourceStartIndex);

   if (expandable) {
     boolean grew;
     do {
       grew = false;
       // Push the target end out to the furthest target word linked from the source span.
       for (int s = sourceStartIndex; s <= sourceEndIndex; s++) {
         int furthestTarget = a.getMaxTargetIndex(s);
         if (furthestTarget > targetEndIndex) {
           targetEndIndex = furthestTarget;
           grew = true;
         }
       }
       // Push the source end out to the furthest source word linked from the target span.
       for (int t = targetStartIndex; t <= targetEndIndex; t++) {
         int furthestSource = a.getMaxSourceIndex(t);
         if (furthestSource > sourceEndIndex) {
           sourceEndIndex = furthestSource;
           grew = true;
         }
       }
     } while (grew); // repeat until a full pass leaves both ends untouched
   }
   return new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex);
 }
예제 #3
0
  /**
   * Scans the sentence pair from left to right and collects the blocks produced by
   * {@code findNextBlock}, skipping over unaligned words.
   *
   * <p>When the current target word is unaligned the source span is emptied so that only the
   * target cursor advances; otherwise, when the current source word is unaligned, the target span
   * is emptied so that only the source cursor advances. Empty spans are never added to the result.
   *
   * @param a alignment between the two sentences
   * @param sp sentence pair providing the source and target lengths
   * @return the blocks found, in scan order
   */
  private List<Block> getRegularBlocks(Alignment a, SentencePair sp) {
    final int sourceLength = sp.getSource().getWords().length;
    final int targetLength = sp.getTarget().getWords().length;
    List<Block> blocks = new ArrayList<Block>();

    int srcFrom = 0;
    int srcTo = 0;
    int tgtFrom = 0;
    int tgtTo = 0;
    while (srcFrom <= sourceLength && tgtFrom <= targetLength) {
      Block grown = findNextBlock(srcFrom, srcTo, tgtFrom, tgtTo, a, sp);
      srcFrom = grown.sourceStartIndex;
      srcTo = grown.sourceEndIndex;
      tgtFrom = grown.targetStartIndex;
      tgtTo = grown.targetEndIndex;

      boolean sourceExhausted = srcFrom >= sourceLength;
      boolean targetExhausted = tgtFrom >= targetLength;

      if (!(targetExhausted && sourceExhausted)) {
        if (!targetExhausted && !a.isTargetAligned(tgtFrom)) {
          // Unaligned target word: make the source span empty so only the target advances.
          srcTo--;
        } else if (!sourceExhausted && !a.isSourceAligned(srcFrom)) {
          // Unaligned source word: make the target span empty so only the source advances.
          tgtTo--;
        }
      }

      if (targetExhausted || sourceExhausted) {
        break;
      }

      boolean nonEmpty = srcFrom <= srcTo && tgtFrom <= tgtTo;
      if (nonEmpty) {
        blocks.add(new Block(srcFrom, srcTo, tgtFrom, tgtTo));
      }

      // Restart right after the span just handled, with one-word candidate spans.
      srcFrom = srcTo + 1;
      tgtFrom = tgtTo + 1;
      srcTo = srcFrom;
      tgtTo = tgtFrom;
    }
    return blocks;
  }
예제 #4
0
  /**
   * Extracts phrase pairs from a Viterbi alignment and a sentence pair.
   *
   * <p>For every source span of at most {@code MAX_SOURCE_PHRASE} words, the smallest target span
   * covering all target words aligned to it is computed. The pair is kept when no aligned target
   * word inside that span is linked to a source word outside the source span. Each kept pair is
   * followed in the result by its variants extended over unaligned target boundary words.
   *
   * @param a alignment queried for min/max aligned indices on both sides
   * @param sp sentence pair the alignment refers to
   * @return the extracted rules, ordered by source start index and then by source end index
   */
  private List<Rule> extractPhrasePairs(Alignment a, SentencePair sp) {
    List<Rule> res = new ArrayList<Rule>();
    int sourceLength = sp.getSource().getWords().length;

    // loop over source index (beginning of phrase)
    for (int sourceStartIndex = 0; sourceStartIndex < sourceLength; sourceStartIndex++) {
      // running minimum and maximum target index aligned to the source phrase
      int minTargetIndex = a.getMinTargetIndex(sourceStartIndex);
      int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex);
      // exclusive upper bound for the phrase end, capped by the sentence length
      int sourceEndLimit = Math.min(sourceStartIndex + MAX_SOURCE_PHRASE, sourceLength);

      // loop over source index (end of phrase)
      for (int sourceEndIndex = sourceStartIndex;
          sourceEndIndex < sourceEndLimit;
          sourceEndIndex++) {
        // widen the target span with the links of the newly included source word
        minTargetIndex = Math.min(minTargetIndex, a.getMinTargetIndex(sourceEndIndex));
        maxTargetIndex = Math.max(maxTargetIndex, a.getMaxTargetIndex(sourceEndIndex));
        if (minTargetIndex > maxTargetIndex) {
          // empty target span: presumably no word of the source phrase is aligned yet
          continue;
        }

        if (isConsistent(a, sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex)) {
          // TODO the rule may be constructed on the fly
          // to avoid duplicated logic as well as duplicated loops.
          res.add(
              new Rule(sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sp, a));
          res.addAll(
              extendUnalignedBoundaryWord(
                  sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, a, sp));
        }
      }
    }
    return res;
  }

  /**
   * Tells whether every aligned target word in the target span is linked only to source words
   * inside the source span.
   */
  private boolean isConsistent(
      Alignment a,
      int sourceStartIndex,
      int sourceEndIndex,
      int minTargetIndex,
      int maxTargetIndex) {
    for (int targetIndex = minTargetIndex; targetIndex <= maxTargetIndex; targetIndex++) {
      if (a.isTargetAligned(targetIndex)
          && (a.getMinSourceIndex(targetIndex) < sourceStartIndex
              || a.getMaxSourceIndex(targetIndex) > sourceEndIndex)) {
        return false;
      }
    }
    return true;
  }