Ejemplo n.º 1
0
  /**
   * Collects rules containing two sub-spans (named "X" and "X2" in the locals below) inside the
   * source span {@code [sourceStartIndex, sourceEndIndex]}; both bounds are treated as inclusive.
   *
   * <p>For every candidate first sub-span and, nested inside it, every candidate second sub-span,
   * the target range returned by {@code updateTargetLimit} is scanned for a target index whose
   * minimum or maximum aligned source index falls outside the sub-span. Only pairs where neither
   * scan finds such an index, and which pass {@code filterPassTwoNonTerminalRule}, produce a
   * {@link Rule}.
   *
   * @param sourceStartIndex first source index of the enclosing span (inclusive)
   * @param sourceEndIndex last source index of the enclosing span (inclusive)
   * @param minTargetIndex lower target bound of the enclosing span; compared against each first
   *     sub-span's target range and forwarded to the {@code Rule} constructor
   * @param maxTargetIndex upper target bound of the enclosing span; used like {@code
   *     minTargetIndex}
   * @param a alignment queried for aligned-ness and min/max aligned source indices
   * @param sp sentence pair, only forwarded to the {@code Rule} constructor here
   * @return the rules found; an empty list when the span is too short or nothing qualifies
   */
  private List<Rule> extractRulesTwoNonTerminal(
      int sourceStartIndex,
      int sourceEndIndex,
      int minTargetIndex,
      int maxTargetIndex,
      Alignment a,
      SentencePair sp) {
    List<Rule> res = new ArrayList<Rule>();
    // With inclusive indices this rejects spans of one or two source positions: the second
    // sub-span starts at least two positions after the end of the first (see the "+ 2" below),
    // so at least three positions are needed for a rule with two nonterminals.
    if (sourceEndIndex - sourceStartIndex < 2) {
      return res;
    }
    // First sub-span: its start and end both stop at sourceEndIndex - 2, leaving room for a gap
    // position and the second sub-span.
    for (int sourceStartIndexX = sourceStartIndex;
        sourceStartIndexX < sourceEndIndex - 1;
        sourceStartIndexX++) {
      for (int sourceEndIndexX = sourceStartIndexX;
          sourceEndIndexX < sourceEndIndex - 1;
          sourceEndIndexX++) {
        // Stop extending this start position as soon as either boundary word is unaligned.
        if (!a.isSourceAligned(sourceStartIndexX) || !a.isSourceAligned(sourceEndIndexX)) break;
        // NOTE(review): updateTargetLimit is defined elsewhere; presumably it returns the
        // (min, max) target indices linked to the given source span -- confirm.
        Pair<Integer, Integer> targetLimit =
            updateTargetLimit(a, sourceStartIndexX, sourceEndIndexX);
        // offset stays 0 when no target index in the range is aligned outside the sub-span;
        // negative = a link to the left of the start, positive = a link to the right of the end.
        int offset = 0;
        for (int targetIndex = targetLimit.getFirst();
            targetIndex <= targetLimit.getSecond();
            targetIndex++) {
          if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX) {
            offset = a.getMinSourceIndex(targetIndex) - sourceStartIndexX;
            break;
          }
          if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX) {
            offset = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX;
            break;
          }
        }
        // The first sub-span already maps onto the whole target range of the enclosing span:
        // give up on this start position.
        if (targetLimit.getFirst() == minTargetIndex && targetLimit.getSecond() == maxTargetIndex) {
          break;
        }
        if (offset < 0) {
          // A link reaches left of the start; extending the end cannot repair that.
          break;
        } else if (offset > 0) {
          // Skip ahead: together with the loop's own increment the next end index examined is
          // the out-of-span source index that was just found.
          sourceEndIndexX = sourceEndIndexX + offset - 1;
        } else {
          // Second sub-span: starts at least two positions after the first one ends, so at
          // least one source position separates the two, and may reach sourceEndIndex itself.
          for (int sourceStartIndexX2 = sourceEndIndexX + 2;
              sourceStartIndexX2 <= sourceEndIndex;
              sourceStartIndexX2++) {
            for (int sourceEndIndexX2 = sourceStartIndexX2;
                sourceEndIndexX2 <= sourceEndIndex;
                sourceEndIndexX2++) {
              // Same boundary-alignment requirement as for the first sub-span.
              if (!a.isSourceAligned(sourceStartIndexX2) || !a.isSourceAligned(sourceEndIndexX2)) {
                break;
              }
              Pair<Integer, Integer> targetLimitX2 =
                  updateTargetLimit(a, sourceStartIndexX2, sourceEndIndexX2);
              // Same out-of-span scan as above, applied to the second sub-span.
              int offset2 = 0;
              for (int targetIndex = targetLimitX2.getFirst();
                  targetIndex <= targetLimitX2.getSecond();
                  targetIndex++) {
                if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX2) {
                  offset2 = a.getMinSourceIndex(targetIndex) - sourceStartIndexX2;
                  break;
                }
                if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX2) {
                  offset2 = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX2;
                  break;
                }
              }

              // Negative: abandon this second start; positive: jump the second end forward.
              if (offset2 < 0) break;
              else if (offset2 > 0) {
                sourceEndIndexX2 = sourceEndIndexX2 + offset2 - 1;
              } else {
                // Neither scan found an out-of-span link; apply the external filter before
                // building the rule.
                if (filterPassTwoNonTerminalRule(
                    sourceStartIndex,
                    sourceEndIndex,
                    sourceStartIndexX,
                    sourceEndIndexX,
                    sourceStartIndexX2,
                    sourceEndIndexX2,
                    a)) {
                  Rule r =
                      new Rule(
                          sourceStartIndex,
                          sourceEndIndex,
                          minTargetIndex,
                          maxTargetIndex,
                          sourceStartIndexX,
                          sourceEndIndexX,
                          targetLimit.getFirst(),
                          targetLimit.getSecond(),
                          sourceStartIndexX2,
                          sourceEndIndexX2,
                          targetLimitX2.getFirst(),
                          targetLimitX2.getSecond(),
                          sp,
                          a);
                  res.add(r);
                }
              }
            }
          }
        }
      }
    }
    return res;
  }
Ejemplo n.º 2
0
 /**
  * Walks over every source sub-span of {@code regularBlock} that covers at least two positions
  * and, for each sub-span whose target range has no link leaving the sub-span, gathers the rules
  * produced by {@code extractRulesOneNonTerminal} followed by {@code extractRulesTwoNonTerminal}.
  *
  * @param regularBlock block whose {@code sourceStartIndex}/{@code sourceEndIndex} fields bound
  *     the scan (both inclusive)
  * @param a alignment queried for min/max aligned source indices
  * @param sp sentence pair, forwarded to the two extraction helpers
  * @return all rules gathered, in the order the sub-spans were visited
  */
 private List<Rule> extractInternalBlockRules(Block regularBlock, Alignment a, SentencePair sp) {
   List<Rule> collected = new ArrayList<Rule>();
   int spanStart = regularBlock.sourceStartIndex;
   while (spanStart <= regularBlock.sourceEndIndex) {
     int spanEnd = spanStart + 1;
     while (spanEnd <= regularBlock.sourceEndIndex) {
       Pair<Integer, Integer> targetSpan = updateTargetLimit(a, spanStart, spanEnd);

       // Scan the target range until a target index is found that is aligned outside
       // [spanStart, spanEnd]. overhang < 0: link to the left; overhang > 0: link to the right.
       int overhang = 0;
       for (int t = targetSpan.getFirst(); overhang == 0 && t <= targetSpan.getSecond(); t++) {
         int leftmostSource = a.getMinSourceIndex(t);
         if (leftmostSource < spanStart) {
           overhang = leftmostSource - spanStart;
         } else {
           int rightmostSource = a.getMaxSourceIndex(t);
           if (rightmostSource > spanEnd) {
             overhang = rightmostSource - spanEnd;
           }
         }
       }

       if (overhang < 0) {
         // Widening the end cannot absorb a link to the left: try the next start.
         break;
       }
       if (overhang > 0) {
         // Jump straight to the source index that the offending link points to.
         spanEnd += overhang;
         continue;
       }

       // No link leaves the sub-span: extract the one- and two-nonterminal rules from it.
       int targetLow = targetSpan.getFirst();
       int targetHigh = targetSpan.getSecond();
       collected.addAll(
           extractRulesOneNonTerminal(spanStart, spanEnd, targetLow, targetHigh, a, sp));
       collected.addAll(
           extractRulesTwoNonTerminal(spanStart, spanEnd, targetLow, targetHigh, a, sp));
       spanEnd++;
     }
     spanStart++;
   }
   return collected;
 }
Ejemplo n.º 3
0
 /**
  * Builds the rules that contain a single sub-span inside the source span {@code
  * [sourceStartIndex, sourceEndIndex]} (inclusive bounds).
  *
  * <p>A sub-span yields a rule when both of its boundary words are aligned, no target index in
  * its target range is aligned outside it, it is not identical to the enclosing span, and {@code
  * filterPassOneNonTerminalRule} accepts it.
  *
  * @param sourceStartIndex first source index of the enclosing span
  * @param sourceEndIndex last source index of the enclosing span
  * @param minTargetIndex lower target bound of the enclosing span
  * @param maxTargetIndex upper target bound of the enclosing span
  * @param a alignment queried for aligned-ness and min/max aligned source indices
  * @param sp sentence pair, forwarded to the {@code Rule} constructor
  * @return the rules found, possibly empty
  */
 private List<Rule> extractRulesOneNonTerminal(
     int sourceStartIndex,
     int sourceEndIndex,
     int minTargetIndex,
     int maxTargetIndex,
     Alignment a,
     SentencePair sp) {
   List<Rule> rules = new ArrayList<Rule>();
   for (int gapStart = sourceStartIndex; gapStart <= sourceEndIndex; gapStart++) {
     int gapEnd = gapStart;
     while (gapEnd <= sourceEndIndex) {
       boolean boundariesAligned = a.isSourceAligned(gapStart) && a.isSourceAligned(gapEnd);
       if (!boundariesAligned) {
         break;
       }

       Pair<Integer, Integer> gapTarget = updateTargetLimit(a, gapStart, gapEnd);

       // overhang != 0 as soon as a target index is aligned outside [gapStart, gapEnd]:
       // negative for a link to the left, positive for a link to the right.
       int overhang = 0;
       for (int t = gapTarget.getFirst(); overhang == 0 && t <= gapTarget.getSecond(); t++) {
         int leftmostSource = a.getMinSourceIndex(t);
         if (leftmostSource < gapStart) {
           overhang = leftmostSource - gapStart;
         } else {
           int rightmostSource = a.getMaxSourceIndex(t);
           if (rightmostSource > gapEnd) {
             overhang = rightmostSource - gapEnd;
           }
         }
       }

       // Give up on this start when the sub-span already maps onto the enclosing span's whole
       // target range, or when a link reaches left of the start.
       boolean spansWholeTarget =
           gapTarget.getFirst() == minTargetIndex && gapTarget.getSecond() == maxTargetIndex;
       if (spansWholeTarget || overhang < 0) {
         break;
       }

       if (overhang > 0) {
         // Jump the end directly to the source index the offending link points to.
         gapEnd += overhang;
         continue;
       }

       boolean properSubSpan = gapStart != sourceStartIndex || gapEnd != sourceEndIndex;
       if (properSubSpan
           && filterPassOneNonTerminalRule(sourceStartIndex, sourceEndIndex, gapStart, gapEnd)) {
         rules.add(
             new Rule(
                 sourceStartIndex,
                 sourceEndIndex,
                 minTargetIndex,
                 maxTargetIndex,
                 gapStart,
                 gapEnd,
                 gapTarget.getFirst(),
                 gapTarget.getSecond(),
                 sp,
                 a));
       }
       gapEnd++;
     }
   }
   return rules;
 }
Ejemplo n.º 4
0
  /**
   * Extracts phrase pairs from an alignment and a sentence pair.
   *
   * <p>For every source span of at most {@code MAX_SOURCE_PHRASE} words, the smallest target range
   * covering the words aligned to the span is tracked incrementally. The pair is kept when no
   * aligned target word inside that range is linked to a source position outside the span. Each
   * kept pair contributes one {@link Rule} plus whatever {@code extendUnalignedBoundaryWord}
   * returns for it.
   *
   * @param a alignment queried for the min/max aligned indices in both directions
   * @param sp sentence pair; its source length bounds the scan and it is forwarded to the {@code
   *     Rule} constructor
   * @return the extracted rules, in the order the source spans were visited
   */
  private List<Rule> extractPhrasePairs(Alignment a, SentencePair sp) {
    List<Rule> res = new ArrayList<Rule>();
    int sourceLength = sp.getSource().getWords().length;

    // loop over source index (beginning of phrase)
    for (int sourceStartIndex = 0; sourceStartIndex < sourceLength; sourceStartIndex++) {
      // minimum and maximum target index aligned to the source phrase seen so far
      int minTargetIndex = a.getMinTargetIndex(sourceStartIndex);
      int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex);
      int sourceEndLimit = Math.min(sourceStartIndex + MAX_SOURCE_PHRASE, sourceLength);

      // loop over source index (end of phrase)
      for (int sourceEndIndex = sourceStartIndex;
          sourceEndIndex < sourceEndLimit;
          sourceEndIndex++) {
        // widen the target range with the links of the newly added source word
        minTargetIndex = Math.min(minTargetIndex, a.getMinTargetIndex(sourceEndIndex));
        maxTargetIndex = Math.max(maxTargetIndex, a.getMaxTargetIndex(sourceEndIndex));
        if (minTargetIndex > maxTargetIndex) {
          // occurs when no aligned word has been found yet
          continue;
        }

        // check that the target range [minTargetIndex, maxTargetIndex] and the source span are
        // consistent with the alignment: no aligned target word may link outside the span
        boolean consistent = true;
        for (int targetIndex = minTargetIndex; targetIndex <= maxTargetIndex; targetIndex++) {
          if (a.isTargetAligned(targetIndex)
              && (a.getMinSourceIndex(targetIndex) < sourceStartIndex
                  || a.getMaxSourceIndex(targetIndex) > sourceEndIndex)) {
            consistent = false;
            break;
          }
        }

        // we found a phrase pair
        if (consistent) {
          // TODO the rule may be constructed on the fly
          // to avoid duplicated logic as well as duplicated loops.
          res.add(
              new Rule(sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sp, a));
          res.addAll(
              extendUnalignedBoundaryWord(
                  sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, a, sp));
        }
      }
    }
    return res;
  }