Exemplo n.º 1
0
 /**
  * Expands a candidate block until its source and target ranges are closed under the alignment.
  *
  * <p>If either start index lies at or beyond the end of its sentence, or either start position
  * is reported as unaligned by {@code a}, the four indices are returned unchanged. Otherwise the
  * end indices are pushed outwards repeatedly: the target end is raised to the largest target
  * index reported for any source position in the block, and the source end is raised to the
  * largest source index reported for any target position in the block, until a full pass changes
  * nothing. The start indices are never modified.
  *
  * @param sourceStartIndex first source index of the candidate block
  * @param sourceEndIndex initial last source index of the candidate block (inclusive)
  * @param targetStartIndex first target index of the candidate block
  * @param targetEndIndex initial last target index of the candidate block (inclusive)
  * @param a alignment used for the aligned checks and the max-index lookups
  * @param sp sentence pair providing the source and target word arrays (used for their lengths)
  * @return a new {@code Block} with the original start indices and the (possibly grown) ends
  */
 private Block findNextBlock(
     int sourceStartIndex,
     int sourceEndIndex,
     int targetStartIndex,
     int targetEndIndex,
     Alignment a,
     SentencePair sp) {
   // The bounds are tested before the alignment lookups so that those lookups are only reached
   // for start positions inside both sentences.
   boolean startPastSentenceEnd =
       targetStartIndex >= sp.getTarget().getWords().length
           || sourceStartIndex >= sp.getSource().getWords().length;
   if (startPastSentenceEnd
       || !a.isTargetAligned(targetStartIndex)
       || !a.isSourceAligned(sourceStartIndex)) {
     return new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex);
   }

   int lastSource = sourceEndIndex;
   int lastTarget = targetEndIndex;
   boolean grew;
   do {
     grew = false;
     // Source side: any link reaching past the current target end widens the target range.
     for (int s = sourceStartIndex; s <= lastSource; s++) {
       int targetReach = a.getMaxTargetIndex(s);
       if (targetReach > lastTarget) {
         lastTarget = targetReach;
         grew = true;
       }
     }
     // Target side: any link reaching past the current source end widens the source range.
     for (int t = targetStartIndex; t <= lastTarget; t++) {
       int sourceReach = a.getMaxSourceIndex(t);
       if (sourceReach > lastSource) {
         lastSource = sourceReach;
         grew = true;
       }
     }
   } while (grew);

   return new Block(sourceStartIndex, lastSource, targetStartIndex, lastTarget);
 }
Exemplo n.º 2
0
 /**
  * Decides whether a candidate rule with two nonterminal spans passes the extraction filters.
  *
  * <p>All index pairs are treated as inclusive ranges. The candidate passes only if every one of
  * the following holds:
  *
  * <ul>
  *   <li>at least one source position strictly between the first span's end and the second
  *       span's start is aligned;
  *   <li>the rule span length minus both nonterminal span lengths, plus 2, is at most
  *       {@code MAX_SOURCE_ELEMENTS};
  *   <li>{@code sourceStartIndexX - sourceStartIndex}, {@code sourceStartIndexX2 -
  *       sourceEndIndexX} and {@code sourceEndIndex - sourceEndIndexX2} are each at most
  *       {@code MAX_TERMINAL_LENGTH};
  *   <li>each nonterminal span length is at most {@code MAX_NONTERMINAL_SPAN};
  *   <li>if {@code REMOVE_MONOTONIC_REPEATS} is set, {@code isMonotonicRepeatTwoNonterminal}
  *       returns false for the candidate.
  * </ul>
  *
  * @param sourceStartIndex first source index of the whole rule span
  * @param sourceEndIndex last source index of the whole rule span
  * @param sourceStartIndexX first source index of the first nonterminal span
  * @param sourceEndIndexX last source index of the first nonterminal span
  * @param sourceStartIndexX2 first source index of the second nonterminal span
  * @param sourceEndIndexX2 last source index of the second nonterminal span
  * @param a alignment queried for the positions between the two nonterminal spans
  * @return {@code true} if the candidate passes every filter, {@code false} otherwise
  */
 private boolean filterPassTwoNonTerminalRule(
     int sourceStartIndex,
     int sourceEndIndex,
     int sourceStartIndexX,
     int sourceEndIndexX,
     int sourceStartIndexX2,
     int sourceEndIndexX2,
     Alignment a) {
   // Look for an aligned position in the gap between the two nonterminals; stop at the first.
   boolean middleTerminalAligned = false;
   int gapIndex = sourceEndIndexX + 1;
   while (!middleTerminalAligned && gapIndex < sourceStartIndexX2) {
     middleTerminalAligned = a.isSourceAligned(gapIndex);
     gapIndex++;
   }
   if (!middleTerminalAligned) {
     return false;
   }

   int ruleLength = sourceEndIndex - sourceStartIndex + 1;
   int firstSpanLength = sourceEndIndexX - sourceStartIndexX + 1;
   int secondSpanLength = sourceEndIndexX2 - sourceStartIndexX2 + 1;

   // Each nonterminal span is replaced by a single element, hence the "+ 2".
   int sourceElements = ruleLength - firstSpanLength - secondSpanLength + 2;
   if (sourceElements > MAX_SOURCE_ELEMENTS) {
     return false;
   }

   if (sourceStartIndexX - sourceStartIndex > MAX_TERMINAL_LENGTH) {
     return false;
   }
   if (sourceStartIndexX2 - sourceEndIndexX > MAX_TERMINAL_LENGTH) {
     return false;
   }
   if (sourceEndIndex - sourceEndIndexX2 > MAX_TERMINAL_LENGTH) {
     return false;
   }

   if (firstSpanLength > MAX_NONTERMINAL_SPAN) {
     return false;
   }
   if (secondSpanLength > MAX_NONTERMINAL_SPAN) {
     return false;
   }

   // The repeat check is only consulted when the corresponding switch is enabled.
   return !(REMOVE_MONOTONIC_REPEATS
       && isMonotonicRepeatTwoNonterminal(
           sourceStartIndex,
           sourceEndIndex,
           sourceStartIndexX,
           sourceEndIndexX,
           sourceStartIndexX2,
           sourceEndIndexX2));
 }
Exemplo n.º 3
0
  /**
   * Splits a sentence pair into consecutive blocks, scanning both sentences left to right.
   *
   * <p>Starting from position 0 on both sides, each iteration asks {@code findNextBlock} for the
   * block beginning at the current positions. If the block's target start is inside the sentence
   * but unaligned, the source end is pulled back by one; otherwise, if the source start is inside
   * the sentence but unaligned, the target end is pulled back by one. The scan stops as soon as
   * either start index reaches the end of its sentence. A block is recorded only when both of
   * its ranges are non-empty, and the next iteration starts right after the current ends.
   *
   * @param a alignment consulted for the aligned checks and passed on to {@code findNextBlock}
   * @param sp sentence pair whose source and target word arrays give the sentence lengths
   * @return the recorded blocks in left-to-right order; empty if none qualified
   */
  private List<Block> getRegularBlocks(Alignment a, SentencePair sp) {
    final int sourceLength = sp.getSource().getWords().length;
    final int targetLength = sp.getTarget().getWords().length;

    List<Block> blocks = new ArrayList<Block>();
    int srcFrom = 0;
    int srcTo = 0;
    int tgtFrom = 0;
    int tgtTo = 0;

    while (srcFrom <= sourceLength && tgtFrom <= targetLength) {
      Block candidate = findNextBlock(srcFrom, srcTo, tgtFrom, tgtTo, a, sp);
      srcFrom = candidate.sourceStartIndex;
      srcTo = candidate.sourceEndIndex;
      tgtFrom = candidate.targetStartIndex;
      tgtTo = candidate.targetEndIndex;

      boolean sourceExhausted = srcFrom >= sourceLength;
      boolean targetExhausted = tgtFrom >= targetLength;

      // Unless both sides are finished, shrink the opposite side by one position when the
      // current start word on one side has no alignment (target side is checked first).
      if (!(sourceExhausted && targetExhausted)) {
        if (!targetExhausted && !a.isTargetAligned(tgtFrom)) {
          srcTo--;
        } else if (!sourceExhausted && !a.isSourceAligned(srcFrom)) {
          tgtTo--;
        }
      }

      if (sourceExhausted || targetExhausted) {
        break;
      }

      // After the adjustment above a range may have become empty; such blocks are dropped.
      boolean sourceNonEmpty = srcFrom <= srcTo;
      boolean targetNonEmpty = tgtFrom <= tgtTo;
      if (sourceNonEmpty && targetNonEmpty) {
        blocks.add(new Block(srcFrom, srcTo, tgtFrom, tgtTo));
      }

      // Continue immediately after the block just processed, with one-position initial ranges.
      srcFrom = srcTo + 1;
      tgtFrom = tgtTo + 1;
      srcTo = srcFrom;
      tgtTo = tgtFrom;
    }
    return blocks;
  }
Exemplo n.º 4
0
  /**
   * Enumerates rules with two nonterminals inside the given source span.
   *
   * <p>For every candidate first nonterminal span and, after it, every candidate second
   * nonterminal span (starting at least two positions past the first span's end, so that one
   * position is left between them), a {@code Rule} is created when both spans pass the checks
   * below and {@code filterPassTwoNonTerminalRule} accepts the combination.
   *
   * <p>Checks applied to each nonterminal span {@code [start, end]}:
   *
   * <ul>
   *   <li>both {@code start} and {@code end} must be aligned source positions, otherwise the
   *       search over end positions for this start stops;
   *   <li>the target limit is obtained from {@code updateTargetLimit} (presumably the range of
   *       target positions linked to the span; confirm against that helper);
   *   <li>each target position in that limit is examined in order: if its minimum source index
   *       lies before {@code start}, the search for this start stops; if its maximum source
   *       index lies after {@code end}, the end index jumps forward so that the next candidate
   *       ends at that source index.
   * </ul>
   *
   * <p>For the first nonterminal only, the search for the current start also stops when its
   * target limit equals {@code [minTargetIndex, maxTargetIndex]}.
   *
   * @param sourceStartIndex first source index of the rule span (inclusive)
   * @param sourceEndIndex last source index of the rule span (inclusive)
   * @param minTargetIndex first target index of the rule span, passed on to {@code Rule}
   * @param maxTargetIndex last target index of the rule span, passed on to {@code Rule}
   * @param a alignment used for all aligned checks and min/max source index lookups
   * @param sp sentence pair passed on to {@code Rule}
   * @return the extracted rules; empty if the span holds fewer than three source positions or
   *     no candidate qualifies
   */
  private List<Rule> extractRulesTwoNonTerminal(
      int sourceStartIndex,
      int sourceEndIndex,
      int minTargetIndex,
      int maxTargetIndex,
      Alignment a,
      SentencePair sp) {
    List<Rule> rules = new ArrayList<Rule>();
    // end - start >= 2 is required, i.e. at least three source positions in the span.
    if (sourceEndIndex - sourceStartIndex < 2) {
      return rules;
    }

    for (int x1Start = sourceStartIndex; x1Start < sourceEndIndex - 1; x1Start++) {
      for (int x1End = x1Start; x1End < sourceEndIndex - 1; x1End++) {
        if (!(a.isSourceAligned(x1Start) && a.isSourceAligned(x1End))) {
          break;
        }

        Pair<Integer, Integer> x1Target = updateTargetLimit(a, x1Start, x1End);

        // Negative shift: some target word links to the left of the span.
        // Positive shift: some target word links to the right of the span, by that many words.
        int x1Shift = 0;
        for (int t = x1Target.getFirst(); t <= x1Target.getSecond(); t++) {
          int lowestSource = a.getMinSourceIndex(t);
          if (lowestSource < x1Start) {
            x1Shift = lowestSource - x1Start;
            break;
          }
          int highestSource = a.getMaxSourceIndex(t);
          if (highestSource > x1End) {
            x1Shift = highestSource - x1End;
            break;
          }
        }

        if (x1Target.getFirst() == minTargetIndex && x1Target.getSecond() == maxTargetIndex) {
          break;
        }
        if (x1Shift < 0) {
          break;
        }
        if (x1Shift > 0) {
          // Combined with the loop increment, the next candidate ends at the linked position.
          x1End += x1Shift - 1;
          continue;
        }

        // First nonterminal is consistent; enumerate second nonterminals after a one-word gap.
        for (int x2Start = x1End + 2; x2Start <= sourceEndIndex; x2Start++) {
          for (int x2End = x2Start; x2End <= sourceEndIndex; x2End++) {
            if (!(a.isSourceAligned(x2Start) && a.isSourceAligned(x2End))) {
              break;
            }

            Pair<Integer, Integer> x2Target = updateTargetLimit(a, x2Start, x2End);

            int x2Shift = 0;
            for (int t = x2Target.getFirst(); t <= x2Target.getSecond(); t++) {
              int lowestSource = a.getMinSourceIndex(t);
              if (lowestSource < x2Start) {
                x2Shift = lowestSource - x2Start;
                break;
              }
              int highestSource = a.getMaxSourceIndex(t);
              if (highestSource > x2End) {
                x2Shift = highestSource - x2End;
                break;
              }
            }

            if (x2Shift < 0) {
              break;
            }
            if (x2Shift > 0) {
              x2End += x2Shift - 1;
              continue;
            }

            boolean accepted =
                filterPassTwoNonTerminalRule(
                    sourceStartIndex, sourceEndIndex, x1Start, x1End, x2Start, x2End, a);
            if (accepted) {
              rules.add(
                  new Rule(
                      sourceStartIndex,
                      sourceEndIndex,
                      minTargetIndex,
                      maxTargetIndex,
                      x1Start,
                      x1End,
                      x1Target.getFirst(),
                      x1Target.getSecond(),
                      x2Start,
                      x2End,
                      x2Target.getFirst(),
                      x2Target.getSecond(),
                      sp,
                      a));
            }
          }
        }
      }
    }
    return rules;
  }
Exemplo n.º 5
0
 /**
  * Enumerates rules with a single nonterminal inside the given source span.
  *
  * <p>Every pair {@code [start, end]} within the span is considered as the nonterminal span:
  *
  * <ul>
  *   <li>both {@code start} and {@code end} must be aligned source positions, otherwise the
  *       search over end positions for this start stops;
  *   <li>the target limit is obtained from {@code updateTargetLimit} (presumably the range of
  *       target positions linked to the span; confirm against that helper);
  *   <li>the search for this start stops when that limit equals {@code [minTargetIndex,
  *       maxTargetIndex]};
  *   <li>each target position in the limit is examined in order: if its minimum source index
  *       lies before {@code start}, the search for this start stops; if its maximum source
  *       index lies after {@code end}, the end index jumps forward so that the next candidate
  *       ends at that source index;
  *   <li>a nonterminal covering the whole rule span is skipped;
  *   <li>remaining candidates must be accepted by {@code filterPassOneNonTerminalRule}.
  * </ul>
  *
  * @param sourceStartIndex first source index of the rule span (inclusive)
  * @param sourceEndIndex last source index of the rule span (inclusive)
  * @param minTargetIndex first target index of the rule span, passed on to {@code Rule}
  * @param maxTargetIndex last target index of the rule span, passed on to {@code Rule}
  * @param a alignment used for all aligned checks and min/max source index lookups
  * @param sp sentence pair passed on to {@code Rule}
  * @return the extracted rules; empty if no candidate qualifies
  */
 private List<Rule> extractRulesOneNonTerminal(
     int sourceStartIndex,
     int sourceEndIndex,
     int minTargetIndex,
     int maxTargetIndex,
     Alignment a,
     SentencePair sp) {
   List<Rule> rules = new ArrayList<Rule>();

   for (int xStart = sourceStartIndex; xStart <= sourceEndIndex; xStart++) {
     for (int xEnd = xStart; xEnd <= sourceEndIndex; xEnd++) {
       if (!(a.isSourceAligned(xStart) && a.isSourceAligned(xEnd))) {
         break;
       }

       Pair<Integer, Integer> xTarget = updateTargetLimit(a, xStart, xEnd);

       // Negative shift: some target word links to the left of the span.
       // Positive shift: some target word links to the right of the span, by that many words.
       int shift = 0;
       for (int t = xTarget.getFirst(); t <= xTarget.getSecond(); t++) {
         int lowestSource = a.getMinSourceIndex(t);
         if (lowestSource < xStart) {
           shift = lowestSource - xStart;
           break;
         }
         int highestSource = a.getMaxSourceIndex(t);
         if (highestSource > xEnd) {
           shift = highestSource - xEnd;
           break;
         }
       }

       if (xTarget.getFirst() == minTargetIndex && xTarget.getSecond() == maxTargetIndex) {
         break;
       }
       if (shift < 0) {
         break;
       }
       if (shift > 0) {
         // Combined with the loop increment, the next candidate ends at the linked position.
         xEnd += shift - 1;
         continue;
       }

       // A nonterminal spanning the entire rule is never extracted.
       boolean coversWholeSpan = xStart == sourceStartIndex && xEnd == sourceEndIndex;
       if (!coversWholeSpan
           && filterPassOneNonTerminalRule(sourceStartIndex, sourceEndIndex, xStart, xEnd)) {
         rules.add(
             new Rule(
                 sourceStartIndex,
                 sourceEndIndex,
                 minTargetIndex,
                 maxTargetIndex,
                 xStart,
                 xEnd,
                 xTarget.getFirst(),
                 xTarget.getSecond(),
                 sp,
                 a));
       }
     }
   }
   return rules;
 }