/**
 * Grows a candidate block until it is closed under the alignment links that point forward.
 *
 * <p>The start indices are never changed; only the two end indices can increase. If either start
 * index lies past the end of its sentence, or either start word is unaligned, the indices are
 * returned unchanged in a new {@code Block}.
 *
 * @param sourceStartIndex first source position of the candidate block
 * @param sourceEndIndex last source position of the candidate block (inclusive)
 * @param targetStartIndex first target position of the candidate block
 * @param targetEndIndex last target position of the candidate block (inclusive)
 * @param a word alignment queried for aligned positions and maximum aligned indices
 * @param sp sentence pair providing the source and target sentence lengths
 * @return a block with the same start indices and possibly enlarged end indices
 */
private Block findNextBlock(
    int sourceStartIndex,
    int sourceEndIndex,
    int targetStartIndex,
    int targetEndIndex,
    Alignment a,
    SentencePair sp) {
  // Expansion is only attempted when both start positions exist and are aligned.
  boolean expandable =
      targetStartIndex < sp.getTarget().getWords().length
          && sourceStartIndex < sp.getSource().getWords().length
          && a.isTargetAligned(targetStartIndex)
          && a.isSourceAligned(sourceStartIndex);
  if (!expandable) {
    return new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex);
  }

  int lastSource = sourceEndIndex;
  int lastTarget = targetEndIndex;
  boolean grew;
  do {
    grew = false;
    // Push the target end out to the furthest target word linked to any source word in the span.
    for (int s = sourceStartIndex; s <= lastSource; s++) {
      int furthestTarget = a.getMaxTargetIndex(s);
      if (furthestTarget > lastTarget) {
        lastTarget = furthestTarget;
        grew = true;
      }
    }
    // Push the source end out to the furthest source word linked to any target word in the span.
    for (int t = targetStartIndex; t <= lastTarget; t++) {
      int furthestSource = a.getMaxSourceIndex(t);
      if (furthestSource > lastSource) {
        lastSource = furthestSource;
        grew = true;
      }
    }
    // Repeat until a full pass leaves both ends untouched.
  } while (grew);

  return new Block(sourceStartIndex, lastSource, targetStartIndex, lastTarget);
}
/** * This method extends an extracted phrase pair with unaligned boundary words on the target side * * @param sourceStartIndex * @param sourceEndIndex * @param targetStartIndex * @param targetEndIndex * @param a * @param sp * @return */ private List<Rule> extendUnalignedBoundaryWord( int sourceStartIndex, int sourceEndIndex, int targetStartIndex, int targetEndIndex, Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); int prev = 0; // number of previous target unaligned words int targetExtendIndex = targetStartIndex - 1; // i>0 because i=0 is a position reserved for NULL. In this version, we // don't make // use of the alignment to NULL because we use symmetrized alignments // but that could // be changed while (targetExtendIndex >= 0 && !a.isTargetAligned(targetExtendIndex)) { prev++; Rule r = new Rule(sourceStartIndex, sourceEndIndex, targetExtendIndex, targetEndIndex, sp, a); res.add(r); targetExtendIndex--; } int foll = 0; // number of following target unaligned words targetExtendIndex = targetEndIndex + 1; while (targetExtendIndex < sp.getTarget().getWords().length && !a.isTargetAligned(targetExtendIndex)) { foll++; Rule r = new Rule(sourceStartIndex, sourceEndIndex, targetStartIndex, targetExtendIndex, sp, a); res.add(r); targetExtendIndex++; } if (prev > 0 && foll > 0) { // if there are unaligned words in both // sides: for (targetExtendIndex = 1; targetExtendIndex <= prev; targetExtendIndex++) { // for // each // start // (including // at // least // one // previous) int start = targetStartIndex - targetExtendIndex; for (int k = 1; k <= foll; k++) { // for each end (including at // least one following) int end = targetEndIndex + k; Rule r = new Rule(sourceStartIndex, sourceEndIndex, start, end, sp, a); res.add(r); } } } return res; }
private List<Block> getRegularBlocks(Alignment a, SentencePair sp) { List<Block> res = new ArrayList<Block>(); int sourceStartIndex = 0; int sourceEndIndex = 0; int targetStartIndex = 0; int targetEndIndex = 0; while (sourceStartIndex <= sp.getSource().getWords().length && targetStartIndex <= sp.getTarget().getWords().length) { Block next = findNextBlock(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex, a, sp); sourceStartIndex = next.sourceStartIndex; sourceEndIndex = next.sourceEndIndex; targetStartIndex = next.targetStartIndex; targetEndIndex = next.targetEndIndex; if (targetStartIndex >= sp.getTarget().getWords().length && sourceStartIndex >= sp.getSource().getWords().length) { // do nothing } else if (targetStartIndex < sp.getTarget().getWords().length && !a.isTargetAligned(targetStartIndex)) { sourceEndIndex--; } else if (sourceStartIndex < sp.getSource().getWords().length && !a.isSourceAligned(sourceStartIndex)) { targetEndIndex--; } if (targetStartIndex >= sp.getTarget().getWords().length || sourceStartIndex >= sp.getSource().getWords().length) { break; } if (sourceStartIndex <= sourceEndIndex && targetStartIndex <= targetEndIndex) { res.add(new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex)); } sourceStartIndex = sourceEndIndex + 1; targetStartIndex = targetEndIndex + 1; sourceEndIndex = sourceStartIndex; targetEndIndex = targetStartIndex; } return res; }
/** * This method extract phrase pairs from a Viterbi alignment and a sentence pair. Protected for * testing. * * @param a * @param sp * @return */ private List<Rule> extractPhrasePairs(Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); // loop over source index (beginning of phrase) for (int sourceStartIndex = 0; sourceStartIndex < sp.getSource().getWords().length; sourceStartIndex++) { // source phrase built on the fly List<Integer> sourcePhrase = new ArrayList<Integer>(); // maintain the minimum and maximum target index aligned to the // source phrase int minTargetIndex = a.getMinTargetIndex(sourceStartIndex); int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex); // loop over source index (end of phrase) for (int sourceEndIndex = sourceStartIndex; sourceEndIndex < Math.min(sourceStartIndex + MAX_SOURCE_PHRASE, sp.getSource().getWords().length); sourceEndIndex++) { // update the sourcePhrase sourcePhrase.add(sp.getSource().getWords()[sourceEndIndex]); // update minimum and maximum target index aligned to the source // phrase int minTargetIndexCandidate = a.getMinTargetIndex(sourceEndIndex); int maxTargetIndexCandidate = a.getMaxTargetIndex(sourceEndIndex); if (minTargetIndexCandidate < minTargetIndex) minTargetIndex = minTargetIndexCandidate; if (maxTargetIndexCandidate > maxTargetIndex) maxTargetIndex = maxTargetIndexCandidate; if (minTargetIndex > maxTargetIndex) // occurs when haven't // found any aligned // word // yet continue; // check if the target phrase between positions minTargetIndex // and maxTargetIndex and the source phrase are consistent with // the alignment boolean consistent = true; List<Integer> targetPhrase = new ArrayList<Integer>(); for (int targetIndex = minTargetIndex; targetIndex <= maxTargetIndex; targetIndex++) { targetPhrase.add(sp.getTarget().getWords()[targetIndex]); if (a.isTargetAligned(targetIndex) && (a.getMinSourceIndex(targetIndex) < sourceStartIndex || a.getMaxSourceIndex(targetIndex) > 
sourceEndIndex)) { consistent = false; break; } } // we found a phrase pair if (consistent) { // TODO the rule may be constructed on the fly // to avoid duplicated logic as well as duplicated loops. Rule r = new Rule(sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sp, a); res.add(r); List<Rule> extendedUnaligned = extendUnalignedBoundaryWord( sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, a, sp); res.addAll(extendedUnaligned); } } } return res; }