/**
 * Grows the end indices of a candidate block until neither side aligns beyond the other.
 *
 * <p>If either start index lies outside its sentence, or is not aligned, the block is returned
 * with the indices exactly as given. Otherwise the target end is pushed out to the largest target
 * index aligned to any source position in the block, the source end is pushed out to the largest
 * source index aligned to any target position in the block, and both steps repeat until a full
 * pass changes nothing. The start indices are never modified.
 *
 * @param sourceStartIndex first source position of the block
 * @param sourceEndIndex initial last source position of the block (inclusive)
 * @param targetStartIndex first target position of the block
 * @param targetEndIndex initial last target position of the block (inclusive)
 * @param a alignment queried for aligned positions
 * @param sp sentence pair whose word arrays bound the start indices
 * @return a block with the original start indices and the (possibly enlarged) end indices
 */
private Block findNextBlock(
    int sourceStartIndex,
    int sourceEndIndex,
    int targetStartIndex,
    int targetEndIndex,
    Alignment a,
    SentencePair sp) {
  // Range checks come first so the alignment is only queried with in-sentence start indices.
  boolean startsUsable =
      targetStartIndex < sp.getTarget().getWords().length
          && sourceStartIndex < sp.getSource().getWords().length
          && a.isTargetAligned(targetStartIndex)
          && a.isSourceAligned(sourceStartIndex);

  if (startsUsable) {
    boolean grew;
    do {
      grew = false;

      // Widen the target side to cover everything the current source span aligns to.
      for (int s = sourceStartIndex; s <= sourceEndIndex; s++) {
        int furthestTarget = a.getMaxTargetIndex(s);
        if (furthestTarget > targetEndIndex) {
          targetEndIndex = furthestTarget;
          grew = true;
        }
      }

      // Widen the source side to cover everything the current target span aligns to.
      for (int t = targetStartIndex; t <= targetEndIndex; t++) {
        int furthestSource = a.getMaxSourceIndex(t);
        if (furthestSource > sourceEndIndex) {
          sourceEndIndex = furthestSource;
          grew = true;
        }
      }
    } while (grew);
  }

  return new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex);
}
/**
 * Computes the target-side limits of a source span.
 *
 * <p>The limits are seeded from {@code sourceStartIndex} and then folded over every source
 * position from {@code sourceStartIndex} to {@code sourceEndIndex} inclusive, so the seed is
 * still returned when the span is empty ({@code sourceEndIndex < sourceStartIndex}).
 *
 * @param a alignment queried for per-position minimum and maximum target indices
 * @param sourceStartIndex first source position of the span
 * @param sourceEndIndex last source position of the span (inclusive)
 * @return a pair whose first element is the smallest minimum target index and whose second
 *     element is the largest maximum target index over the span
 */
private Pair<Integer, Integer> updateTargetLimit(
    Alignment a, int sourceStartIndex, int sourceEndIndex) {
  int lowest = a.getMinTargetIndex(sourceStartIndex);
  int highest = a.getMaxTargetIndex(sourceStartIndex);

  int position = sourceStartIndex;
  while (position <= sourceEndIndex) {
    lowest = Math.min(lowest, a.getMinTargetIndex(position));
    highest = Math.max(highest, a.getMaxTargetIndex(position));
    position++;
  }

  Pair<Integer, Integer> limits = new Pair<Integer, Integer>();
  limits.setFirst(lowest);
  limits.setSecond(highest);
  return limits;
}
/** * This method extract phrase pairs from a Viterbi alignment and a sentence pair. Protected for * testing. * * @param a * @param sp * @return */ private List<Rule> extractPhrasePairs(Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); // loop over source index (beginning of phrase) for (int sourceStartIndex = 0; sourceStartIndex < sp.getSource().getWords().length; sourceStartIndex++) { // source phrase built on the fly List<Integer> sourcePhrase = new ArrayList<Integer>(); // maintain the minimum and maximum target index aligned to the // source phrase int minTargetIndex = a.getMinTargetIndex(sourceStartIndex); int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex); // loop over source index (end of phrase) for (int sourceEndIndex = sourceStartIndex; sourceEndIndex < Math.min(sourceStartIndex + MAX_SOURCE_PHRASE, sp.getSource().getWords().length); sourceEndIndex++) { // update the sourcePhrase sourcePhrase.add(sp.getSource().getWords()[sourceEndIndex]); // update minimum and maximum target index aligned to the source // phrase int minTargetIndexCandidate = a.getMinTargetIndex(sourceEndIndex); int maxTargetIndexCandidate = a.getMaxTargetIndex(sourceEndIndex); if (minTargetIndexCandidate < minTargetIndex) minTargetIndex = minTargetIndexCandidate; if (maxTargetIndexCandidate > maxTargetIndex) maxTargetIndex = maxTargetIndexCandidate; if (minTargetIndex > maxTargetIndex) // occurs when haven't // found any aligned // word // yet continue; // check if the target phrase between positions minTargetIndex // and maxTargetIndex and the source phrase are consistent with // the alignment boolean consistent = true; List<Integer> targetPhrase = new ArrayList<Integer>(); for (int targetIndex = minTargetIndex; targetIndex <= maxTargetIndex; targetIndex++) { targetPhrase.add(sp.getTarget().getWords()[targetIndex]); if (a.isTargetAligned(targetIndex) && (a.getMinSourceIndex(targetIndex) < sourceStartIndex || a.getMaxSourceIndex(targetIndex) > 
sourceEndIndex)) { consistent = false; break; } } // we found a phrase pair if (consistent) { // TODO the rule may be constructed on the fly // to avoid duplicated logic as well as duplicated loops. Rule r = new Rule(sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sp, a); res.add(r); List<Rule> extendedUnaligned = extendUnalignedBoundaryWord( sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, a, sp); res.addAll(extendedUnaligned); } } } return res; }