/** * This method extends an extracted phrase pair with unaligned boundary words on the target side * * @param sourceStartIndex * @param sourceEndIndex * @param targetStartIndex * @param targetEndIndex * @param a * @param sp * @return */ private List<Rule> extendUnalignedBoundaryWord( int sourceStartIndex, int sourceEndIndex, int targetStartIndex, int targetEndIndex, Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); int prev = 0; // number of previous target unaligned words int targetExtendIndex = targetStartIndex - 1; // i>0 because i=0 is a position reserved for NULL. In this version, we // don't make // use of the alignment to NULL because we use symmetrized alignments // but that could // be changed while (targetExtendIndex >= 0 && !a.isTargetAligned(targetExtendIndex)) { prev++; Rule r = new Rule(sourceStartIndex, sourceEndIndex, targetExtendIndex, targetEndIndex, sp, a); res.add(r); targetExtendIndex--; } int foll = 0; // number of following target unaligned words targetExtendIndex = targetEndIndex + 1; while (targetExtendIndex < sp.getTarget().getWords().length && !a.isTargetAligned(targetExtendIndex)) { foll++; Rule r = new Rule(sourceStartIndex, sourceEndIndex, targetStartIndex, targetExtendIndex, sp, a); res.add(r); targetExtendIndex++; } if (prev > 0 && foll > 0) { // if there are unaligned words in both // sides: for (targetExtendIndex = 1; targetExtendIndex <= prev; targetExtendIndex++) { // for // each // start // (including // at // least // one // previous) int start = targetStartIndex - targetExtendIndex; for (int k = 1; k <= foll; k++) { // for each end (including at // least one following) int end = targetEndIndex + k; Rule r = new Rule(sourceStartIndex, sourceEndIndex, start, end, sp, a); res.add(r); } } } return res; }
/**
 * Computes the target range covered by the alignment links of a source span.
 *
 * @param a alignment to query
 * @param sourceStartIndex first source position of the span
 * @param sourceEndIndex last source position of the span
 * @return a pair whose first element is the smallest {@code getMinTargetIndex} value and whose
 *     second element is the largest {@code getMaxTargetIndex} value over the span; when
 *     {@code sourceEndIndex < sourceStartIndex} these are the values for {@code sourceStartIndex}
 */
private Pair<Integer, Integer> updateTargetLimit(
    Alignment a, int sourceStartIndex, int sourceEndIndex) {
  // Seed with the first position so that an empty span still yields a defined result.
  int lowest = a.getMinTargetIndex(sourceStartIndex);
  int highest = a.getMaxTargetIndex(sourceStartIndex);
  for (int position = sourceStartIndex; position <= sourceEndIndex; position++) {
    lowest = Math.min(lowest, a.getMinTargetIndex(position));
    highest = Math.max(highest, a.getMaxTargetIndex(position));
  }

  Pair<Integer, Integer> limit = new Pair<Integer, Integer>();
  limit.setFirst(lowest);
  limit.setSecond(highest);
  return limit;
}
private List<Block> getRegularBlocks(Alignment a, SentencePair sp) { List<Block> res = new ArrayList<Block>(); int sourceStartIndex = 0; int sourceEndIndex = 0; int targetStartIndex = 0; int targetEndIndex = 0; while (sourceStartIndex <= sp.getSource().getWords().length && targetStartIndex <= sp.getTarget().getWords().length) { Block next = findNextBlock(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex, a, sp); sourceStartIndex = next.sourceStartIndex; sourceEndIndex = next.sourceEndIndex; targetStartIndex = next.targetStartIndex; targetEndIndex = next.targetEndIndex; if (targetStartIndex >= sp.getTarget().getWords().length && sourceStartIndex >= sp.getSource().getWords().length) { // do nothing } else if (targetStartIndex < sp.getTarget().getWords().length && !a.isTargetAligned(targetStartIndex)) { sourceEndIndex--; } else if (sourceStartIndex < sp.getSource().getWords().length && !a.isSourceAligned(sourceStartIndex)) { targetEndIndex--; } if (targetStartIndex >= sp.getTarget().getWords().length || sourceStartIndex >= sp.getSource().getWords().length) { break; } if (sourceStartIndex <= sourceEndIndex && targetStartIndex <= targetEndIndex) { res.add(new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex)); } sourceStartIndex = sourceEndIndex + 1; targetStartIndex = targetEndIndex + 1; sourceEndIndex = sourceStartIndex; targetEndIndex = targetStartIndex; } return res; }
/**
 * Grows the given source/target spans to the right until neither side links beyond the other.
 *
 * <p>If either start index lies outside its sentence, or either start word is unaligned, the
 * spans are returned unchanged. Otherwise the end indices are repeatedly pushed to the largest
 * target index linked from the source span and the largest source index linked from the target
 * span, until a full pass changes nothing. Start indices are never modified.
 *
 * @param sourceStartIndex first source position of the candidate block
 * @param sourceEndIndex initial last source position
 * @param targetStartIndex first target position of the candidate block
 * @param targetEndIndex initial last target position
 * @param a alignment between the two sentences
 * @param sp sentence pair providing the sentence lengths
 * @return a block with the original start indices and the (possibly enlarged) end indices
 */
private Block findNextBlock(
    int sourceStartIndex,
    int sourceEndIndex,
    int targetStartIndex,
    int targetEndIndex,
    Alignment a,
    SentencePair sp) {
  boolean startsInsideSentences =
      targetStartIndex < sp.getTarget().getWords().length
          && sourceStartIndex < sp.getSource().getWords().length;
  if (!startsInsideSentences
      || !a.isTargetAligned(targetStartIndex)
      || !a.isSourceAligned(sourceStartIndex)) {
    return new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex);
  }

  boolean grown;
  do {
    grown = false;
    // Push the target end out to the furthest target word linked from the source span.
    for (int s = sourceStartIndex; s <= sourceEndIndex; s++) {
      int furthestTarget = a.getMaxTargetIndex(s);
      if (furthestTarget > targetEndIndex) {
        targetEndIndex = furthestTarget;
        grown = true;
      }
    }
    // Push the source end out to the furthest source word linked from the target span.
    for (int t = targetStartIndex; t <= targetEndIndex; t++) {
      int furthestSource = a.getMaxSourceIndex(t);
      if (furthestSource > sourceEndIndex) {
        sourceEndIndex = furthestSource;
        grown = true;
      }
    }
  } while (grown);

  return new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex);
}
/**
 * Decides whether a candidate rule with two non-terminals (X and X2) passes the extraction
 * filters.
 *
 * <p>The rule passes when all of the following hold: at least one source word strictly between
 * X and X2 is aligned; the number of source elements left after replacing each non-terminal span
 * by a single symbol is at most {@code MAX_SOURCE_ELEMENTS}; the distances
 * {@code sourceStartIndexX - sourceStartIndex}, {@code sourceStartIndexX2 - sourceEndIndexX} and
 * {@code sourceEndIndex - sourceEndIndexX2} are each at most {@code MAX_TERMINAL_LENGTH}; both
 * non-terminal spans are at most {@code MAX_NONTERMINAL_SPAN} words long; and, when
 * {@code REMOVE_MONOTONIC_REPEATS} is set, {@code isMonotonicRepeatTwoNonterminal} does not flag
 * the rule.
 *
 * @param sourceStartIndex first source position of the whole rule
 * @param sourceEndIndex last source position of the whole rule
 * @param sourceStartIndexX first source position of the first non-terminal
 * @param sourceEndIndexX last source position of the first non-terminal
 * @param sourceStartIndexX2 first source position of the second non-terminal
 * @param sourceEndIndexX2 last source position of the second non-terminal
 * @param a alignment used to test the words between the two non-terminals
 * @return {@code true} if the rule should be kept
 */
private boolean filterPassTwoNonTerminalRule(
    int sourceStartIndex,
    int sourceEndIndex,
    int sourceStartIndexX,
    int sourceEndIndexX,
    int sourceStartIndexX2,
    int sourceEndIndexX2,
    Alignment a) {
  // Look for an aligned word strictly between the two non-terminals; stop at the first hit.
  boolean middleTerminalAligned = false;
  for (int gap = sourceEndIndexX + 1;
      gap < sourceStartIndexX2 && !middleTerminalAligned;
      gap++) {
    middleTerminalAligned = a.isSourceAligned(gap);
  }
  if (!middleTerminalAligned) {
    return false;
  }

  int ruleSpan = sourceEndIndex - sourceStartIndex + 1;
  int spanX = sourceEndIndexX - sourceStartIndexX + 1;
  int spanX2 = sourceEndIndexX2 - sourceStartIndexX2 + 1;
  // Each non-terminal span collapses to one symbol, hence the "+ 2".
  int sourceElements = ruleSpan - spanX - spanX2 + 2;

  boolean withinLimits =
      sourceElements <= MAX_SOURCE_ELEMENTS
          && sourceStartIndexX - sourceStartIndex <= MAX_TERMINAL_LENGTH
          && sourceStartIndexX2 - sourceEndIndexX <= MAX_TERMINAL_LENGTH
          && sourceEndIndex - sourceEndIndexX2 <= MAX_TERMINAL_LENGTH
          && spanX <= MAX_NONTERMINAL_SPAN
          && spanX2 <= MAX_NONTERMINAL_SPAN;
  if (!withinLimits) {
    return false;
  }

  return !(REMOVE_MONOTONIC_REPEATS
      && isMonotonicRepeatTwoNonterminal(
          sourceStartIndex,
          sourceEndIndex,
          sourceStartIndexX,
          sourceEndIndexX,
          sourceStartIndexX2,
          sourceEndIndexX2));
}
private List<Rule> extractRulesTwoNonTerminal( int sourceStartIndex, int sourceEndIndex, int minTargetIndex, int maxTargetIndex, Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); if (sourceEndIndex - sourceStartIndex < 2) { // we want at least 2 // source words to // extract // a rule with two // nonterminals return res; } for (int sourceStartIndexX = sourceStartIndex; sourceStartIndexX < sourceEndIndex - 1; sourceStartIndexX++) { for (int sourceEndIndexX = sourceStartIndexX; sourceEndIndexX < sourceEndIndex - 1; sourceEndIndexX++) { if (!a.isSourceAligned(sourceStartIndexX) || !a.isSourceAligned(sourceEndIndexX)) break; Pair<Integer, Integer> targetLimit = updateTargetLimit(a, sourceStartIndexX, sourceEndIndexX); int offset = 0; for (int targetIndex = targetLimit.getFirst(); targetIndex <= targetLimit.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX) { offset = a.getMinSourceIndex(targetIndex) - sourceStartIndexX; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX) { offset = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX; break; } } if (targetLimit.getFirst() == minTargetIndex && targetLimit.getSecond() == maxTargetIndex) { break; } if (offset < 0) { break; } else if (offset > 0) { sourceEndIndexX = sourceEndIndexX + offset - 1; } else { for (int sourceStartIndexX2 = sourceEndIndexX + 2; sourceStartIndexX2 <= sourceEndIndex; sourceStartIndexX2++) { for (int sourceEndIndexX2 = sourceStartIndexX2; sourceEndIndexX2 <= sourceEndIndex; sourceEndIndexX2++) { if (!a.isSourceAligned(sourceStartIndexX2) || !a.isSourceAligned(sourceEndIndexX2)) { break; } Pair<Integer, Integer> targetLimitX2 = updateTargetLimit(a, sourceStartIndexX2, sourceEndIndexX2); int offset2 = 0; for (int targetIndex = targetLimitX2.getFirst(); targetIndex <= targetLimitX2.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX2) { offset2 = a.getMinSourceIndex(targetIndex) - 
sourceStartIndexX2; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX2) { offset2 = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX2; break; } } if (offset2 < 0) break; else if (offset2 > 0) { sourceEndIndexX2 = sourceEndIndexX2 + offset2 - 1; } else { if (filterPassTwoNonTerminalRule( sourceStartIndex, sourceEndIndex, sourceStartIndexX, sourceEndIndexX, sourceStartIndexX2, sourceEndIndexX2, a)) { Rule r = new Rule( sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sourceStartIndexX, sourceEndIndexX, targetLimit.getFirst(), targetLimit.getSecond(), sourceStartIndexX2, sourceEndIndexX2, targetLimitX2.getFirst(), targetLimitX2.getSecond(), sp, a); res.add(r); } } } } } } } return res; }
/**
 * Extracts every rule with a single non-terminal X from the given phrase pair.
 *
 * <p>X is a source sub-span whose boundary words are aligned and whose linked target words do
 * not link back outside the sub-span. A sub-span whose target range equals the whole phrase
 * pair's target range ends the search for the current X start, and X may not coincide with the
 * full source span. Candidates are screened by {@code filterPassOneNonTerminalRule}.
 *
 * @param sourceStartIndex first source position of the phrase pair
 * @param sourceEndIndex last source position of the phrase pair
 * @param minTargetIndex first target position of the phrase pair
 * @param maxTargetIndex last target position of the phrase pair
 * @param a alignment between the two sentences
 * @param sp sentence pair the phrase pair belongs to
 * @return the extracted rules, possibly empty
 */
private List<Rule> extractRulesOneNonTerminal(
    int sourceStartIndex,
    int sourceEndIndex,
    int minTargetIndex,
    int maxTargetIndex,
    Alignment a,
    SentencePair sp) {
  List<Rule> rules = new ArrayList<Rule>();
  for (int startX = sourceStartIndex; startX <= sourceEndIndex; startX++) {
    for (int endX = startX; endX <= sourceEndIndex; endX++) {
      boolean boundariesAligned = a.isSourceAligned(startX) && a.isSourceAligned(endX);
      if (!boundariesAligned) {
        break;
      }

      Pair<Integer, Integer> targetLimit = updateTargetLimit(a, startX, endX);

      // Find the first target word of X's target range that links outside [startX, endX]:
      // negative offset = links to the left, positive = links to the right, 0 = none.
      int offset = 0;
      for (int t = targetLimit.getFirst(); t <= targetLimit.getSecond(); t++) {
        int leftmostSource = a.getMinSourceIndex(t);
        if (leftmostSource < startX) {
          offset = leftmostSource - startX;
          break;
        }
        int rightmostSource = a.getMaxSourceIndex(t);
        if (rightmostSource > endX) {
          offset = rightmostSource - endX;
          break;
        }
      }

      // X would swallow the whole target side of the phrase pair: give up on this X start.
      if (targetLimit.getFirst() == minTargetIndex && targetLimit.getSecond() == maxTargetIndex) {
        break;
      }
      if (offset < 0) {
        break;
      }
      if (offset > 0) {
        // Jump so that, after the loop increment, X ends at the escaping source position.
        endX += offset - 1;
        continue;
      }

      boolean coversWholeSpan = startX == sourceStartIndex && endX == sourceEndIndex;
      if (!coversWholeSpan
          && filterPassOneNonTerminalRule(sourceStartIndex, sourceEndIndex, startX, endX)) {
        rules.add(
            new Rule(
                sourceStartIndex,
                sourceEndIndex,
                minTargetIndex,
                maxTargetIndex,
                startX,
                endX,
                targetLimit.getFirst(),
                targetLimit.getSecond(),
                sp,
                a));
      }
    }
  }
  return rules;
}
private List<Rule> extractInternalBlockRules(Block regularBlock, Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); for (int sourceStartIndex = regularBlock.sourceStartIndex; sourceStartIndex <= regularBlock.sourceEndIndex; sourceStartIndex++) { for (int sourceEndIndex = sourceStartIndex + 1; sourceEndIndex <= regularBlock.sourceEndIndex; sourceEndIndex++) { // check links and update target limit Pair<Integer, Integer> targetLimit = updateTargetLimit(a, sourceStartIndex, sourceEndIndex); int offset = 0; for (int targetIndex = targetLimit.getFirst(); targetIndex <= targetLimit.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndex) { offset = a.getMinSourceIndex(targetIndex) - sourceStartIndex; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndex) { offset = a.getMaxSourceIndex(targetIndex) - sourceEndIndex; break; } } if (offset < 0) { break; // if negative offset, jump to another // sourceStartIndex (end the sourceEndIndex for // loop) } else if (offset > 0) { sourceEndIndex = sourceEndIndex + offset - 1; // if // positive, // add // offset // to jump // to // the // adequate // sourceEndIndex } else { // zero offset, found a plausible subregion // System.err.println("Extracting rules one nonterminal " // + // sourceStartIndex + " " + sourceEndIndex + " " + // minTargetIndex + " " + maxTargetIndex); res.addAll( extractRulesOneNonTerminal( sourceStartIndex, sourceEndIndex, targetLimit.getFirst(), targetLimit.getSecond(), a, sp)); res.addAll( extractRulesTwoNonTerminal( sourceStartIndex, sourceEndIndex, targetLimit.getFirst(), targetLimit.getSecond(), a, sp)); } } } return res; }
/** * This method extract phrase pairs from a Viterbi alignment and a sentence pair. Protected for * testing. * * @param a * @param sp * @return */ private List<Rule> extractPhrasePairs(Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); // loop over source index (beginning of phrase) for (int sourceStartIndex = 0; sourceStartIndex < sp.getSource().getWords().length; sourceStartIndex++) { // source phrase built on the fly List<Integer> sourcePhrase = new ArrayList<Integer>(); // maintain the minimum and maximum target index aligned to the // source phrase int minTargetIndex = a.getMinTargetIndex(sourceStartIndex); int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex); // loop over source index (end of phrase) for (int sourceEndIndex = sourceStartIndex; sourceEndIndex < Math.min(sourceStartIndex + MAX_SOURCE_PHRASE, sp.getSource().getWords().length); sourceEndIndex++) { // update the sourcePhrase sourcePhrase.add(sp.getSource().getWords()[sourceEndIndex]); // update minimum and maximum target index aligned to the source // phrase int minTargetIndexCandidate = a.getMinTargetIndex(sourceEndIndex); int maxTargetIndexCandidate = a.getMaxTargetIndex(sourceEndIndex); if (minTargetIndexCandidate < minTargetIndex) minTargetIndex = minTargetIndexCandidate; if (maxTargetIndexCandidate > maxTargetIndex) maxTargetIndex = maxTargetIndexCandidate; if (minTargetIndex > maxTargetIndex) // occurs when haven't // found any aligned // word // yet continue; // check if the target phrase between positions minTargetIndex // and maxTargetIndex and the source phrase are consistent with // the alignment boolean consistent = true; List<Integer> targetPhrase = new ArrayList<Integer>(); for (int targetIndex = minTargetIndex; targetIndex <= maxTargetIndex; targetIndex++) { targetPhrase.add(sp.getTarget().getWords()[targetIndex]); if (a.isTargetAligned(targetIndex) && (a.getMinSourceIndex(targetIndex) < sourceStartIndex || a.getMaxSourceIndex(targetIndex) > 
sourceEndIndex)) { consistent = false; break; } } // we found a phrase pair if (consistent) { // TODO the rule may be constructed on the fly // to avoid duplicated logic as well as duplicated loops. Rule r = new Rule(sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sp, a); res.add(r); List<Rule> extendedUnaligned = extendUnalignedBoundaryWord( sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, a, sp); res.addAll(extendedUnaligned); } } } return res; }