private List<Rule> extractRulesTwoNonTerminal( int sourceStartIndex, int sourceEndIndex, int minTargetIndex, int maxTargetIndex, Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); if (sourceEndIndex - sourceStartIndex < 2) { // we want at least 2 // source words to // extract // a rule with two // nonterminals return res; } for (int sourceStartIndexX = sourceStartIndex; sourceStartIndexX < sourceEndIndex - 1; sourceStartIndexX++) { for (int sourceEndIndexX = sourceStartIndexX; sourceEndIndexX < sourceEndIndex - 1; sourceEndIndexX++) { if (!a.isSourceAligned(sourceStartIndexX) || !a.isSourceAligned(sourceEndIndexX)) break; Pair<Integer, Integer> targetLimit = updateTargetLimit(a, sourceStartIndexX, sourceEndIndexX); int offset = 0; for (int targetIndex = targetLimit.getFirst(); targetIndex <= targetLimit.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX) { offset = a.getMinSourceIndex(targetIndex) - sourceStartIndexX; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX) { offset = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX; break; } } if (targetLimit.getFirst() == minTargetIndex && targetLimit.getSecond() == maxTargetIndex) { break; } if (offset < 0) { break; } else if (offset > 0) { sourceEndIndexX = sourceEndIndexX + offset - 1; } else { for (int sourceStartIndexX2 = sourceEndIndexX + 2; sourceStartIndexX2 <= sourceEndIndex; sourceStartIndexX2++) { for (int sourceEndIndexX2 = sourceStartIndexX2; sourceEndIndexX2 <= sourceEndIndex; sourceEndIndexX2++) { if (!a.isSourceAligned(sourceStartIndexX2) || !a.isSourceAligned(sourceEndIndexX2)) { break; } Pair<Integer, Integer> targetLimitX2 = updateTargetLimit(a, sourceStartIndexX2, sourceEndIndexX2); int offset2 = 0; for (int targetIndex = targetLimitX2.getFirst(); targetIndex <= targetLimitX2.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX2) { offset2 = a.getMinSourceIndex(targetIndex) - 
sourceStartIndexX2; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX2) { offset2 = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX2; break; } } if (offset2 < 0) break; else if (offset2 > 0) { sourceEndIndexX2 = sourceEndIndexX2 + offset2 - 1; } else { if (filterPassTwoNonTerminalRule( sourceStartIndex, sourceEndIndex, sourceStartIndexX, sourceEndIndexX, sourceStartIndexX2, sourceEndIndexX2, a)) { Rule r = new Rule( sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sourceStartIndexX, sourceEndIndexX, targetLimit.getFirst(), targetLimit.getSecond(), sourceStartIndexX2, sourceEndIndexX2, targetLimitX2.getFirst(), targetLimitX2.getSecond(), sp, a); res.add(r); } } } } } } } return res; }
private List<Rule> extractInternalBlockRules(Block regularBlock, Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); for (int sourceStartIndex = regularBlock.sourceStartIndex; sourceStartIndex <= regularBlock.sourceEndIndex; sourceStartIndex++) { for (int sourceEndIndex = sourceStartIndex + 1; sourceEndIndex <= regularBlock.sourceEndIndex; sourceEndIndex++) { // check links and update target limit Pair<Integer, Integer> targetLimit = updateTargetLimit(a, sourceStartIndex, sourceEndIndex); int offset = 0; for (int targetIndex = targetLimit.getFirst(); targetIndex <= targetLimit.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndex) { offset = a.getMinSourceIndex(targetIndex) - sourceStartIndex; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndex) { offset = a.getMaxSourceIndex(targetIndex) - sourceEndIndex; break; } } if (offset < 0) { break; // if negative offset, jump to another // sourceStartIndex (end the sourceEndIndex for // loop) } else if (offset > 0) { sourceEndIndex = sourceEndIndex + offset - 1; // if // positive, // add // offset // to jump // to // the // adequate // sourceEndIndex } else { // zero offset, found a plausible subregion // System.err.println("Extracting rules one nonterminal " // + // sourceStartIndex + " " + sourceEndIndex + " " + // minTargetIndex + " " + maxTargetIndex); res.addAll( extractRulesOneNonTerminal( sourceStartIndex, sourceEndIndex, targetLimit.getFirst(), targetLimit.getSecond(), a, sp)); res.addAll( extractRulesTwoNonTerminal( sourceStartIndex, sourceEndIndex, targetLimit.getFirst(), targetLimit.getSecond(), a, sp)); } } } return res; }
/**
 * Collects the rules with a single nonterminal (gap) that can be built inside the source span
 * [sourceStartIndex, sourceEndIndex].
 *
 * <p>A gap is only used when both of its boundary words are aligned and no target position
 * inside its target limits reports (through {@code getMinSourceIndex}/{@code
 * getMaxSourceIndex}) a source index outside the gap. A gap identical to the whole span is
 * skipped, and the remaining candidates are screened by {@code filterPassOneNonTerminalRule}.
 *
 * @param sourceStartIndex first source position of the enclosing span
 * @param sourceEndIndex last source position (inclusive) of the enclosing span
 * @param minTargetIndex lower target limit of the enclosing span
 * @param maxTargetIndex upper target limit of the enclosing span
 * @param a alignment queried for links
 * @param sp sentence pair handed over to every created rule
 * @return the extracted rules, possibly empty
 */
private List<Rule> extractRulesOneNonTerminal(
    int sourceStartIndex,
    int sourceEndIndex,
    int minTargetIndex,
    int maxTargetIndex,
    Alignment a,
    SentencePair sp) {
  List<Rule> rules = new ArrayList<Rule>();
  for (int gapStart = sourceStartIndex; gapStart <= sourceEndIndex; gapStart++) {
    for (int gapEnd = gapStart; gapEnd <= sourceEndIndex; gapEnd++) {
      // Both boundary words of the gap have to be aligned.
      if (!(a.isSourceAligned(gapStart) && a.isSourceAligned(gapEnd))) {
        break;
      }
      Pair<Integer, Integer> gapTarget = updateTargetLimit(a, gapStart, gapEnd);

      // Signed distance to the first link that leaves the gap; stays 0 when there is none.
      int shift = 0;
      for (int t = gapTarget.getFirst(); t <= gapTarget.getSecond(); t++) {
        int lowestSource = a.getMinSourceIndex(t);
        if (lowestSource < gapStart) {
          shift = lowestSource - gapStart;
          break;
        }
        int highestSource = a.getMaxSourceIndex(t);
        if (highestSource > gapEnd) {
          shift = highestSource - gapEnd;
          break;
        }
      }

      // Stop widening once the gap's target limits coincide with those of the whole span.
      if (minTargetIndex == gapTarget.getFirst() && maxTargetIndex == gapTarget.getSecond()) {
        break;
      }
      if (shift < 0) {
        // A link points before the gap start: move on to the next gap start.
        break;
      }
      if (shift > 0) {
        // A link points past the gap end: jump ahead (the loop increment adds the final +1).
        gapEnd += shift - 1;
        continue;
      }

      boolean coversWholeSpan = gapStart == sourceStartIndex && gapEnd == sourceEndIndex;
      if (!coversWholeSpan
          && filterPassOneNonTerminalRule(sourceStartIndex, sourceEndIndex, gapStart, gapEnd)) {
        rules.add(
            new Rule(
                sourceStartIndex,
                sourceEndIndex,
                minTargetIndex,
                maxTargetIndex,
                gapStart,
                gapEnd,
                gapTarget.getFirst(),
                gapTarget.getSecond(),
                sp,
                a));
      }
    }
  }
  return rules;
}
/** * This method extract phrase pairs from a Viterbi alignment and a sentence pair. Protected for * testing. * * @param a * @param sp * @return */ private List<Rule> extractPhrasePairs(Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); // loop over source index (beginning of phrase) for (int sourceStartIndex = 0; sourceStartIndex < sp.getSource().getWords().length; sourceStartIndex++) { // source phrase built on the fly List<Integer> sourcePhrase = new ArrayList<Integer>(); // maintain the minimum and maximum target index aligned to the // source phrase int minTargetIndex = a.getMinTargetIndex(sourceStartIndex); int maxTargetIndex = a.getMaxTargetIndex(sourceStartIndex); // loop over source index (end of phrase) for (int sourceEndIndex = sourceStartIndex; sourceEndIndex < Math.min(sourceStartIndex + MAX_SOURCE_PHRASE, sp.getSource().getWords().length); sourceEndIndex++) { // update the sourcePhrase sourcePhrase.add(sp.getSource().getWords()[sourceEndIndex]); // update minimum and maximum target index aligned to the source // phrase int minTargetIndexCandidate = a.getMinTargetIndex(sourceEndIndex); int maxTargetIndexCandidate = a.getMaxTargetIndex(sourceEndIndex); if (minTargetIndexCandidate < minTargetIndex) minTargetIndex = minTargetIndexCandidate; if (maxTargetIndexCandidate > maxTargetIndex) maxTargetIndex = maxTargetIndexCandidate; if (minTargetIndex > maxTargetIndex) // occurs when haven't // found any aligned // word // yet continue; // check if the target phrase between positions minTargetIndex // and maxTargetIndex and the source phrase are consistent with // the alignment boolean consistent = true; List<Integer> targetPhrase = new ArrayList<Integer>(); for (int targetIndex = minTargetIndex; targetIndex <= maxTargetIndex; targetIndex++) { targetPhrase.add(sp.getTarget().getWords()[targetIndex]); if (a.isTargetAligned(targetIndex) && (a.getMinSourceIndex(targetIndex) < sourceStartIndex || a.getMaxSourceIndex(targetIndex) > 
sourceEndIndex)) { consistent = false; break; } } // we found a phrase pair if (consistent) { // TODO the rule may be constructed on the fly // to avoid duplicated logic as well as duplicated loops. Rule r = new Rule(sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sp, a); res.add(r); List<Rule> extendedUnaligned = extendUnalignedBoundaryWord( sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, a, sp); res.addAll(extendedUnaligned); } } } return res; }