/**
 * Expands a candidate block to the right until it is closed under the alignment's
 * maximum links, and returns the resulting block.
 *
 * <p>If either start index lies past the end of its sentence, or the word at either
 * start index is unaligned, no expansion is attempted and a block built from the
 * four incoming indices is returned as-is.
 *
 * <p>Otherwise the end indices are grown repeatedly: the target end is pushed to the
 * largest {@code getMaxTargetIndex} of any source position currently in the block, and
 * the source end to the largest {@code getMaxSourceIndex} of any target position
 * currently in the block, until a full pass changes neither. The start indices are
 * never modified.
 *
 * @param sourceStartIndex first source position of the candidate block
 * @param sourceEndIndex   initial last source position (inclusive)
 * @param targetStartIndex first target position of the candidate block
 * @param targetEndIndex   initial last target position (inclusive)
 * @param a                word alignment queried for aligned-ness and max links
 * @param sp               sentence pair supplying the source/target word counts
 * @return a new block with the original starts and the (possibly grown) ends
 */
private Block findNextBlock(
        int sourceStartIndex,
        int sourceEndIndex,
        int targetStartIndex,
        int targetEndIndex,
        Alignment a,
        SentencePair sp) {
    // Bounds are tested before the alignment is queried, so the alignment is never
    // asked about a start index that is past the end of its sentence.
    boolean expandable = targetStartIndex < sp.getTarget().getWords().length
            && sourceStartIndex < sp.getSource().getWords().length
            && a.isTargetAligned(targetStartIndex)
            && a.isSourceAligned(sourceStartIndex);

    if (expandable) {
        boolean grew;
        do {
            grew = false;
            // Source side pulls the target end outwards.
            for (int s = sourceStartIndex; s <= sourceEndIndex; s++) {
                int furthestTarget = a.getMaxTargetIndex(s);
                if (furthestTarget > targetEndIndex) {
                    targetEndIndex = furthestTarget;
                    grew = true;
                }
            }
            // Target side (including any positions just added) pulls the source end outwards.
            for (int t = targetStartIndex; t <= targetEndIndex; t++) {
                int furthestSource = a.getMaxSourceIndex(t);
                if (furthestSource > sourceEndIndex) {
                    sourceEndIndex = furthestSource;
                    grew = true;
                }
            }
        } while (grew);
    }

    return new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex);
}
/**
 * Decides whether a candidate rule with two source-side gaps passes the length and
 * alignment constraints.
 *
 * <p>The candidate is accepted only if all of the following hold:
 * <ul>
 *   <li>at least one source position strictly between the two gaps is aligned;</li>
 *   <li>the number of source elements (span length minus both gap lengths, plus two)
 *       does not exceed {@code MAX_SOURCE_ELEMENTS};</li>
 *   <li>the distances before the first gap, between the gaps, and after the second gap
 *       each do not exceed {@code MAX_TERMINAL_LENGTH};</li>
 *   <li>each gap's length does not exceed {@code MAX_NONTERMINAL_SPAN};</li>
 *   <li>if {@code REMOVE_MONOTONIC_REPEATS} is set, {@code isMonotonicRepeatTwoNonterminal}
 *       returns false for the same indices.</li>
 * </ul>
 *
 * @param sourceStartIndex   first source position of the whole rule span
 * @param sourceEndIndex     last source position of the whole rule span (inclusive)
 * @param sourceStartIndexX  first source position of the first gap
 * @param sourceEndIndexX    last source position of the first gap (inclusive)
 * @param sourceStartIndexX2 first source position of the second gap
 * @param sourceEndIndexX2   last source position of the second gap (inclusive)
 * @param a                  word alignment used to test the positions between the gaps
 * @return {@code true} if the candidate passes every constraint, {@code false} otherwise
 */
private boolean filterPassTwoNonTerminalRule(
        int sourceStartIndex,
        int sourceEndIndex,
        int sourceStartIndexX,
        int sourceEndIndexX,
        int sourceStartIndexX2,
        int sourceEndIndexX2,
        Alignment a) {
    // Look for an aligned word strictly between the two gaps; stop at the first hit.
    boolean middleTerminalAligned = false;
    for (int between = sourceEndIndexX + 1;
            !middleTerminalAligned && between < sourceStartIndexX2;
            between++) {
        middleTerminalAligned = a.isSourceAligned(between);
    }
    if (!middleTerminalAligned) {
        return false;
    }

    int ruleLength = sourceEndIndex - sourceStartIndex + 1;
    int firstGapLength = sourceEndIndexX - sourceStartIndexX + 1;
    int secondGapLength = sourceEndIndexX2 - sourceStartIndexX2 + 1;

    // Each gap counts as a single element, hence the "+ 2".
    int sourceElements = ruleLength - firstGapLength - secondGapLength + 2;
    if (sourceElements > MAX_SOURCE_ELEMENTS) {
        return false;
    }

    if (sourceStartIndexX - sourceStartIndex > MAX_TERMINAL_LENGTH) {
        return false;
    }
    // NOTE(review): this difference is one larger than the count of positions strictly
    // between the gaps, whereas the leading/trailing tests compare exact counts.
    // Kept as-is; confirm whether the asymmetry is intended.
    if (sourceStartIndexX2 - sourceEndIndexX > MAX_TERMINAL_LENGTH) {
        return false;
    }
    if (sourceEndIndex - sourceEndIndexX2 > MAX_TERMINAL_LENGTH) {
        return false;
    }

    if (firstGapLength > MAX_NONTERMINAL_SPAN || secondGapLength > MAX_NONTERMINAL_SPAN) {
        return false;
    }

    // The repeat check is only consulted when the option is on and every other test passed.
    return !(REMOVE_MONOTONIC_REPEATS
            && isMonotonicRepeatTwoNonterminal(
                    sourceStartIndex,
                    sourceEndIndex,
                    sourceStartIndexX,
                    sourceEndIndexX,
                    sourceStartIndexX2,
                    sourceEndIndexX2));
}
private List<Block> getRegularBlocks(Alignment a, SentencePair sp) { List<Block> res = new ArrayList<Block>(); int sourceStartIndex = 0; int sourceEndIndex = 0; int targetStartIndex = 0; int targetEndIndex = 0; while (sourceStartIndex <= sp.getSource().getWords().length && targetStartIndex <= sp.getTarget().getWords().length) { Block next = findNextBlock(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex, a, sp); sourceStartIndex = next.sourceStartIndex; sourceEndIndex = next.sourceEndIndex; targetStartIndex = next.targetStartIndex; targetEndIndex = next.targetEndIndex; if (targetStartIndex >= sp.getTarget().getWords().length && sourceStartIndex >= sp.getSource().getWords().length) { // do nothing } else if (targetStartIndex < sp.getTarget().getWords().length && !a.isTargetAligned(targetStartIndex)) { sourceEndIndex--; } else if (sourceStartIndex < sp.getSource().getWords().length && !a.isSourceAligned(sourceStartIndex)) { targetEndIndex--; } if (targetStartIndex >= sp.getTarget().getWords().length || sourceStartIndex >= sp.getSource().getWords().length) { break; } if (sourceStartIndex <= sourceEndIndex && targetStartIndex <= targetEndIndex) { res.add(new Block(sourceStartIndex, sourceEndIndex, targetStartIndex, targetEndIndex)); } sourceStartIndex = sourceEndIndex + 1; targetStartIndex = targetEndIndex + 1; sourceEndIndex = sourceStartIndex; targetEndIndex = targetStartIndex; } return res; }
private List<Rule> extractRulesTwoNonTerminal( int sourceStartIndex, int sourceEndIndex, int minTargetIndex, int maxTargetIndex, Alignment a, SentencePair sp) { List<Rule> res = new ArrayList<Rule>(); if (sourceEndIndex - sourceStartIndex < 2) { // we want at least 2 // source words to // extract // a rule with two // nonterminals return res; } for (int sourceStartIndexX = sourceStartIndex; sourceStartIndexX < sourceEndIndex - 1; sourceStartIndexX++) { for (int sourceEndIndexX = sourceStartIndexX; sourceEndIndexX < sourceEndIndex - 1; sourceEndIndexX++) { if (!a.isSourceAligned(sourceStartIndexX) || !a.isSourceAligned(sourceEndIndexX)) break; Pair<Integer, Integer> targetLimit = updateTargetLimit(a, sourceStartIndexX, sourceEndIndexX); int offset = 0; for (int targetIndex = targetLimit.getFirst(); targetIndex <= targetLimit.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX) { offset = a.getMinSourceIndex(targetIndex) - sourceStartIndexX; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX) { offset = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX; break; } } if (targetLimit.getFirst() == minTargetIndex && targetLimit.getSecond() == maxTargetIndex) { break; } if (offset < 0) { break; } else if (offset > 0) { sourceEndIndexX = sourceEndIndexX + offset - 1; } else { for (int sourceStartIndexX2 = sourceEndIndexX + 2; sourceStartIndexX2 <= sourceEndIndex; sourceStartIndexX2++) { for (int sourceEndIndexX2 = sourceStartIndexX2; sourceEndIndexX2 <= sourceEndIndex; sourceEndIndexX2++) { if (!a.isSourceAligned(sourceStartIndexX2) || !a.isSourceAligned(sourceEndIndexX2)) { break; } Pair<Integer, Integer> targetLimitX2 = updateTargetLimit(a, sourceStartIndexX2, sourceEndIndexX2); int offset2 = 0; for (int targetIndex = targetLimitX2.getFirst(); targetIndex <= targetLimitX2.getSecond(); targetIndex++) { if (a.getMinSourceIndex(targetIndex) < sourceStartIndexX2) { offset2 = a.getMinSourceIndex(targetIndex) - 
sourceStartIndexX2; break; } if (a.getMaxSourceIndex(targetIndex) > sourceEndIndexX2) { offset2 = a.getMaxSourceIndex(targetIndex) - sourceEndIndexX2; break; } } if (offset2 < 0) break; else if (offset2 > 0) { sourceEndIndexX2 = sourceEndIndexX2 + offset2 - 1; } else { if (filterPassTwoNonTerminalRule( sourceStartIndex, sourceEndIndex, sourceStartIndexX, sourceEndIndexX, sourceStartIndexX2, sourceEndIndexX2, a)) { Rule r = new Rule( sourceStartIndex, sourceEndIndex, minTargetIndex, maxTargetIndex, sourceStartIndexX, sourceEndIndexX, targetLimit.getFirst(), targetLimit.getSecond(), sourceStartIndexX2, sourceEndIndexX2, targetLimitX2.getFirst(), targetLimitX2.getSecond(), sp, a); res.add(r); } } } } } } } return res; }
/**
 * Enumerates rules with a single source-side gap inside the source span
 * {@code [sourceStartIndex, sourceEndIndex]}.
 *
 * <p>Every gap start in the span is tried, and for each start the gap is grown to the
 * right. Growth stops when either end point of the gap is unaligned, when the gap's
 * target limits equal {@code (minTargetIndex, maxTargetIndex)}, or when a target word in
 * those limits links to the left of the gap. If a target word links to the right of the
 * gap, the gap end jumps directly to that source position. A gap whose target words all
 * link back inside it yields a rule, provided the gap is not the whole source span and
 * {@code filterPassOneNonTerminalRule} accepts it.
 *
 * @param sourceStartIndex first source position of the rule span
 * @param sourceEndIndex   last source position of the rule span (inclusive)
 * @param minTargetIndex   lower target limit of the rule span
 * @param maxTargetIndex   upper target limit of the rule span
 * @param a                word alignment for the sentence pair
 * @param sp               sentence pair passed through to each created rule
 * @return the extracted rules, possibly empty
 */
private List<Rule> extractRulesOneNonTerminal(
        int sourceStartIndex,
        int sourceEndIndex,
        int minTargetIndex,
        int maxTargetIndex,
        Alignment a,
        SentencePair sp) {
    List<Rule> rules = new ArrayList<Rule>();
    for (int gapStart = sourceStartIndex; gapStart <= sourceEndIndex; gapStart++) {
        for (int gapEnd = gapStart; gapEnd <= sourceEndIndex; gapEnd++) {
            if (!(a.isSourceAligned(gapStart) && a.isSourceAligned(gapEnd))) {
                break;
            }
            Pair<Integer, Integer> gapTarget = updateTargetLimit(a, gapStart, gapEnd);

            // First link (in target order) that leaves the gap: negative = to the left,
            // positive = to the right, zero = every link stays inside.
            int overflow = 0;
            for (int t = gapTarget.getFirst(); t <= gapTarget.getSecond(); t++) {
                int lowestSource = a.getMinSourceIndex(t);
                if (lowestSource < gapStart) {
                    overflow = lowestSource - gapStart;
                    break;
                }
                int highestSource = a.getMaxSourceIndex(t);
                if (highestSource > gapEnd) {
                    overflow = highestSource - gapEnd;
                    break;
                }
            }

            boolean spansWholeTarget = gapTarget.getFirst() == minTargetIndex
                    && gapTarget.getSecond() == maxTargetIndex;
            if (spansWholeTarget || overflow < 0) {
                break;
            }
            if (overflow > 0) {
                // The loop increment supplies the final +1 of the jump.
                gapEnd += overflow - 1;
                continue;
            }

            boolean spansWholeSource =
                    gapStart == sourceStartIndex && gapEnd == sourceEndIndex;
            if (spansWholeSource) {
                continue;
            }
            if (filterPassOneNonTerminalRule(
                    sourceStartIndex, sourceEndIndex, gapStart, gapEnd)) {
                rules.add(new Rule(
                        sourceStartIndex,
                        sourceEndIndex,
                        minTargetIndex,
                        maxTargetIndex,
                        gapStart,
                        gapEnd,
                        gapTarget.getFirst(),
                        gapTarget.getSecond(),
                        sp,
                        a));
            }
        }
    }
    return rules;
}