// STAGE 1 : apply word level alignment for each segment private void wordAlignSubsequences() { String[] refTokens = refIndex.getTokens(); String[] candTokens = ocrIndex.getTokens(); EditDistAligner aligner = new EditDistAligner(); int startRef = 0, endRef = 0, startCand = 0, endCand = 0; // now align each segment using dynamic programming for (int j = 0; j <= anchorsRef.size(); j++) { // copy segment if (j == anchorsRef.size()) { endRef = refIndex.getNumOfTokens(); endCand = ocrIndex.getNumOfTokens(); } else { endRef = anchorsRef.get(j).getPosInt(); endCand = anchorsCand.get(j).getPosInt(); } // if the segment is larger than the largest segment size, then do not align the sequence long dynamicTableSize = ((long) (endRef - startRef) * (long) (endCand - startCand)); if (dynamicTableSize != 0 && dynamicTableSize <= MAX_DYNAMIC_TABLE_SIZE) { // run dynamic programming ArrayList<AlignedSequence> sq = aligner.align(refTokens, candTokens, startRef, endRef, startCand, endCand); alignment.addAll(sq); } else { for (int ls = startRef; ls < endRef; ls++) { alignment.add(new AlignedSequence(refTokens[ls], null)); } for (int ls = startCand; ls < endCand; ls++) { alignment.add(new AlignedSequence(null, candTokens[ls])); } } startRef = endRef; startCand = endCand; } }
// STAGE 2: based on word level alignment, align all the characters // Michael Z 7/2013 Modified to avoid ArrayIndexOutOfBoundsException when there // are long streteches without an anchor word. To avoid that, we only build the "accumulator" // array when we need it for the alignment and we're assured all the words will fit in the array. private void charAlignSubsequences() { int EXPECTED_WORD_LENGTH_IN_CHARACTERS = 5; // for memory pre-allocation ArrayList<AlignedSequence> alignment2 = new ArrayList<AlignedSequence>(alignment.size() * EXPECTED_WORD_LENGTH_IN_CHARACTERS); EditDistAligner aligner = new EditDistAligner(); String candAccu[] = new String[(int) MAX_DYNAMIC_TABLE_SIZE]; String refAccu[] = new String[(int) MAX_DYNAMIC_TABLE_SIZE]; int candAccuSize = 0; int refAccuSize = 0; int refStartAnchorIdx = -1; int candStartAnchorIdx = -1; String ch; String cand, ref; for (int i = 0; i <= alignment.size(); i++) { if (alignment.size() == i) { // dont forget the last text segment cand = ""; ref = ""; } else { AlignedSequence cur = alignment.get(i); cand = cur.m_candidate; ref = cur.m_reference; } // save the index of the first "anchor" so we // know where to start the alignment below if (cand != null && candStartAnchorIdx == -1) { candStartAnchorIdx = i; } if (ref != null && refStartAnchorIdx == -1) { refStartAnchorIdx = i; } // if either is null, skip the alignment step, we only align // when the reference word is the same as the candidate word. if (ref == null || cand == null) { // only one of the words can be null at a time. if (cand != null) { candAccuSize += cand.length() + 1; // +1 for the space after the word } else { // ref != null refAccuSize += ref.length() + 1; } continue; } // if we got here, we have two words if (!cand.equals(ref)) { // words don't match, increment the character accumulators // and skip trying to align until the words match refAccuSize += ref.length() + 1; // +1 for the space after the word candAccuSize += cand.length() + 1; continue; } // we only do an alignment if it's small enough to do efficiently boolean blnUseDynamicAlignment = ((long) refAccuSize * (long) candAccuSize < MAX_DYNAMIC_TABLE_SIZE) && refAccuSize != 0 && candAccuSize != 0; // build the array to pass to the aligner or just null align int tmpAccuSize = 0; for (int kk = refStartAnchorIdx; kk < i; kk++) { if (alignment.get(kk).m_reference == null) { continue; // only process non-null words } String tmpWord = alignment.get(kk).m_reference + " "; // add space at end of word for (int charIdx = 0; charIdx < tmpWord.length(); charIdx++) { if (blnUseDynamicAlignment) { refAccu[tmpAccuSize++] = Character.toString(tmpWord.charAt(charIdx)); } else { alignment2.add(new AlignedSequence(tmpWord.charAt(charIdx), null)); } } } tmpAccuSize = 0; for (int kk = candStartAnchorIdx; kk < i; kk++) { if (alignment.get(kk).m_candidate == null) { continue; } String tmpWord = alignment.get(kk).m_candidate + " "; // add space at end of word for (int charIdx = 0; charIdx < tmpWord.length(); charIdx++) { if (blnUseDynamicAlignment) { candAccu[tmpAccuSize++] = Character.toString(tmpWord.charAt(charIdx)); } else { alignment2.add(new AlignedSequence(null, tmpWord.charAt(charIdx))); } } } // add it to the resulting sequence if if (blnUseDynamicAlignment) { ArrayList<AlignedSequence> sq = aligner.align(refAccu, candAccu, 0, refAccuSize, 0, candAccuSize); alignment2.addAll(sq); } // we know that current words align for (int j = 0; j < ref.length(); j++) { String cha = ref.substring(j, j + 1); alignment2.add(new AlignedSequence(cha, cha)); } alignment2.add(new AlignedSequence(" ", " ")); // reset accumulators & anchor indexes refAccuSize = 0; candAccuSize = 0; refStartAnchorIdx = -1; candStartAnchorIdx = -1; } // end loop through the word level alignment // we always add a space after a word, and the last run through uses // an empty string for both the cand and ref word to make sure we "fall through" // to the logic and process any words still waiting to be aligned. // Now that we're done, remove the last two alignments. Without this, there // are extra spaces in the alignment that don't exist in the original works and // a sanity check such as checkCharLevelAlignment() will fail. alignment2.remove(alignment2.size() - 1); alignment2.remove(alignment2.size() - 1); alignment = alignment2; }