public static Document generalResultSegmentation( Document doc, String labeledResult, List<LayoutToken> documentTokens) { List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult); SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create(); doc.setLabeledBlocks(labeledBlocks); /*try { FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult); FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString()); } catch(Exception e) { e.printStackTrace(); }*/ List<Block> docBlocks = doc.getBlocks(); int indexLine = 0; int blockIndex = 0; int p = 0; // position in the labeled result int currentLineEndPos = 0; // position in the global doc. tokenization of the last // token of the current line int currentLineStartPos = 0; // position in the global doc. // tokenization of the first token of the current line String line = null; DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER; DocumentPointer currentPointer = null; DocumentPointer lastPointer = null; String curLabel; String curPlainLabel = null; String lastPlainLabel = null; int lastTokenInd = -1; for (int i = docBlocks.size() - 1; i >= 0; i--) { int endToken = docBlocks.get(i).getEndToken(); if (endToken != -1) { lastTokenInd = endToken; break; } } // we do this concatenation trick so that we don't have to process stuff after the main loop // no copying of lists happens because of this, so it's ok to concatenate String ignoredLabel = "@IGNORED_LABEL@"; for (Pair<String, String> labeledTokenPair : Iterables.concat( labeledTokens, Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) { if (labeledTokenPair == null) { p++; continue; } // as we process the document segmentation line by line, we don't use the usual // tokenization to rebuild the text flow, but we get each line again from the // text stored in the document blocks (similarly as when generating the features) line = null; while ((line == null) && (blockIndex < docBlocks.size())) { Block block = docBlocks.get(blockIndex); List<LayoutToken> tokens = block.getTokens(); String localText = block.getText(); if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) { blockIndex++; indexLine = 0; if (blockIndex < docBlocks.size()) { block = docBlocks.get(blockIndex); currentLineStartPos = block.getStartToken(); } continue; } String[] lines = localText.split("[\\n\\r]"); if ((lines.length == 0) || (indexLine >= lines.length)) { blockIndex++; indexLine = 0; if (blockIndex < docBlocks.size()) { block = docBlocks.get(blockIndex); currentLineStartPos = block.getStartToken(); } continue; } else { line = lines[indexLine]; indexLine++; if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) { line = null; continue; } if (currentLineStartPos > lastTokenInd) continue; // adjust the start token position in documentTokens to this non trivial line // first skip possible space characters and tabs at the beginning of the line while ((documentTokens.get(currentLineStartPos).t().equals(" ") || documentTokens.get(currentLineStartPos).t().equals("\t")) && (currentLineStartPos != lastTokenInd)) { currentLineStartPos++; } if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) { while (currentLineStartPos < block.getEndToken()) { if (documentTokens.get(currentLineStartPos).t().equals("\n") || documentTokens.get(currentLineStartPos).t().equals("\r")) { // move to the start of the next line, but ignore space characters and tabs currentLineStartPos++; while ((documentTokens.get(currentLineStartPos).t().equals(" ") || documentTokens.get(currentLineStartPos).t().equals("\t")) && (currentLineStartPos != lastTokenInd)) { currentLineStartPos++; } if ((currentLineStartPos != lastTokenInd) && labeledTokenPair.a.startsWith( documentTokens.get(currentLineStartPos).getText())) { break; } } currentLineStartPos++; } } // what is then the position of the last token of this line? currentLineEndPos = currentLineStartPos; while (currentLineEndPos < block.getEndToken()) { if (documentTokens.get(currentLineEndPos).t().equals("\n") || documentTokens.get(currentLineEndPos).t().equals("\r")) { currentLineEndPos--; break; } currentLineEndPos++; } } } curLabel = labeledTokenPair.b; curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel); /*System.out.println("-------------------------------"); System.out.println("block: " + blockIndex); System.out.println("line: " + line); System.out.println("token: " + labeledTokenPair.a); System.out.println("curPlainLabel: " + curPlainLabel); System.out.println("lastPlainLabel: " + lastPlainLabel); if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1)) System.out.println("currentLineStartPos: " + currentLineStartPos + " (" + documentTokens.get(currentLineStartPos) + ")"); if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1)) System.out.println("currentLineEndPos: " + currentLineEndPos + " (" + documentTokens.get(currentLineEndPos) + ")");*/ if (blockIndex == docBlocks.size()) { break; } currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos); // either a new entity starts or a new beginning of the same type of entity if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) { if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos()) && (pointerA.getTokenDocPos() != -1)) { labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer)); } pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos); // System.out.println("add segment for: " + lastPlainLabel + ", until " + // (currentLineStartPos-2)); } // updating stuff for next iteration lastPlainLabel = curPlainLabel; lastPointer = currentPointer; currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line p++; } if (blockIndex == docBlocks.size()) { // the last labelled piece has still to be added if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) { if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos()) && (pointerA.getTokenDocPos() != -1)) { labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer)); // System.out.println("add segment for: " + lastPlainLabel + ", until " + // (currentLineStartPos-2)); } } } return doc; }