/** * Set the main segments of the document based on the full text parsing results * * @param doc a document * @param labeledResult string * @param tokenizations tokens * @return a document */ public static Document resultSegmentation( Document doc, String labeledResult, List<String> tokenizations) { if (doc == null) { throw new NullPointerException("Document is null"); } if (doc.getBlocks() == null) { throw new NullPointerException("Blocks of the documents are null"); } // System.out.println(tokenizations.toString()); // int i = 0; // boolean first = true; List<Integer> blockHeaders = new ArrayList<Integer>(); List<Integer> blockFooters = new ArrayList<Integer>(); List<Integer> blockDocumentHeaders = new ArrayList<Integer>(); List<Integer> blockSectionTitles = new ArrayList<Integer>(); SortedSet<DocumentPiece> blockReferences = new TreeSet<DocumentPiece>(); doc.setBibDataSets(new ArrayList<BibDataSet>()); // StringTokenizer st = new StringTokenizer(labeledResult, "\n"); String[] lines = labeledResult.split("\n"); String currentTag = null; String s2 = null; String lastTag = null; String lastPlainTag = null; int p = 0; // index in the results' tokenization (st) int blockIndex = 0; BibDataSet bib = null; DocumentPointer pointerA = null; // DocumentPointer pointerB = null; DocumentPointer currentPointer; DocumentPointer lastPointer = null; for (String line : lines) { // while (st.hasMoreTokens()) { for (; blockIndex < doc.getBlocks().size() - 1; blockIndex++) { // int startTok = doc.getBlocks().get(blockIndex).getStartToken(); int endTok = doc.getBlocks().get(blockIndex).getEndToken(); if (endTok >= p) { break; } } ArrayList<String> localFeatures = new ArrayList<String>(); boolean addSpace = false; // String tok = st.nextToken().trim(); line = line.trim(); StringTokenizer stt = new StringTokenizer(line, "\t"); int j = 0; boolean newLine = false; int ll = stt.countTokens(); while (stt.hasMoreTokens()) { String s = stt.nextToken().trim(); if (j == 0) { s2 = s; boolean strop = false; while ((!strop) && (p < tokenizations.size())) { String tokOriginal = tokenizations.get(p); if (tokOriginal.equals(" ") | tokOriginal.equals("\n") | tokOriginal.equals("\r") | tokOriginal.equals("\t")) { addSpace = true; p++; } else if (tokOriginal.equals("")) { p++; } else // if (tokOriginal.equals(s)) { strop = true; } } } else if (j == ll - 1) { currentTag = s; // current tag } else { if (s.equals("LINESTART")) { newLine = true; } localFeatures.add(s); } j++; } if (lastTag != null) { if (lastTag.startsWith("I-")) { lastPlainTag = lastTag.substring(2, lastTag.length()); } else { lastPlainTag = lastTag; } } String currentPlainTag = null; if (currentTag != null) { if (currentTag.startsWith("I-")) { currentPlainTag = currentTag.substring(2, currentTag.length()); } else { currentPlainTag = currentTag; } } currentPointer = new DocumentPointer(doc, blockIndex, p); if (lastPlainTag != null && !currentPlainTag.equals(lastPlainTag) && lastPlainTag.equals("<references>")) { blockReferences.add(new DocumentPiece(pointerA, lastPointer)); pointerA = currentPointer; } if (currentPlainTag.equals("<header>")) { if (!blockDocumentHeaders.contains(blockIndex)) { blockDocumentHeaders.add(blockIndex); // System.out.println("add block header: " + blockIndexInteger.intValue()); } } else if (currentPlainTag.equals( "<references>")) { // if (!blockReferences.contains(blockIndex)) { // blockReferences.add(blockIndex); // //System.out.println("add block reference: " + // blockIndexInteger.intValue()); // } if (currentTag.equals("I-<references>")) { pointerA = new DocumentPointer(doc, blockIndex, p); if (bib != null) { if (bib.getRawBib() != null) { doc.getBibDataSets().add(bib); bib = new BibDataSet(); } } else { bib = new BibDataSet(); } bib.setRawBib(s2); } else { if (addSpace) { if (bib == null) { bib = new BibDataSet(); bib.setRawBib(" " + s2); } else { bib.setRawBib(bib.getRawBib() + " " + s2); } } else { if (bib == null) { bib = new BibDataSet(); bib.setRawBib(s2); } else { bib.setRawBib(bib.getRawBib() + s2); } } } // case "<reference_marker>": // if (!blockReferences.contains(blockIndex)) { // blockReferences.add(blockIndex); // //System.out.println("add block reference: " + // blockIndexInteger.intValue()); // } // // if (currentTag.equals("I-<reference_marker>")) { // if (bib != null) { // if (bib.getRefSymbol() != null) { // doc.getBibDataSets().add(bib); // bib = new BibDataSet(); // } // } else { // bib = new BibDataSet(); // } // bib.setRefSymbol(s2); // } else { // if (addSpace) { // if (bib == null) { // bib = new BibDataSet(); // bib.setRefSymbol(s2); // } else { // bib.setRefSymbol(bib.getRefSymbol() + " " + s2); // } // } else { // if (bib == null) { // bib = new BibDataSet(); // bib.setRefSymbol(s2); // } else { // bib.setRefSymbol(bib.getRefSymbol() + s2); // } // } // } // break; } else if (currentPlainTag.equals("<page_footnote>")) { if (!blockFooters.contains(blockIndex)) { blockFooters.add(blockIndex); // System.out.println("add block foot note: " + blockIndexInteger.intValue()); } } else if (currentPlainTag.equals("<page_header>")) { if (!blockHeaders.contains(blockIndex)) { blockHeaders.add(blockIndex); // System.out.println("add block page header: " + blockIndexInteger.intValue()); } } else if (currentPlainTag.equals("<section>")) { if (!blockSectionTitles.contains(blockIndex)) { blockSectionTitles.add(blockIndex); // System.out.println("add block page header: " + blockIndexInteger.intValue()); } } lastTag = currentTag; p++; lastPointer = currentPointer; } if (bib != null) { doc.getBibDataSets().add(bib); } if (!lastPointer.equals(pointerA)) { if (lastPlainTag.equals("<references>")) { blockReferences.add(new DocumentPiece(pointerA, lastPointer)); } } doc.setBlockHeaders(blockHeaders); doc.setBlockFooters(blockFooters); doc.setBlockDocumentHeaders(blockDocumentHeaders); doc.setBlockReferences(blockReferences); doc.setBlockSectionTitles(blockSectionTitles); return doc; }
public static Document generalResultSegmentation( Document doc, String labeledResult, List<LayoutToken> documentTokens) { List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult); SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create(); doc.setLabeledBlocks(labeledBlocks); /*try { FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult); FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString()); } catch(Exception e) { e.printStackTrace(); }*/ List<Block> docBlocks = doc.getBlocks(); int indexLine = 0; int blockIndex = 0; int p = 0; // position in the labeled result int currentLineEndPos = 0; // position in the global doc. tokenization of the last // token of the current line int currentLineStartPos = 0; // position in the global doc. // tokenization of the first token of the current line String line = null; DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER; DocumentPointer currentPointer = null; DocumentPointer lastPointer = null; String curLabel; String curPlainLabel = null; String lastPlainLabel = null; int lastTokenInd = -1; for (int i = docBlocks.size() - 1; i >= 0; i--) { int endToken = docBlocks.get(i).getEndToken(); if (endToken != -1) { lastTokenInd = endToken; break; } } // we do this concatenation trick so that we don't have to process stuff after the main loop // no copying of lists happens because of this, so it's ok to concatenate String ignoredLabel = "@IGNORED_LABEL@"; for (Pair<String, String> labeledTokenPair : Iterables.concat( labeledTokens, Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) { if (labeledTokenPair == null) { p++; continue; } // as we process the document segmentation line by line, we don't use the usual // tokenization to rebuild the text flow, but we get each line again from the // text stored in the document blocks (similarly as when generating the features) line = null; while ((line == null) && (blockIndex < docBlocks.size())) { Block block = docBlocks.get(blockIndex); List<LayoutToken> tokens = block.getTokens(); String localText = block.getText(); if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) { blockIndex++; indexLine = 0; if (blockIndex < docBlocks.size()) { block = docBlocks.get(blockIndex); currentLineStartPos = block.getStartToken(); } continue; } String[] lines = localText.split("[\\n\\r]"); if ((lines.length == 0) || (indexLine >= lines.length)) { blockIndex++; indexLine = 0; if (blockIndex < docBlocks.size()) { block = docBlocks.get(blockIndex); currentLineStartPos = block.getStartToken(); } continue; } else { line = lines[indexLine]; indexLine++; if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) { line = null; continue; } if (currentLineStartPos > lastTokenInd) continue; // adjust the start token position in documentTokens to this non trivial line // first skip possible space characters and tabs at the beginning of the line while ((documentTokens.get(currentLineStartPos).t().equals(" ") || documentTokens.get(currentLineStartPos).t().equals("\t")) && (currentLineStartPos != lastTokenInd)) { currentLineStartPos++; } if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) { while (currentLineStartPos < block.getEndToken()) { if (documentTokens.get(currentLineStartPos).t().equals("\n") || documentTokens.get(currentLineStartPos).t().equals("\r")) { // move to the start of the next line, but ignore space characters and tabs currentLineStartPos++; while ((documentTokens.get(currentLineStartPos).t().equals(" ") || documentTokens.get(currentLineStartPos).t().equals("\t")) && (currentLineStartPos != lastTokenInd)) { currentLineStartPos++; } if ((currentLineStartPos != lastTokenInd) && labeledTokenPair.a.startsWith( documentTokens.get(currentLineStartPos).getText())) { break; } } currentLineStartPos++; } } // what is then the position of the last token of this line? currentLineEndPos = currentLineStartPos; while (currentLineEndPos < block.getEndToken()) { if (documentTokens.get(currentLineEndPos).t().equals("\n") || documentTokens.get(currentLineEndPos).t().equals("\r")) { currentLineEndPos--; break; } currentLineEndPos++; } } } curLabel = labeledTokenPair.b; curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel); /*System.out.println("-------------------------------"); System.out.println("block: " + blockIndex); System.out.println("line: " + line); System.out.println("token: " + labeledTokenPair.a); System.out.println("curPlainLabel: " + curPlainLabel); System.out.println("lastPlainLabel: " + lastPlainLabel); if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1)) System.out.println("currentLineStartPos: " + currentLineStartPos + " (" + documentTokens.get(currentLineStartPos) + ")"); if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1)) System.out.println("currentLineEndPos: " + currentLineEndPos + " (" + documentTokens.get(currentLineEndPos) + ")");*/ if (blockIndex == docBlocks.size()) { break; } currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos); // either a new entity starts or a new beginning of the same type of entity if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) { if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos()) && (pointerA.getTokenDocPos() != -1)) { labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer)); } pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos); // System.out.println("add segment for: " + lastPlainLabel + ", until " + // (currentLineStartPos-2)); } // updating stuff for next iteration lastPlainLabel = curPlainLabel; lastPointer = currentPointer; currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line p++; } if (blockIndex == docBlocks.size()) { // the last labelled piece has still to be added if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) { if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos()) && (pointerA.getTokenDocPos() != -1)) { labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer)); // System.out.println("add segment for: " + lastPlainLabel + ", until " + // (currentLineStartPos-2)); } } } return doc; }