public static Document generalResultSegmentation( Document doc, String labeledResult, List<LayoutToken> documentTokens) { List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult); SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create(); doc.setLabeledBlocks(labeledBlocks); /*try { FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult); FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString()); } catch(Exception e) { e.printStackTrace(); }*/ List<Block> docBlocks = doc.getBlocks(); int indexLine = 0; int blockIndex = 0; int p = 0; // position in the labeled result int currentLineEndPos = 0; // position in the global doc. tokenization of the last // token of the current line int currentLineStartPos = 0; // position in the global doc. // tokenization of the first token of the current line String line = null; DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER; DocumentPointer currentPointer = null; DocumentPointer lastPointer = null; String curLabel; String curPlainLabel = null; String lastPlainLabel = null; int lastTokenInd = -1; for (int i = docBlocks.size() - 1; i >= 0; i--) { int endToken = docBlocks.get(i).getEndToken(); if (endToken != -1) { lastTokenInd = endToken; break; } } // we do this concatenation trick so that we don't have to process stuff after the main loop // no copying of lists happens because of this, so it's ok to concatenate String ignoredLabel = "@IGNORED_LABEL@"; for (Pair<String, String> labeledTokenPair : Iterables.concat( labeledTokens, Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) { if (labeledTokenPair == null) { p++; continue; } // as we process the document segmentation line by line, we don't use the usual // tokenization to rebuild the text flow, but we get each line again from the // text stored in the document blocks (similarly as when generating the features) line = null; while ((line == null) && (blockIndex < docBlocks.size())) { Block block = docBlocks.get(blockIndex); List<LayoutToken> tokens = block.getTokens(); String localText = block.getText(); if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) { blockIndex++; indexLine = 0; if (blockIndex < docBlocks.size()) { block = docBlocks.get(blockIndex); currentLineStartPos = block.getStartToken(); } continue; } String[] lines = localText.split("[\\n\\r]"); if ((lines.length == 0) || (indexLine >= lines.length)) { blockIndex++; indexLine = 0; if (blockIndex < docBlocks.size()) { block = docBlocks.get(blockIndex); currentLineStartPos = block.getStartToken(); } continue; } else { line = lines[indexLine]; indexLine++; if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) { line = null; continue; } if (currentLineStartPos > lastTokenInd) continue; // adjust the start token position in documentTokens to this non trivial line // first skip possible space characters and tabs at the beginning of the line while ((documentTokens.get(currentLineStartPos).t().equals(" ") || documentTokens.get(currentLineStartPos).t().equals("\t")) && (currentLineStartPos != lastTokenInd)) { currentLineStartPos++; } if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) { while (currentLineStartPos < block.getEndToken()) { if (documentTokens.get(currentLineStartPos).t().equals("\n") || documentTokens.get(currentLineStartPos).t().equals("\r")) { // move to the start of the next line, but ignore space characters and tabs currentLineStartPos++; while ((documentTokens.get(currentLineStartPos).t().equals(" ") || documentTokens.get(currentLineStartPos).t().equals("\t")) && (currentLineStartPos != lastTokenInd)) { currentLineStartPos++; } if ((currentLineStartPos != lastTokenInd) && labeledTokenPair.a.startsWith( documentTokens.get(currentLineStartPos).getText())) { break; } } currentLineStartPos++; } } // what is then the position of the last token of this line? currentLineEndPos = currentLineStartPos; while (currentLineEndPos < block.getEndToken()) { if (documentTokens.get(currentLineEndPos).t().equals("\n") || documentTokens.get(currentLineEndPos).t().equals("\r")) { currentLineEndPos--; break; } currentLineEndPos++; } } } curLabel = labeledTokenPair.b; curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel); /*System.out.println("-------------------------------"); System.out.println("block: " + blockIndex); System.out.println("line: " + line); System.out.println("token: " + labeledTokenPair.a); System.out.println("curPlainLabel: " + curPlainLabel); System.out.println("lastPlainLabel: " + lastPlainLabel); if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1)) System.out.println("currentLineStartPos: " + currentLineStartPos + " (" + documentTokens.get(currentLineStartPos) + ")"); if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1)) System.out.println("currentLineEndPos: " + currentLineEndPos + " (" + documentTokens.get(currentLineEndPos) + ")");*/ if (blockIndex == docBlocks.size()) { break; } currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos); // either a new entity starts or a new beginning of the same type of entity if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) { if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos()) && (pointerA.getTokenDocPos() != -1)) { labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer)); } pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos); // System.out.println("add segment for: " + lastPlainLabel + ", until " + // (currentLineStartPos-2)); } // updating stuff for next iteration lastPlainLabel = curPlainLabel; lastPointer = currentPointer; currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line p++; } if (blockIndex == docBlocks.size()) { // the last labelled piece has still to be added if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) { if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos()) && (pointerA.getTokenDocPos() != -1)) { labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer)); // System.out.println("add segment for: " + lastPlainLabel + ", until " + // (currentLineStartPos-2)); } } } return doc; }
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws SAXException { // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") && // !qName.equals("TEXT")) // System.out.println(qName); if (qName.equals("TEXT")) { blabla.append("\n"); LayoutToken token = new LayoutToken(); token.setText("\n"); block.addToken(token); accumulator.setLength(0); tokenizations.add("\n"); } else if (qName.equals("METADATA")) { accumulator.setLength(0); } else if (qName.equals("TOKEN")) { String tok0 = TextUtilities.clean(getText()); if (block.getStartToken() == -1) { block.setStartToken(tokenizations.size()); } if (tok0.length() > 0) { StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true); boolean diaresis = false; boolean accent = false; boolean keepLast = false; while (st.hasMoreTokens()) { diaresis = false; accent = false; keepLast = false; String tok = st.nextToken(); if (tok.length() > 0) { LayoutToken token = new LayoutToken(); if ((previousToken != null) && (tok != null) && (previousToken.length() > 0) && (tok.length() > 0) && blabla.length() > 0) { Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1); Character rightChar = tok.charAt(0); ModifierClass leftClass = classifyChar(leftChar); ModifierClass rightClass = classifyChar(rightChar); ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER; if (leftClass != ModifierClass.NOT_A_MODIFIER || rightClass != ModifierClass.NOT_A_MODIFIER) { Character baseChar = null; Character modifierChar = null; if (leftClass != ModifierClass.NOT_A_MODIFIER) { if (rightClass != ModifierClass.NOT_A_MODIFIER) { // assert false; // keeping characters, but setting class // to not a modifier baseChar = leftChar; modifierChar = rightChar; modifierClass = ModifierClass.NOT_A_MODIFIER; } else { baseChar = rightChar; modifierChar = leftChar; modifierClass = leftClass; } } else { baseChar = leftChar; modifierChar = rightChar; modifierClass = rightClass; } String updatedChar = modifyCharacter(baseChar, modifierChar); tokenizations.remove(tokenizations.size() - 1); if (tokenizations.size() > 0) { tokenizations.remove(tokenizations.size() - 1); } blabla.deleteCharAt(blabla.length() - 1); if (blabla.length() > 0) { blabla.deleteCharAt(blabla.length() - 1); } removeLastCharacterIfPresent(previousTok); if (updatedChar != null) { blabla.append(updatedChar); previousTok.setText(previousTok.getText() + updatedChar); } blabla.append(tok.substring(1, tok.length())); previousTok.setText(previousTok.getText() + tok.substring(1, tok.length())); tokenizations.add(previousTok.getText()); diaresis = (modifierClass == ModifierClass.DIAERESIS || modifierClass == ModifierClass.NORDIC_RING || modifierClass == ModifierClass.CZECH_CARON || modifierClass == ModifierClass.TILDE || modifierClass == ModifierClass.CEDILLA); accent = (modifierClass == ModifierClass.ACUTE_ACCENT || modifierClass == ModifierClass.CIRCUMFLEX || modifierClass == ModifierClass.GRAVE_ACCENT); if (rightClass != ModifierClass.NOT_A_MODIFIER) { tok = ""; // resetting current token as it // is a single-item } } } if (tok != null) { // actually in certain cases, the extracted string under token can be a chunk of text // with separators that need to be preserved // tok = tok.replace(" ", ""); } if ((!diaresis) && (!accent)) { // blabla.append(" "); blabla.append(tok); token.setText(tok); tokenizations.add(tok); } else { tok = ""; keepLast = true; } /* * StringTokenizer st0 = new StringTokenizer(tok0, * TextUtilities.fullPunctuations, true); * while(st0.hasMoreTokens()) { String tok = * st0.nextToken(); tokenizations.add(tok); } * tokenizations.add(" "); */ /* * boolean punct1 = false; boolean punct2 = false; * boolean punct3 = false; String content = null; int i * = 0; for(; i<TextUtilities.punctuations.length(); * i++) { if (tok.length() > 0) { if * (tok.charAt(tok.length()-1) == * TextUtilities.punctuations.charAt(i)) { punct1 = * true; content = tok.substring(0, tok.length()-1); if * (tok.length() > 1) { int j = 0; for(; * j<TextUtilities.punctuations.length(); j++) { if * (tok.charAt(tok.length()-2) == * TextUtilities.punctuations.charAt(j)) { punct3 = * true; content = tok.substring(0, tok.length()-2); } } * } break; } } } if (tok.length() > 0) { if ( * (tok.startsWith("(")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("("); } else if ( * (tok.startsWith("[")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("["); } else if ( * (tok.startsWith("\"")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("\""); } } */ if (currentRotation) currentFontSize = currentFontSize / 2; /* * if (punct2) { if (currentFont != null) * token.setFont(currentFont.toLowerCase()); else * token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); * * token = new LayoutToken(); token.setText(content); } * if (punct1) { token.setText(content); if (currentFont * != null) token.setFont(currentFont.toLowerCase()); * else token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); * * if (punct3) { token = new LayoutToken(); * token.setText(""+tok.charAt(tok.length()-2)); if * (currentFont != null) * token.setFont(currentFont.toLowerCase()); else * token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); } * * token = new LayoutToken(); * token.setText(""+tok.charAt(tok.length()-1)); } */ if (currentFont != null) token.setFont(currentFont.toLowerCase()); else token.setFont("default"); token.setItalic(currentItalic); token.setBold(currentBold); token.setRotation(currentRotation); token.setColorFont(colorFont); token.setX(currentX); token.setY(currentY); token.setWidth(currentWidth); token.setHeight(currentHeight); token.setFontSize(currentFontSize); if (!diaresis && !accent) { block.addToken(token); } if (block.getFont() == null) { if (currentFont != null) block.setFont(currentFont.toLowerCase()); else token.setFont("default"); } if (nbTokens == 0) { block.setItalic(currentItalic); block.setBold(currentBold); } if (block.getColorFont() == null) block.setColorFont(colorFont); if (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) block.setY(currentY); if (block.getWidth() == 0.0) block.setWidth(currentWidth); if (block.getHeight() == 0.0) block.setHeight(currentHeight); if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize); if (!diaresis && !accent) { previousToken = tok; previousTok = token; } else { previousToken = previousTok.getText(); } nbTokens++; accumulator.setLength(0); } } if (tokenizations.size() > 0) { String justBefore = tokenizations.get(tokenizations.size() - 1); if (!justBefore.endsWith("-")) { tokenizations.add(" "); blabla.append(" "); } } } block.setEndToken(tokenizations.size()); } else if (qName.equals("PAGE")) { // page marker are usefull to detect headers (same first line(s) // appearing on each page) if (block != null) { blabla.append("\n"); tokenizations.add("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } Block block0 = new Block(); block0.setText("@PAGE\n"); block0.setNbTokens(0); block0.setPage(currentPage); doc.addBlock(block0); block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); nbTokens = 0; // blabla.append("\n@block\n"); tokenizations.add("\n"); } else if (qName.equals("IMAGE")) { if (block != null) { blabla.append("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); if (images.size() > 0) { blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n"); } block.setText(blabla.toString()); block.setNbTokens(nbTokens); if (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) block.setY(currentY); if (block.getWidth() == 0.0) block.setWidth(currentWidth); if (block.getHeight() == 0.0) block.setHeight(currentHeight); doc.addBlock(block); blabla = new StringBuffer(); nbTokens = 0; block = new Block(); block.setPage(currentPage); } /* * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) { * blabla.append("\n"); block.setText(blabla.toString()); * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new * Block(); block.setPage(currentPage); blabla = new StringBuffer(); * blabla.append("@IMAGE " + "vectorial \n"); * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) * block.setY(currentY); if (block.getWidth() == 0.0) * block.setWidth(currentWidth); if (block.getHeight() == 0.0) * block.setHeight(currentHeight); doc.addBlock(block); blabla = new * StringBuffer(); nbTokens = 0; block = new Block(); * block.setPage(currentPage); } */ else if (qName.equals("BLOCK")) { blabla.append("\n"); tokenizations.add("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); block.setWidth(currentX - block.getX() + currentWidth); block.setHeight(currentY - block.getY() + currentHeight); doc.addBlock(block); // blabla = new StringBuffer(); nbTokens = 0; block = null; } else if (qName.equals("xi:include")) { if (block != null) { blabla.append("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); blabla = new StringBuffer(); nbTokens = 0; block = new Block(); block.setPage(currentPage); } /* * else if (qName.equals("DOCUMENT")) { * System.out.println(blabla.toString()); } */ }