/** * First pass to detect basic structures: remove page header/footer, identify section numbering, * identify Figure and table blocks. * * <p>-> to be removed at some point! * * @param doc a document */ public static void firstPass(Document doc) { if (doc == null) { throw new NullPointerException(); } if (doc.getBlocks() == null) { throw new NullPointerException(); } int i = 0; List<Integer> blockHeaders = new ArrayList<Integer>(); List<Integer> blockFooters = new ArrayList<Integer>(); List<Integer> blockSectionTitles = new ArrayList<Integer>(); List<Integer> acknowledgementBlocks = new ArrayList<Integer>(); List<Integer> blockTables = new ArrayList<Integer>(); List<Integer> blockFigures = new ArrayList<Integer>(); List<Integer> blockHeadTables = new ArrayList<Integer>(); List<Integer> blockHeadFigures = new ArrayList<Integer>(); List<Integer> blockDocumentHeaders = new ArrayList<Integer>(); doc.setTitleMatchNum(false); try { for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText); Matcher ma2 = BasicStructureBuilder.references.matcher(localText); if ((ma1.find()) || (ma2.find())) { if (((localText.startsWith("1.")) || (localText.startsWith("1 "))) || ((localText.startsWith("2.")) || (localText.startsWith("2 "))) || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true); // System.out.println("Title section identified: block " + i + ", " + localText); blockSectionTitles.add(i); } else { StringTokenizer st = new StringTokenizer(localText, "\n"); while (st.hasMoreTokens()) { String token = st.nextToken(); if (token.startsWith("@PAGE")) { // current block should give the header/footors if (i > 4) { if (doc.getBlocks().get(i - 5).getNbTokens() < 20) { Integer i2 = i - 5; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 3) { if (doc.getBlocks().get(i - 4).getNbTokens() < 20) { Integer i2 = i - 4; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 2) { if (doc.getBlocks().get(i - 3).getNbTokens() < 20) { Integer i2 = i - 3; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 1) { if (doc.getBlocks().get(i - 2).getNbTokens() < 20) { Integer i2 = i - 2; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 0) { if (doc.getBlocks().get(i - 1).getNbTokens() < 20) { Integer i2 = i - 1; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } blockFooters.add(i); // page header candidates blockHeaders.add(i); if (i < doc.getBlocks().size() - 1) { if (doc.getBlocks().get(i + 1).getNbTokens() < 20) { Integer i2 = i + 1; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1); } } if (i < doc.getBlocks().size() - 2) { if (doc.getBlocks().get(i + 2).getNbTokens() < 20) { Integer i2 = i + 2; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2); } } if (i < doc.getBlocks().size() - 3) { if (doc.getBlocks().get(i + 3).getNbTokens() < 20) { Integer i2 = i + 3; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3); } } if (i < doc.getBlocks().size() - 4) { if (doc.getBlocks().get(i + 4).getNbTokens() < 20) { Integer i2 = i + 4; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4); } } // more ?? } } } // clustering of blocks per font (for section header and figure/table detections) addBlockToCluster(i, doc); i++; } // try to find the cluster of section titles Cluster candidateCluster = null; // System.out.println("nb clusters: " + clusters.size()); for (Cluster cluster : doc.getClusters()) { if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5)) && (cluster.getNbBlocks() < 20)) { List<Integer> blo = cluster.getBlocks2(); for (Integer b : blo) { if (blockSectionTitles.contains(b)) { if (candidateCluster == null) { candidateCluster = cluster; break; } // else if (cluster.getFontSize() >= candidateCluster.getFontSize()) // candidateCluster = cluster; } } } } if (candidateCluster != null) { List<Integer> newBlockSectionTitles = new ArrayList<Integer>(); for (Integer bl : blockSectionTitles) { if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl); } List<Integer> blockClusterTitles = candidateCluster.getBlocks2(); if (blockClusterTitles.size() < 20) { for (Integer bl : blockClusterTitles) { if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl); } } blockSectionTitles = newBlockSectionTitles; } // aknowledgement section recognition boolean ackn = false; i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); // System.out.println(i + ": " + localText+"\n"); Integer iii = i; Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText); if ((m3.find()) && (blockSectionTitles.contains(iii))) { acknowledgementBlocks.add(iii); ackn = true; // int index = blockSectionTitles.indexOf(iii); // blockSectionTitles.remove(index); } else if ((ackn) && (blockSectionTitles.contains(iii))) { ackn = false; break; } else if (ackn) { Matcher m4 = BasicStructureBuilder.references.matcher(localText); if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) { acknowledgementBlocks.add(iii); } else if (m4.find()) { ackn = false; break; } } i++; } // we remove references headers in blockSectionTitles int index = -1; for (Integer ii : blockSectionTitles) { Block block = doc.getBlocks().get(ii); String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher m4 = BasicStructureBuilder.references.matcher(localText); if (m4.find()) { index = blockSectionTitles.indexOf(ii); break; } } if (index != -1) { blockSectionTitles.remove(index); } // we check headers repetition from page to page to decide if it is an header or not ArrayList<Integer> toRemove = new ArrayList<Integer>(); for (Integer ii : blockHeaders) { String localText = (doc.getBlocks().get(ii)).getText().trim(); localText = TextUtilities.shadowNumbers(localText); int length = localText.length(); if (length > 160) toRemove.add(ii); else { // System.out.println("header candidate: " + localText); // evaluate distance with other potential headers boolean valid = false; for (Integer ii2 : blockHeaders) { if (ii.intValue() != ii2.intValue()) { String localText2 = doc.getBlocks().get(ii2).getText().trim(); if (localText2.length() < 160) { localText2 = TextUtilities.shadowNumbers(localText2); double dist = (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length; // System.out.println("dist with " + localText2 + " : " + dist); if (dist < 0.25) { valid = true; break; } } } } if (!valid) { toRemove.add(ii); } } } for (Integer ii : toRemove) { blockHeaders.remove(ii); } // same for footers toRemove = new ArrayList<Integer>(); for (Integer ii : blockFooters) { String localText = (doc.getBlocks().get(ii)).getText().trim(); localText = TextUtilities.shadowNumbers(localText); int length = localText.length(); if (length > 160) toRemove.add(ii); else { // System.out.println("footer candidate: " + localText); // evaluate distance with other potential headers boolean valid = false; for (Integer ii2 : blockFooters) { if (ii.intValue() != ii2.intValue()) { String localText2 = doc.getBlocks().get(ii2).getText().trim(); if (localText2.length() < 160) { localText2 = TextUtilities.shadowNumbers(localText2); double dist = (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length; if (dist < 0.25) { valid = true; break; } } } } if (!valid) { toRemove.add(ii); } } } for (Integer ii : toRemove) { blockFooters.remove(ii); } // a special step for added banner repositoryies such HAL i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); // HAL if (localText.startsWith("Author manuscript, published in")) { Double y = block.getY(); // System.out.println("HAL banner found, " + "block " + i + ", y = " + y); if (Math.abs(y - 12.538) < 2) { // reference position // blockHeaders.add(new Integer(i)); blockDocumentHeaders.add(i); // System.out.println("HAL banner added as header block"); break; } } // ACM publications // System.out.println("test ACM " + i); // System.out.println(localText); if (localText.startsWith("Permission to make digital or hard copies")) { blockFooters.add(i); break; } // arXiv, etc. put here // IOP if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) { blockDocumentHeaders.add(i); // System.out.println("IOP banner added as header block"); break; } i++; } // we try to recognize here table and figure blocks // the idea is that the textual elements are not located as the normal text blocks // this is recognized by exploiting the cluster of blocks starting up and down front the block // containing a table or a figure marker // two different runs, one for figures and one for tables (everything could be done in one // step) i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher m = BasicStructureBuilder.figure.matcher(localText); Matcher m2 = BasicStructureBuilder.table.matcher(localText); double width = block.getWidth(); boolean bold = block.getBold(); // table // if ( (m2.find()) && (localText.length() < 200) ) { if ((m2.find()) && ((bold) || (localText.length() < 200))) { if (!blockHeadTables.contains(i)) { blockHeadTables.add(i); } // we also put all the small blocks before and after the marker int j = i - 1; while ((j > i - 15) && (j > 0)) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().length() < 160) || (width < 50)) { if ((!blockTables.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockTables.add(j); } else j = 0; } j--; } j = i + 1; while ((j < i + 15) && (j < doc.getBlocks().size())) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().length() < 160) || (width < 50)) { if ((!blockTables.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockTables.add(j); } else j = doc.getBlocks().size(); } j++; } } // figure // else if ( (m.find()) && (localText.length() < 200) ) { else if ((m.find()) && ((bold) || (localText.length() < 200))) { if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i); // we also put all the small blocks before and after the marker int j = i - 1; boolean imageFound = false; while ((j > i - 15) && (j > 0)) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { String localText2 = b.getText().trim(); // localText = localText.replace("\n", " "); localText2 = localText2.replace(" ", " "); localText2 = localText2.trim(); if ((localText2.startsWith("@IMAGE")) && (!imageFound)) { // System.out.println(localText2); block.setText(block.getText() + " " + localText2); // System.out.println(block.getText()); imageFound = true; } if ((localText2.length() < 160) || (width < 50)) { if ((!blockFigures.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockFigures.add(j); } else j = 0; } j--; } j = i + 1; while ((j < i + 15) && (j < doc.getBlocks().size())) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().trim().length() < 160) || (width < 50)) { if ((!blockFigures.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockFigures.add(j); } else j = doc.getBlocks().size(); } j++; } } i++; } } finally { doc.setBlockHeaders(blockHeaders); doc.setBlockFooters(blockFooters); doc.setBlockSectionTitles(blockSectionTitles); doc.setAcknowledgementBlocks(acknowledgementBlocks); doc.setBlockTables(blockTables); doc.setBlockFigures(blockFigures); doc.setBlockHeadTables(blockHeadTables); doc.setBlockHeadFigures(blockHeadFigures); doc.setBlockDocumentHeaders(blockDocumentHeaders); } }
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws SAXException { // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") && // !qName.equals("TEXT")) // System.out.println(qName); if (qName.equals("TEXT")) { blabla.append("\n"); LayoutToken token = new LayoutToken(); token.setText("\n"); block.addToken(token); accumulator.setLength(0); tokenizations.add("\n"); } else if (qName.equals("METADATA")) { accumulator.setLength(0); } else if (qName.equals("TOKEN")) { String tok0 = TextUtilities.clean(getText()); if (block.getStartToken() == -1) { block.setStartToken(tokenizations.size()); } if (tok0.length() > 0) { StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true); boolean diaresis = false; boolean accent = false; boolean keepLast = false; while (st.hasMoreTokens()) { diaresis = false; accent = false; keepLast = false; String tok = st.nextToken(); if (tok.length() > 0) { LayoutToken token = new LayoutToken(); if ((previousToken != null) && (tok != null) && (previousToken.length() > 0) && (tok.length() > 0) && blabla.length() > 0) { Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1); Character rightChar = tok.charAt(0); ModifierClass leftClass = classifyChar(leftChar); ModifierClass rightClass = classifyChar(rightChar); ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER; if (leftClass != ModifierClass.NOT_A_MODIFIER || rightClass != ModifierClass.NOT_A_MODIFIER) { Character baseChar = null; Character modifierChar = null; if (leftClass != ModifierClass.NOT_A_MODIFIER) { if (rightClass != ModifierClass.NOT_A_MODIFIER) { // assert false; // keeping characters, but setting class // to not a modifier baseChar = leftChar; modifierChar = rightChar; modifierClass = ModifierClass.NOT_A_MODIFIER; } else { baseChar = rightChar; modifierChar = leftChar; modifierClass = leftClass; } } else { baseChar = leftChar; modifierChar = rightChar; modifierClass = rightClass; } String updatedChar = modifyCharacter(baseChar, modifierChar); tokenizations.remove(tokenizations.size() - 1); if (tokenizations.size() > 0) { tokenizations.remove(tokenizations.size() - 1); } blabla.deleteCharAt(blabla.length() - 1); if (blabla.length() > 0) { blabla.deleteCharAt(blabla.length() - 1); } removeLastCharacterIfPresent(previousTok); if (updatedChar != null) { blabla.append(updatedChar); previousTok.setText(previousTok.getText() + updatedChar); } blabla.append(tok.substring(1, tok.length())); previousTok.setText(previousTok.getText() + tok.substring(1, tok.length())); tokenizations.add(previousTok.getText()); diaresis = (modifierClass == ModifierClass.DIAERESIS || modifierClass == ModifierClass.NORDIC_RING || modifierClass == ModifierClass.CZECH_CARON || modifierClass == ModifierClass.TILDE || modifierClass == ModifierClass.CEDILLA); accent = (modifierClass == ModifierClass.ACUTE_ACCENT || modifierClass == ModifierClass.CIRCUMFLEX || modifierClass == ModifierClass.GRAVE_ACCENT); if (rightClass != ModifierClass.NOT_A_MODIFIER) { tok = ""; // resetting current token as it // is a single-item } } } if (tok != null) { // actually in certain cases, the extracted string under token can be a chunk of text // with separators that need to be preserved // tok = tok.replace(" ", ""); } if ((!diaresis) && (!accent)) { // blabla.append(" "); blabla.append(tok); token.setText(tok); tokenizations.add(tok); } else { tok = ""; keepLast = true; } /* * StringTokenizer st0 = new StringTokenizer(tok0, * TextUtilities.fullPunctuations, true); * while(st0.hasMoreTokens()) { String tok = * st0.nextToken(); tokenizations.add(tok); } * tokenizations.add(" "); */ /* * boolean punct1 = false; boolean punct2 = false; * boolean punct3 = false; String content = null; int i * = 0; for(; i<TextUtilities.punctuations.length(); * i++) { if (tok.length() > 0) { if * (tok.charAt(tok.length()-1) == * TextUtilities.punctuations.charAt(i)) { punct1 = * true; content = tok.substring(0, tok.length()-1); if * (tok.length() > 1) { int j = 0; for(; * j<TextUtilities.punctuations.length(); j++) { if * (tok.charAt(tok.length()-2) == * TextUtilities.punctuations.charAt(j)) { punct3 = * true; content = tok.substring(0, tok.length()-2); } } * } break; } } } if (tok.length() > 0) { if ( * (tok.startsWith("(")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("("); } else if ( * (tok.startsWith("[")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("["); } else if ( * (tok.startsWith("\"")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("\""); } } */ if (currentRotation) currentFontSize = currentFontSize / 2; /* * if (punct2) { if (currentFont != null) * token.setFont(currentFont.toLowerCase()); else * token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); * * token = new LayoutToken(); token.setText(content); } * if (punct1) { token.setText(content); if (currentFont * != null) token.setFont(currentFont.toLowerCase()); * else token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); * * if (punct3) { token = new LayoutToken(); * token.setText(""+tok.charAt(tok.length()-2)); if * (currentFont != null) * token.setFont(currentFont.toLowerCase()); else * token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); } * * token = new LayoutToken(); * token.setText(""+tok.charAt(tok.length()-1)); } */ if (currentFont != null) token.setFont(currentFont.toLowerCase()); else token.setFont("default"); token.setItalic(currentItalic); token.setBold(currentBold); token.setRotation(currentRotation); token.setColorFont(colorFont); token.setX(currentX); token.setY(currentY); token.setWidth(currentWidth); token.setHeight(currentHeight); token.setFontSize(currentFontSize); if (!diaresis && !accent) { block.addToken(token); } if (block.getFont() == null) { if (currentFont != null) block.setFont(currentFont.toLowerCase()); else token.setFont("default"); } if (nbTokens == 0) { block.setItalic(currentItalic); block.setBold(currentBold); } if (block.getColorFont() == null) block.setColorFont(colorFont); if (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) block.setY(currentY); if (block.getWidth() == 0.0) block.setWidth(currentWidth); if (block.getHeight() == 0.0) block.setHeight(currentHeight); if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize); if (!diaresis && !accent) { previousToken = tok; previousTok = token; } else { previousToken = previousTok.getText(); } nbTokens++; accumulator.setLength(0); } } if (tokenizations.size() > 0) { String justBefore = tokenizations.get(tokenizations.size() - 1); if (!justBefore.endsWith("-")) { tokenizations.add(" "); blabla.append(" "); } } } block.setEndToken(tokenizations.size()); } else if (qName.equals("PAGE")) { // page marker are usefull to detect headers (same first line(s) // appearing on each page) if (block != null) { blabla.append("\n"); tokenizations.add("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } Block block0 = new Block(); block0.setText("@PAGE\n"); block0.setNbTokens(0); block0.setPage(currentPage); doc.addBlock(block0); block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); nbTokens = 0; // blabla.append("\n@block\n"); tokenizations.add("\n"); } else if (qName.equals("IMAGE")) { if (block != null) { blabla.append("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); if (images.size() > 0) { blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n"); } block.setText(blabla.toString()); block.setNbTokens(nbTokens); if (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) block.setY(currentY); if (block.getWidth() == 0.0) block.setWidth(currentWidth); if (block.getHeight() == 0.0) block.setHeight(currentHeight); doc.addBlock(block); blabla = new StringBuffer(); nbTokens = 0; block = new Block(); block.setPage(currentPage); } /* * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) { * blabla.append("\n"); block.setText(blabla.toString()); * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new * Block(); block.setPage(currentPage); blabla = new StringBuffer(); * blabla.append("@IMAGE " + "vectorial \n"); * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) * block.setY(currentY); if (block.getWidth() == 0.0) * block.setWidth(currentWidth); if (block.getHeight() == 0.0) * block.setHeight(currentHeight); doc.addBlock(block); blabla = new * StringBuffer(); nbTokens = 0; block = new Block(); * block.setPage(currentPage); } */ else if (qName.equals("BLOCK")) { blabla.append("\n"); tokenizations.add("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); block.setWidth(currentX - block.getX() + currentWidth); block.setHeight(currentY - block.getY() + currentHeight); doc.addBlock(block); // blabla = new StringBuffer(); nbTokens = 0; block = null; } else if (qName.equals("xi:include")) { if (block != null) { blabla.append("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); blabla = new StringBuffer(); nbTokens = 0; block = new Block(); block.setPage(currentPage); } /* * else if (qName.equals("DOCUMENT")) { * System.out.println(blabla.toString()); } */ }