/** * First pass to detect basic structures: remove page header/footer, identify section numbering, * identify Figure and table blocks. * * <p>-> to be removed at some point! * * @param doc a document */ public static void firstPass(Document doc) { if (doc == null) { throw new NullPointerException(); } if (doc.getBlocks() == null) { throw new NullPointerException(); } int i = 0; List<Integer> blockHeaders = new ArrayList<Integer>(); List<Integer> blockFooters = new ArrayList<Integer>(); List<Integer> blockSectionTitles = new ArrayList<Integer>(); List<Integer> acknowledgementBlocks = new ArrayList<Integer>(); List<Integer> blockTables = new ArrayList<Integer>(); List<Integer> blockFigures = new ArrayList<Integer>(); List<Integer> blockHeadTables = new ArrayList<Integer>(); List<Integer> blockHeadFigures = new ArrayList<Integer>(); List<Integer> blockDocumentHeaders = new ArrayList<Integer>(); doc.setTitleMatchNum(false); try { for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText); Matcher ma2 = BasicStructureBuilder.references.matcher(localText); if ((ma1.find()) || (ma2.find())) { if (((localText.startsWith("1.")) || (localText.startsWith("1 "))) || ((localText.startsWith("2.")) || (localText.startsWith("2 "))) || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true); // System.out.println("Title section identified: block " + i + ", " + localText); blockSectionTitles.add(i); } else { StringTokenizer st = new StringTokenizer(localText, "\n"); while (st.hasMoreTokens()) { String token = st.nextToken(); if (token.startsWith("@PAGE")) { // current block should give the header/footors if (i > 4) { if (doc.getBlocks().get(i - 5).getNbTokens() < 20) { Integer i2 = i - 5; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 3) { if (doc.getBlocks().get(i - 4).getNbTokens() < 20) { Integer i2 = i - 4; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 2) { if (doc.getBlocks().get(i - 3).getNbTokens() < 20) { Integer i2 = i - 3; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 1) { if (doc.getBlocks().get(i - 2).getNbTokens() < 20) { Integer i2 = i - 2; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 0) { if (doc.getBlocks().get(i - 1).getNbTokens() < 20) { Integer i2 = i - 1; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } blockFooters.add(i); // page header candidates blockHeaders.add(i); if (i < doc.getBlocks().size() - 1) { if (doc.getBlocks().get(i + 1).getNbTokens() < 20) { Integer i2 = i + 1; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1); } } if (i < doc.getBlocks().size() - 2) { if (doc.getBlocks().get(i + 2).getNbTokens() < 20) { Integer i2 = i + 2; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2); } } if (i < doc.getBlocks().size() - 3) { if (doc.getBlocks().get(i + 3).getNbTokens() < 20) { Integer i2 = i + 3; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3); } } if (i < doc.getBlocks().size() - 4) { if (doc.getBlocks().get(i + 4).getNbTokens() < 20) { Integer i2 = i + 4; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4); } } // more ?? } } } // clustering of blocks per font (for section header and figure/table detections) addBlockToCluster(i, doc); i++; } // try to find the cluster of section titles Cluster candidateCluster = null; // System.out.println("nb clusters: " + clusters.size()); for (Cluster cluster : doc.getClusters()) { if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5)) && (cluster.getNbBlocks() < 20)) { List<Integer> blo = cluster.getBlocks2(); for (Integer b : blo) { if (blockSectionTitles.contains(b)) { if (candidateCluster == null) { candidateCluster = cluster; break; } // else if (cluster.getFontSize() >= candidateCluster.getFontSize()) // candidateCluster = cluster; } } } } if (candidateCluster != null) { List<Integer> newBlockSectionTitles = new ArrayList<Integer>(); for (Integer bl : blockSectionTitles) { if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl); } List<Integer> blockClusterTitles = candidateCluster.getBlocks2(); if (blockClusterTitles.size() < 20) { for (Integer bl : blockClusterTitles) { if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl); } } blockSectionTitles = newBlockSectionTitles; } // aknowledgement section recognition boolean ackn = false; i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); // System.out.println(i + ": " + localText+"\n"); Integer iii = i; Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText); if ((m3.find()) && (blockSectionTitles.contains(iii))) { acknowledgementBlocks.add(iii); ackn = true; // int index = blockSectionTitles.indexOf(iii); // blockSectionTitles.remove(index); } else if ((ackn) && (blockSectionTitles.contains(iii))) { ackn = false; break; } else if (ackn) { Matcher m4 = BasicStructureBuilder.references.matcher(localText); if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) { acknowledgementBlocks.add(iii); } else if (m4.find()) { ackn = false; break; } } i++; } // we remove references headers in blockSectionTitles int index = -1; for (Integer ii : blockSectionTitles) { Block block = doc.getBlocks().get(ii); String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher m4 = BasicStructureBuilder.references.matcher(localText); if (m4.find()) { index = blockSectionTitles.indexOf(ii); break; } } if (index != -1) { blockSectionTitles.remove(index); } // we check headers repetition from page to page to decide if it is an header or not ArrayList<Integer> toRemove = new ArrayList<Integer>(); for (Integer ii : blockHeaders) { String localText = (doc.getBlocks().get(ii)).getText().trim(); localText = TextUtilities.shadowNumbers(localText); int length = localText.length(); if (length > 160) toRemove.add(ii); else { // System.out.println("header candidate: " + localText); // evaluate distance with other potential headers boolean valid = false; for (Integer ii2 : blockHeaders) { if (ii.intValue() != ii2.intValue()) { String localText2 = doc.getBlocks().get(ii2).getText().trim(); if (localText2.length() < 160) { localText2 = TextUtilities.shadowNumbers(localText2); double dist = (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length; // System.out.println("dist with " + localText2 + " : " + dist); if (dist < 0.25) { valid = true; break; } } } } if (!valid) { toRemove.add(ii); } } } for (Integer ii : toRemove) { blockHeaders.remove(ii); } // same for footers toRemove = new ArrayList<Integer>(); for (Integer ii : blockFooters) { String localText = (doc.getBlocks().get(ii)).getText().trim(); localText = TextUtilities.shadowNumbers(localText); int length = localText.length(); if (length > 160) toRemove.add(ii); else { // System.out.println("footer candidate: " + localText); // evaluate distance with other potential headers boolean valid = false; for (Integer ii2 : blockFooters) { if (ii.intValue() != ii2.intValue()) { String localText2 = doc.getBlocks().get(ii2).getText().trim(); if (localText2.length() < 160) { localText2 = TextUtilities.shadowNumbers(localText2); double dist = (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length; if (dist < 0.25) { valid = true; break; } } } } if (!valid) { toRemove.add(ii); } } } for (Integer ii : toRemove) { blockFooters.remove(ii); } // a special step for added banner repositoryies such HAL i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); // HAL if (localText.startsWith("Author manuscript, published in")) { Double y = block.getY(); // System.out.println("HAL banner found, " + "block " + i + ", y = " + y); if (Math.abs(y - 12.538) < 2) { // reference position // blockHeaders.add(new Integer(i)); blockDocumentHeaders.add(i); // System.out.println("HAL banner added as header block"); break; } } // ACM publications // System.out.println("test ACM " + i); // System.out.println(localText); if (localText.startsWith("Permission to make digital or hard copies")) { blockFooters.add(i); break; } // arXiv, etc. put here // IOP if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) { blockDocumentHeaders.add(i); // System.out.println("IOP banner added as header block"); break; } i++; } // we try to recognize here table and figure blocks // the idea is that the textual elements are not located as the normal text blocks // this is recognized by exploiting the cluster of blocks starting up and down front the block // containing a table or a figure marker // two different runs, one for figures and one for tables (everything could be done in one // step) i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher m = BasicStructureBuilder.figure.matcher(localText); Matcher m2 = BasicStructureBuilder.table.matcher(localText); double width = block.getWidth(); boolean bold = block.getBold(); // table // if ( (m2.find()) && (localText.length() < 200) ) { if ((m2.find()) && ((bold) || (localText.length() < 200))) { if (!blockHeadTables.contains(i)) { blockHeadTables.add(i); } // we also put all the small blocks before and after the marker int j = i - 1; while ((j > i - 15) && (j > 0)) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().length() < 160) || (width < 50)) { if ((!blockTables.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockTables.add(j); } else j = 0; } j--; } j = i + 1; while ((j < i + 15) && (j < doc.getBlocks().size())) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().length() < 160) || (width < 50)) { if ((!blockTables.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockTables.add(j); } else j = doc.getBlocks().size(); } j++; } } // figure // else if ( (m.find()) && (localText.length() < 200) ) { else if ((m.find()) && ((bold) || (localText.length() < 200))) { if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i); // we also put all the small blocks before and after the marker int j = i - 1; boolean imageFound = false; while ((j > i - 15) && (j > 0)) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { String localText2 = b.getText().trim(); // localText = localText.replace("\n", " "); localText2 = localText2.replace(" ", " "); localText2 = localText2.trim(); if ((localText2.startsWith("@IMAGE")) && (!imageFound)) { // System.out.println(localText2); block.setText(block.getText() + " " + localText2); // System.out.println(block.getText()); imageFound = true; } if ((localText2.length() < 160) || (width < 50)) { if ((!blockFigures.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockFigures.add(j); } else j = 0; } j--; } j = i + 1; while ((j < i + 15) && (j < doc.getBlocks().size())) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().trim().length() < 160) || (width < 50)) { if ((!blockFigures.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockFigures.add(j); } else j = doc.getBlocks().size(); } j++; } } i++; } } finally { doc.setBlockHeaders(blockHeaders); doc.setBlockFooters(blockFooters); doc.setBlockSectionTitles(blockSectionTitles); doc.setAcknowledgementBlocks(acknowledgementBlocks); doc.setBlockTables(blockTables); doc.setBlockFigures(blockFigures); doc.setBlockHeadTables(blockHeadTables); doc.setBlockHeadFigures(blockHeadFigures); doc.setBlockDocumentHeaders(blockDocumentHeaders); } }