예제 #1
0
  /**
   * First pass to detect basic structures: remove page header/footer, identify section numbering,
   * identify Figure and table blocks.
   *
   * <p>-> to be removed at some point!
   *
   * <p>The pass walks {@code doc.getBlocks()} several times and collects block indices into
   * separate lists (page headers, page footers, section titles, acknowledgement blocks, table and
   * figure blocks and their caption heads, document-level banners). All lists are stored on the
   * document in a {@code finally} clause, so whatever was collected before a failure is still
   * published to {@code doc}.
   *
   * @param doc a document
   * @throws NullPointerException if {@code doc} or {@code doc.getBlocks()} is {@code null}
   */
  public static void firstPass(Document doc) {
    if (doc == null) {
      throw new NullPointerException("doc must not be null");
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException("doc.getBlocks() must not be null");
    }

    int i = 0;
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();
    List<Integer> acknowledgementBlocks = new ArrayList<Integer>();
    List<Integer> blockTables = new ArrayList<Integer>();
    List<Integer> blockFigures = new ArrayList<Integer>();
    List<Integer> blockHeadTables = new ArrayList<Integer>();
    List<Integer> blockHeadFigures = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();

    doc.setTitleMatchNum(false);
    try {
      // Step 1: section title candidates, page header/footer candidates, font clustering.
      for (Block block : doc.getBlocks()) {
        String localText = normalizedBlockText(block);

        Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText);
        Matcher ma2 = BasicStructureBuilder.references.matcher(localText);

        if (ma1.find() || ma2.find()) {
          if (localText.startsWith("1.")
              || localText.startsWith("1 ")
              || localText.startsWith("2.")
              || localText.startsWith("2 ")
              || localText.startsWith("Contents")) {
            doc.setTitleMatchNum(true);
          }
          blockSectionTitles.add(i);
        } else if (localText.startsWith("@PAGE")) {
          // The normalized text no longer contains '\n', so the block is tested as a whole
          // (the former per-line tokenizer could only ever yield this single token).

          // page footer candidates: up to five small blocks preceding the page marker,
          // farthest first, then the marker block itself
          for (int offset = 5; offset >= 1; offset--) {
            int previous = i - offset;
            if ((previous >= 0)
                && (doc.getBlocks().get(previous).getNbTokens() < 20)
                && (!blockFooters.contains(previous))) {
              blockFooters.add(previous);
            }
          }
          blockFooters.add(i);

          // page header candidates: the marker block, then up to four small blocks following it.
          // The marker may already be listed as a follower of an earlier page marker, hence the
          // guard against a duplicate entry.
          if (!blockHeaders.contains(i)) {
            blockHeaders.add(i);
          }
          for (int offset = 1; offset <= 4; offset++) {
            int next = i + offset;
            if ((next < doc.getBlocks().size())
                && (doc.getBlocks().get(next).getNbTokens() < 20)
                && (!blockHeaders.contains(next))) {
              blockHeaders.add(next);
            }
          }
          // more ??
        }

        // clustering of blocks per font (for section header and figure/table detections)
        addBlockToCluster(i, doc);

        i++;
      }

      // Step 2: try to find the cluster of section titles. The first small cluster (fewer than
      // a fifth of all blocks and fewer than 20 blocks) that contains a known section title wins.
      Cluster candidateCluster = null;
      for (Cluster cluster : doc.getClusters()) {
        boolean smallCluster =
            (cluster.getNbBlocks() < (doc.getBlocks().size() / 5))
                && (cluster.getNbBlocks() < 20);
        if (!smallCluster) {
          continue;
        }
        for (Integer clusterBlock : cluster.getBlocks2()) {
          if (blockSectionTitles.contains(clusterBlock)) {
            candidateCluster = cluster;
            break;
          }
        }
        if (candidateCluster != null) {
          break;
        }
      }
      if (candidateCluster != null) {
        // merge the known titles with the blocks of the candidate cluster, without duplicates
        List<Integer> mergedTitles = new ArrayList<Integer>();
        for (Integer title : blockSectionTitles) {
          if (!mergedTitles.contains(title)) {
            mergedTitles.add(title);
          }
        }

        List<Integer> clusterTitles = candidateCluster.getBlocks2();
        if (clusterTitles.size() < 20) {
          for (Integer title : clusterTitles) {
            if (!mergedTitles.contains(title)) {
              mergedTitles.add(title);
            }
          }
        }

        blockSectionTitles = mergedTitles;
      }

      // Step 3: acknowledgement section recognition. The section starts at a section title
      // matching the acknowledgement pattern and runs until the next section title or a block
      // matching the references pattern; page footer blocks in between are skipped.
      boolean inAcknowledgement = false;
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = normalizedBlockText(block);

        Integer blockIndex = i;
        boolean isSectionTitle = blockSectionTitles.contains(blockIndex);
        Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText);
        if (m3.find() && isSectionTitle) {
          acknowledgementBlocks.add(blockIndex);
          inAcknowledgement = true;
        } else if (inAcknowledgement && isSectionTitle) {
          break;
        } else if (inAcknowledgement) {
          // Matcher.find() is stateful: evaluate it exactly once. Calling it a second time would
          // look for a *second* occurrence and miss a single "references" match.
          boolean startsReferences =
              BasicStructureBuilder.references.matcher(localText).find();
          if (startsReferences) {
            break;
          }
          if (!blockFooters.contains(blockIndex)) {
            acknowledgementBlocks.add(blockIndex);
          }
        }
        i++;
      }

      // Step 4: we remove the (first) references header from blockSectionTitles
      for (int position = 0; position < blockSectionTitles.size(); position++) {
        Block titleBlock = doc.getBlocks().get(blockSectionTitles.get(position));
        Matcher m4 = BasicStructureBuilder.references.matcher(normalizedBlockText(titleBlock));
        if (m4.find()) {
          blockSectionTitles.remove(position); // remove(int): removal by position
          break;
        }
      }

      // Step 5: we check headers repetition from page to page to decide if it is an header or
      // not, and the same for footers
      pruneUnrepeatedPageBlocks(doc, blockHeaders);
      pruneUnrepeatedPageBlocks(doc, blockFooters);

      // Step 6: a special step for banners added by repositories such as HAL.
      // The scan stops at the first banner recognized.
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = normalizedBlockText(block);

        // HAL
        if (localText.startsWith("Author manuscript, published in")) {
          double y = block.getY();
          if (Math.abs(y - 12.538) < 2) { // reference position
            blockDocumentHeaders.add(i);
            break;
          }
        }

        // ACM publications
        if (localText.startsWith("Permission to make digital or hard copies")) {
          blockFooters.add(i);
          break;
        }

        // arXiv, etc. put here
        // IOP
        if (localText.startsWith("Confidential: ") && localText.contains("IOP")) {
          blockDocumentHeaders.add(i);
          break;
        }
        i++;
      }

      // Step 7: we try to recognize here table and figure blocks
      // the idea is that the textual elements are not located as the normal text blocks
      // this is recognized by exploiting the cluster of blocks starting up and down from the
      // block containing a table or a figure marker
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = normalizedBlockText(block);

        Matcher figureMatcher = BasicStructureBuilder.figure.matcher(localText);
        Matcher tableMatcher = BasicStructureBuilder.table.matcher(localText);

        // NOTE(review): width is the width of the marker block; the neighbour scans below test
        // this value rather than the neighbour's own width -- confirm this is intended.
        double width = block.getWidth();
        boolean bold = block.getBold();
        boolean boldOrShort = bold || (localText.length() < 200);

        if (tableMatcher.find() && boldOrShort) {
          // table
          if (!blockHeadTables.contains(i)) {
            blockHeadTables.add(i);
          }
          // we also put all the small blocks before and after the marker (at most 14 each way).
          // NOTE(review): the backward scan stops before block 0 (j > 0) -- confirm intended.
          for (int j = i - 1; (j > i - 15) && (j > 0); j--) {
            Block neighbour = doc.getBlocks().get(j);
            if (neighbour.getText() == null) {
              continue;
            }
            // NOTE(review): untrimmed length here, trimmed length in the figure scans.
            if ((neighbour.getText().length() < 160) || (width < 50)) {
              if ((!blockTables.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) {
                blockTables.add(j);
              }
            } else {
              break; // a large block ends the table area
            }
          }

          for (int j = i + 1; (j < i + 15) && (j < doc.getBlocks().size()); j++) {
            Block neighbour = doc.getBlocks().get(j);
            if (neighbour.getText() == null) {
              continue;
            }
            if ((neighbour.getText().length() < 160) || (width < 50)) {
              if ((!blockTables.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) {
                blockTables.add(j);
              }
            } else {
              break;
            }
          }
        } else if (figureMatcher.find() && boldOrShort) {
          // figure
          if (!blockHeadFigures.contains(i)) {
            blockHeadFigures.add(i);
          }
          // we also put all the small blocks before and after the marker
          boolean imageFound = false;
          for (int j = i - 1; (j > i - 15) && (j > 0); j--) {
            Block neighbour = doc.getBlocks().get(j);
            if (neighbour.getText() == null) {
              continue;
            }
            String neighbourText = neighbour.getText().trim();
            neighbourText = neighbourText.replace("  ", " ");
            neighbourText = neighbourText.trim();

            // the closest preceding image marker is appended to the text of the caption block
            if (neighbourText.startsWith("@IMAGE") && !imageFound) {
              block.setText(block.getText() + " " + neighbourText);
              imageFound = true;
            }

            if ((neighbourText.length() < 160) || (width < 50)) {
              if ((!blockFigures.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) {
                blockFigures.add(j);
              }
            } else {
              break; // a large block ends the figure area
            }
          }

          for (int j = i + 1; (j < i + 15) && (j < doc.getBlocks().size()); j++) {
            Block neighbour = doc.getBlocks().get(j);
            if (neighbour.getText() == null) {
              continue;
            }
            if ((neighbour.getText().trim().length() < 160) || (width < 50)) {
              if ((!blockFigures.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) {
                blockFigures.add(j);
              }
            } else {
              break;
            }
          }
        }
        i++;
      }
    } finally {
      // publish whatever has been collected, even if one of the steps above failed
      doc.setBlockHeaders(blockHeaders);
      doc.setBlockFooters(blockFooters);
      doc.setBlockSectionTitles(blockSectionTitles);
      doc.setAcknowledgementBlocks(acknowledgementBlocks);
      doc.setBlockTables(blockTables);
      doc.setBlockFigures(blockFigures);
      doc.setBlockHeadTables(blockHeadTables);
      doc.setBlockHeadFigures(blockHeadFigures);
      doc.setBlockDocumentHeaders(blockDocumentHeaders);
    }
  }

  /**
   * Returns the text of a block on a single line: trimmed, with every line break replaced by a
   * space and one pass of double-space collapsing (longer runs of spaces are only shortened).
   */
  private static String normalizedBlockText(Block block) {
    String text = block.getText().trim();
    text = text.replace("\n", " ");
    text = text.replace("  ", " ");
    return text.trim();
  }

  /**
   * Removes from {@code candidates} every block index whose text is longer than 160 characters or
   * is not close (normalized Levenshtein distance below 0.25) to the text of at least one other
   * candidate shorter than 160 characters. Texts are compared after {@code
   * TextUtilities.shadowNumbers} (presumably masking digits such as page numbers -- confirm).
   *
   * <p>A candidate with empty text divides by zero in floating point, giving NaN or infinity,
   * which never passes the threshold, so such a candidate is removed.
   */
  private static void pruneUnrepeatedPageBlocks(Document doc, List<Integer> candidates) {
    List<Integer> toRemove = new ArrayList<Integer>();
    for (Integer candidate : candidates) {
      String text = doc.getBlocks().get(candidate).getText().trim();
      text = TextUtilities.shadowNumbers(text);
      int length = text.length();
      if (length > 160) {
        toRemove.add(candidate);
        continue;
      }

      // evaluate distance with the other candidates
      boolean repeated = false;
      for (Integer other : candidates) {
        if (candidate.intValue() == other.intValue()) {
          continue;
        }
        String otherText = doc.getBlocks().get(other).getText().trim();
        if (otherText.length() < 160) {
          otherText = TextUtilities.shadowNumbers(otherText);
          double dist = (double) TextUtilities.getLevenshteinDistance(text, otherText) / length;
          if (dist < 0.25) {
            repeated = true;
            break;
          }
        }
      }
      if (!repeated) {
        toRemove.add(candidate);
      }
    }

    for (Integer candidate : toRemove) {
      candidates.remove(candidate); // remove(Object): removal by value, not by position
    }
  }