Пример #1
0
  /**
   * Cluster the blocks following the font, style and size aspects
   *
   * <p>-> not used at this stage, but could be an interesting feature in the full text model in the
   * future
   *
   * @param b integer
   * @param doc a document
   */
  private static void addBlockToCluster(Integer b, Document doc) {
    // get block features
    Block block = doc.getBlocks().get(b);
    String font = block.getFont();
    boolean bold = block.getBold();
    boolean italic = block.getItalic();
    double fontSize = block.getFontSize();
    boolean found = false;

    if (font == null) {
      font = "unknown";
    }
    // System.out.println(font + " " + bold + " " + italic + " " + fontSize );

    if (doc.getClusters() == null) {
      doc.setClusters(new ArrayList<Cluster>());
    } else {
      for (Cluster cluster : doc.getClusters()) {
        String font2 = cluster.getFont();
        if (font2 == null) font2 = "unknown";
        if (font.equals(font2)
            && (bold == cluster.getBold())
                & (italic == cluster.getItalic())
                & (fontSize == cluster.getFontSize())) {
          cluster.addBlock2(b);
          found = true;
        }
      }
    }

    if (!found) {
      Cluster cluster = new Cluster();
      cluster.setFont(font);
      cluster.setBold(bold);
      cluster.setItalic(italic);
      cluster.setFontSize(fontSize);
      cluster.addBlock2(b);
      doc.getClusters().add(cluster);
    }
  }
Пример #2
0
  /**
   * Filter out line numbering possibly present in the document. This can be frequent for document
   * in a review/submission format and degrades strongly the machine learning extraction results.
   *
   * <p>-> Not used !
   *
   * @param doc a document
   * @return if found numbering
   */
  public boolean filterLineNumber(Document doc) {
    // we first test if we have a line numbering by checking if we have an increasing integer
    // at the begin or the end of each block
    boolean numberBeginLine = false;
    boolean numberEndLine = false;

    boolean foundNumbering = false;

    int currentNumber = -1;
    int lastNumber = -1;
    int i = 0;
    for (Block block : doc.getBlocks()) {
      //            Integer ii = i;

      String localText = block.getText();
      List<LayoutToken> tokens = block.tokens;

      if ((localText != null) && (tokens != null)) {
        if (tokens.size() > 0) {
          // we get the first and last token iof the block
          // String tok1 = tokens.get(0).getText();
          // String tok2 = tokens.get(tokens.size()).getText();
          localText = localText.trim();

          Matcher ma1 = startNum.matcher(localText);
          Matcher ma2 = endNum.matcher(localText);

          if (ma1.find()) {
            String groupStr = ma1.group(0);
            try {
              currentNumber = Integer.parseInt(groupStr);
              numberBeginLine = true;
            } catch (NumberFormatException e) {
              currentNumber = -1;
            }
          } else if (ma2.find()) {
            String groupStr = ma2.group(0);
            try {
              currentNumber = Integer.parseInt(groupStr);
              numberEndLine = true;
            } catch (NumberFormatException e) {
              currentNumber = -1;
            }
          }

          if (lastNumber != -1) {
            if (currentNumber == lastNumber + 1) {
              foundNumbering = true;
              break;
            }
          } else lastNumber = currentNumber;
        }
      }
      i++;

      if (i > 5) {
        break;
      }
    }

    i = 0;
    if (foundNumbering) {
      // we have a line numbering, so we filter them
      int counter = 1; // we start at 1, if the actual start is 0,
      // it will remain (as it is negligeable)

      for (Block block : doc.getBlocks()) {

        String localText = block.getText();
        List<LayoutToken> tokens = block.tokens;

        if ((localText != null) && (tokens.size() > 0)) {

          if (numberEndLine) {
            Matcher ma2 = endNum.matcher(localText);

            if (ma2.find()) {
              String groupStr = ma2.group(0);
              if (groupStr.trim().equals("" + counter)) {
                localText = localText.substring(0, localText.length() - groupStr.length());
                block.setText(localText);
                tokens.remove(tokens.size() - 1);
                counter++;
              }
            }

          } else if (numberBeginLine) {
            Matcher ma1 = endNum.matcher(localText);

            if (ma1.find()) {
              String groupStr = ma1.group(0);
              if (groupStr.trim().equals("" + counter)) {
                localText = localText.substring(groupStr.length(), localText.length() - 1);
                block.setText(localText);
                tokens.remove(0);
                counter++;
              }
            }
          }
        }
        i++;
      }
    }

    return foundNumbering;
  }
Пример #3
0
  /**
   * Set the main segments of the document based on the full text parsing results
   *
   * @param doc a document
   * @param labeledResult string
   * @param tokenizations tokens
   * @return a document
   */
  public static Document resultSegmentation(
      Document doc, String labeledResult, List<String> tokenizations) {
    if (doc == null) {
      throw new NullPointerException("Document is null");
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException("Blocks of the documents are null");
    }
    // System.out.println(tokenizations.toString());
    //        int i = 0;
    //        boolean first = true;
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();

    SortedSet<DocumentPiece> blockReferences = new TreeSet<DocumentPiece>();

    doc.setBibDataSets(new ArrayList<BibDataSet>());

    //        StringTokenizer st = new StringTokenizer(labeledResult, "\n");

    String[] lines = labeledResult.split("\n");

    String currentTag = null;
    String s2 = null;
    String lastTag = null;
    String lastPlainTag = null;

    int p = 0; // index in the results' tokenization (st)
    int blockIndex = 0;

    BibDataSet bib = null;

    DocumentPointer pointerA = null;
    //        DocumentPointer pointerB = null;
    DocumentPointer currentPointer;
    DocumentPointer lastPointer = null;

    for (String line : lines) {
      //        while (st.hasMoreTokens()) {

      for (; blockIndex < doc.getBlocks().size() - 1; blockIndex++) {
        //                int startTok = doc.getBlocks().get(blockIndex).getStartToken();
        int endTok = doc.getBlocks().get(blockIndex).getEndToken();

        if (endTok >= p) {
          break;
        }
      }

      ArrayList<String> localFeatures = new ArrayList<String>();
      boolean addSpace = false;

      //            String tok = st.nextToken().trim();
      line = line.trim();

      StringTokenizer stt = new StringTokenizer(line, "\t");
      int j = 0;

      boolean newLine = false;
      int ll = stt.countTokens();
      while (stt.hasMoreTokens()) {
        String s = stt.nextToken().trim();
        if (j == 0) {
          s2 = s;
          boolean strop = false;
          while ((!strop) && (p < tokenizations.size())) {
            String tokOriginal = tokenizations.get(p);
            if (tokOriginal.equals(" ")
                | tokOriginal.equals("\n")
                | tokOriginal.equals("\r")
                | tokOriginal.equals("\t")) {
              addSpace = true;
              p++;
            } else if (tokOriginal.equals("")) {
              p++;
            } else // if (tokOriginal.equals(s))
            {
              strop = true;
            }
          }
        } else if (j == ll - 1) {
          currentTag = s; // current tag
        } else {
          if (s.equals("LINESTART")) {
            newLine = true;
          }
          localFeatures.add(s);
        }
        j++;
      }

      if (lastTag != null) {
        if (lastTag.startsWith("I-")) {
          lastPlainTag = lastTag.substring(2, lastTag.length());
        } else {
          lastPlainTag = lastTag;
        }
      }

      String currentPlainTag = null;
      if (currentTag != null) {
        if (currentTag.startsWith("I-")) {
          currentPlainTag = currentTag.substring(2, currentTag.length());
        } else {
          currentPlainTag = currentTag;
        }
      }

      currentPointer = new DocumentPointer(doc, blockIndex, p);

      if (lastPlainTag != null
          && !currentPlainTag.equals(lastPlainTag)
          && lastPlainTag.equals("<references>")) {
        blockReferences.add(new DocumentPiece(pointerA, lastPointer));
        pointerA = currentPointer;
      }

      if (currentPlainTag.equals("<header>")) {
        if (!blockDocumentHeaders.contains(blockIndex)) {
          blockDocumentHeaders.add(blockIndex);
          // System.out.println("add block header: " + blockIndexInteger.intValue());
        }

      } else if (currentPlainTag.equals(
          "<references>")) { //                    if (!blockReferences.contains(blockIndex)) {
        //                        blockReferences.add(blockIndex);
        //                        //System.out.println("add block reference: " +
        // blockIndexInteger.intValue());
        //                    }

        if (currentTag.equals("I-<references>")) {
          pointerA = new DocumentPointer(doc, blockIndex, p);
          if (bib != null) {
            if (bib.getRawBib() != null) {
              doc.getBibDataSets().add(bib);
              bib = new BibDataSet();
            }
          } else {
            bib = new BibDataSet();
          }
          bib.setRawBib(s2);
        } else {
          if (addSpace) {
            if (bib == null) {
              bib = new BibDataSet();
              bib.setRawBib(" " + s2);
            } else {
              bib.setRawBib(bib.getRawBib() + " " + s2);
            }
          } else {
            if (bib == null) {
              bib = new BibDataSet();
              bib.setRawBib(s2);
            } else {
              bib.setRawBib(bib.getRawBib() + s2);
            }
          }
        }

        //                case "<reference_marker>":
        //                    if (!blockReferences.contains(blockIndex)) {
        //                        blockReferences.add(blockIndex);
        //                        //System.out.println("add block reference: " +
        // blockIndexInteger.intValue());
        //                    }
        //
        //                    if (currentTag.equals("I-<reference_marker>")) {
        //                        if (bib != null) {
        //                            if (bib.getRefSymbol() != null) {
        //                                doc.getBibDataSets().add(bib);
        //                                bib = new BibDataSet();
        //                            }
        //                        } else {
        //                            bib = new BibDataSet();
        //                        }
        //                        bib.setRefSymbol(s2);
        //                    } else {
        //                        if (addSpace) {
        //                            if (bib == null) {
        //                                bib = new BibDataSet();
        //                                bib.setRefSymbol(s2);
        //                            } else {
        //                                bib.setRefSymbol(bib.getRefSymbol() + " " + s2);
        //                            }
        //                        } else {
        //                            if (bib == null) {
        //                                bib = new BibDataSet();
        //                                bib.setRefSymbol(s2);
        //                            } else {
        //                                bib.setRefSymbol(bib.getRefSymbol() + s2);
        //                            }
        //                        }
        //                    }
        //                    break;
      } else if (currentPlainTag.equals("<page_footnote>")) {
        if (!blockFooters.contains(blockIndex)) {
          blockFooters.add(blockIndex);
          // System.out.println("add block foot note: " + blockIndexInteger.intValue());
        }

      } else if (currentPlainTag.equals("<page_header>")) {
        if (!blockHeaders.contains(blockIndex)) {
          blockHeaders.add(blockIndex);
          // System.out.println("add block page header: " + blockIndexInteger.intValue());
        }

      } else if (currentPlainTag.equals("<section>")) {
        if (!blockSectionTitles.contains(blockIndex)) {
          blockSectionTitles.add(blockIndex);
          // System.out.println("add block page header: " + blockIndexInteger.intValue());
        }
      }

      lastTag = currentTag;
      p++;
      lastPointer = currentPointer;
    }

    if (bib != null) {
      doc.getBibDataSets().add(bib);
    }

    if (!lastPointer.equals(pointerA)) {
      if (lastPlainTag.equals("<references>")) {
        blockReferences.add(new DocumentPiece(pointerA, lastPointer));
      }
    }

    doc.setBlockHeaders(blockHeaders);
    doc.setBlockFooters(blockFooters);
    doc.setBlockDocumentHeaders(blockDocumentHeaders);
    doc.setBlockReferences(blockReferences);
    doc.setBlockSectionTitles(blockSectionTitles);

    return doc;
  }
Пример #4
0
  public static Document generalResultSegmentation(
      Document doc, String labeledResult, List<LayoutToken> documentTokens) {
    List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);

    SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create();
    doc.setLabeledBlocks(labeledBlocks);

    /*try {
          	FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult);
    	FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString());
    }
    catch(Exception e) {
    	e.printStackTrace();
    }*/

    List<Block> docBlocks = doc.getBlocks();
    int indexLine = 0;
    int blockIndex = 0;
    int p = 0; // position in the labeled result
    int currentLineEndPos = 0; // position in the global doc. tokenization of the last
    // token of the current line
    int currentLineStartPos = 0; // position in the global doc.
    // tokenization of the first token of the current line
    String line = null;

    DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
    DocumentPointer currentPointer = null;
    DocumentPointer lastPointer = null;

    String curLabel;
    String curPlainLabel = null;
    String lastPlainLabel = null;

    int lastTokenInd = -1;
    for (int i = docBlocks.size() - 1; i >= 0; i--) {
      int endToken = docBlocks.get(i).getEndToken();
      if (endToken != -1) {
        lastTokenInd = endToken;
        break;
      }
    }

    // we do this concatenation trick so that we don't have to process stuff after the main loop
    // no copying of lists happens because of this, so it's ok to concatenate
    String ignoredLabel = "@IGNORED_LABEL@";
    for (Pair<String, String> labeledTokenPair :
        Iterables.concat(
            labeledTokens,
            Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) {
      if (labeledTokenPair == null) {
        p++;
        continue;
      }

      // as we process the document segmentation line by line, we don't use the usual
      // tokenization to rebuild the text flow, but we get each line again from the
      // text stored in the document blocks (similarly as when generating the features)
      line = null;
      while ((line == null) && (blockIndex < docBlocks.size())) {
        Block block = docBlocks.get(blockIndex);
        List<LayoutToken> tokens = block.getTokens();
        String localText = block.getText();
        if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        }
        String[] lines = localText.split("[\\n\\r]");
        if ((lines.length == 0) || (indexLine >= lines.length)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        } else {
          line = lines[indexLine];
          indexLine++;
          if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) {
            line = null;
            continue;
          }

          if (currentLineStartPos > lastTokenInd) continue;

          // adjust the start token position in documentTokens to this non trivial line
          // first skip possible space characters and tabs at the beginning of the line
          while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                  || documentTokens.get(currentLineStartPos).t().equals("\t"))
              && (currentLineStartPos != lastTokenInd)) {
            currentLineStartPos++;
          }
          if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) {
            while (currentLineStartPos < block.getEndToken()) {
              if (documentTokens.get(currentLineStartPos).t().equals("\n")
                  || documentTokens.get(currentLineStartPos).t().equals("\r")) {
                // move to the start of the next line, but ignore space characters and tabs
                currentLineStartPos++;
                while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                        || documentTokens.get(currentLineStartPos).t().equals("\t"))
                    && (currentLineStartPos != lastTokenInd)) {
                  currentLineStartPos++;
                }
                if ((currentLineStartPos != lastTokenInd)
                    && labeledTokenPair.a.startsWith(
                        documentTokens.get(currentLineStartPos).getText())) {
                  break;
                }
              }
              currentLineStartPos++;
            }
          }

          // what is then the position of the last token of this line?
          currentLineEndPos = currentLineStartPos;
          while (currentLineEndPos < block.getEndToken()) {
            if (documentTokens.get(currentLineEndPos).t().equals("\n")
                || documentTokens.get(currentLineEndPos).t().equals("\r")) {
              currentLineEndPos--;
              break;
            }
            currentLineEndPos++;
          }
        }
      }
      curLabel = labeledTokenPair.b;
      curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel);

      /*System.out.println("-------------------------------");
      System.out.println("block: " + blockIndex);
      System.out.println("line: " + line);
      System.out.println("token: " + labeledTokenPair.a);
      System.out.println("curPlainLabel: " + curPlainLabel);
      System.out.println("lastPlainLabel: " + lastPlainLabel);
      if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1))
      	System.out.println("currentLineStartPos: " + currentLineStartPos +
      								" (" + documentTokens.get(currentLineStartPos) + ")");
      if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1))
      	System.out.println("currentLineEndPos: " + currentLineEndPos +
      								" (" + documentTokens.get(currentLineEndPos) + ")");*/

      if (blockIndex == docBlocks.size()) {
        break;
      }

      currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos);

      // either a new entity starts or a new beginning of the same type of entity
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
        }
        pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos);
        // System.out.println("add segment for: " + lastPlainLabel + ", until " +
        // (currentLineStartPos-2));
      }

      // updating stuff for next iteration
      lastPlainLabel = curPlainLabel;
      lastPointer = currentPointer;
      currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line
      p++;
    }

    if (blockIndex == docBlocks.size()) {
      // the last labelled piece has still to be added
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
          // System.out.println("add segment for: " + lastPlainLabel + ", until " +
          // (currentLineStartPos-2));
        }
      }
    }

    return doc;
  }
Пример #5
0
  /**
   * First pass to detect basic structures: remove page header/footer, identify section numbering,
   * identify Figure and table blocks.
   *
   * <p>-> to be removed at some point!
   *
   * @param doc a document
   */
  public static void firstPass(Document doc) {
    if (doc == null) {
      throw new NullPointerException();
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException();
    }

    int i = 0;
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();
    List<Integer> acknowledgementBlocks = new ArrayList<Integer>();
    List<Integer> blockTables = new ArrayList<Integer>();
    List<Integer> blockFigures = new ArrayList<Integer>();
    List<Integer> blockHeadTables = new ArrayList<Integer>();
    List<Integer> blockHeadFigures = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();

    doc.setTitleMatchNum(false);
    try {
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText);
        Matcher ma2 = BasicStructureBuilder.references.matcher(localText);

        if ((ma1.find()) || (ma2.find())) {
          if (((localText.startsWith("1.")) || (localText.startsWith("1 ")))
              || ((localText.startsWith("2.")) || (localText.startsWith("2 ")))
              || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true);
          // System.out.println("Title section identified: block " + i + ", " + localText);
          blockSectionTitles.add(i);
        } else {
          StringTokenizer st = new StringTokenizer(localText, "\n");
          while (st.hasMoreTokens()) {
            String token = st.nextToken();

            if (token.startsWith("@PAGE")) {
              // current block should give the header/footors
              if (i > 4) {
                if (doc.getBlocks().get(i - 5).getNbTokens() < 20) {
                  Integer i2 = i - 5;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 3) {
                if (doc.getBlocks().get(i - 4).getNbTokens() < 20) {
                  Integer i2 = i - 4;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 2) {
                if (doc.getBlocks().get(i - 3).getNbTokens() < 20) {
                  Integer i2 = i - 3;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 1) {
                if (doc.getBlocks().get(i - 2).getNbTokens() < 20) {
                  Integer i2 = i - 2;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 0) {
                if (doc.getBlocks().get(i - 1).getNbTokens() < 20) {
                  Integer i2 = i - 1;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              blockFooters.add(i);

              // page header candidates
              blockHeaders.add(i);
              if (i < doc.getBlocks().size() - 1) {
                if (doc.getBlocks().get(i + 1).getNbTokens() < 20) {
                  Integer i2 = i + 1;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1);
                }
              }
              if (i < doc.getBlocks().size() - 2) {
                if (doc.getBlocks().get(i + 2).getNbTokens() < 20) {
                  Integer i2 = i + 2;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2);
                }
              }
              if (i < doc.getBlocks().size() - 3) {
                if (doc.getBlocks().get(i + 3).getNbTokens() < 20) {
                  Integer i2 = i + 3;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3);
                }
              }
              if (i < doc.getBlocks().size() - 4) {
                if (doc.getBlocks().get(i + 4).getNbTokens() < 20) {
                  Integer i2 = i + 4;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4);
                }
              }
              // more ??
            }
          }
        }

        // clustering of blocks per font (for section header and figure/table detections)
        addBlockToCluster(i, doc);

        i++;
      }

      // try to find the cluster of section titles
      Cluster candidateCluster = null;
      // System.out.println("nb clusters: " + clusters.size());
      for (Cluster cluster : doc.getClusters()) {
        if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5))
            && (cluster.getNbBlocks() < 20)) {
          List<Integer> blo = cluster.getBlocks2();
          for (Integer b : blo) {
            if (blockSectionTitles.contains(b)) {
              if (candidateCluster == null) {
                candidateCluster = cluster;
                break;
              }
              // else if (cluster.getFontSize() >= candidateCluster.getFontSize())
              //	candidateCluster = cluster;
            }
          }
        }
      }
      if (candidateCluster != null) {
        List<Integer> newBlockSectionTitles = new ArrayList<Integer>();
        for (Integer bl : blockSectionTitles) {
          if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
        }

        List<Integer> blockClusterTitles = candidateCluster.getBlocks2();
        if (blockClusterTitles.size() < 20) {
          for (Integer bl : blockClusterTitles) {
            if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
          }
        }

        blockSectionTitles = newBlockSectionTitles;
      }

      // aknowledgement section recognition
      boolean ackn = false;
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // System.out.println(i + ": " + localText+"\n");

        Integer iii = i;
        Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText);
        if ((m3.find()) && (blockSectionTitles.contains(iii))) {
          acknowledgementBlocks.add(iii);
          ackn = true;
          // int index = blockSectionTitles.indexOf(iii);
          // blockSectionTitles.remove(index);
        } else if ((ackn) && (blockSectionTitles.contains(iii))) {
          ackn = false;
          break;
        } else if (ackn) {
          Matcher m4 = BasicStructureBuilder.references.matcher(localText);
          if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) {
            acknowledgementBlocks.add(iii);
          } else if (m4.find()) {
            ackn = false;
            break;
          }
        }
        i++;
      }

      // we remove references headers in blockSectionTitles
      int index = -1;
      for (Integer ii : blockSectionTitles) {
        Block block = doc.getBlocks().get(ii);
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();
        Matcher m4 = BasicStructureBuilder.references.matcher(localText);
        if (m4.find()) {
          index = blockSectionTitles.indexOf(ii);
          break;
        }
      }
      if (index != -1) {
        blockSectionTitles.remove(index);
      }

      // we check headers repetition from page to page to decide if it is an header or not
      ArrayList<Integer> toRemove = new ArrayList<Integer>();
      for (Integer ii : blockHeaders) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("header candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockHeaders) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                // System.out.println("dist with " + localText2 + " : " + dist);
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockHeaders.remove(ii);
      }

      // same for footers
      toRemove = new ArrayList<Integer>();
      for (Integer ii : blockFooters) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("footer candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockFooters) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockFooters.remove(ii);
      }

      // a special step for added banner repositoryies such HAL
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // HAL
        if (localText.startsWith("Author manuscript, published in")) {
          Double y = block.getY();
          // System.out.println("HAL banner found, " + "block " + i + ", y = " + y);
          if (Math.abs(y - 12.538) < 2) { // reference position
            // blockHeaders.add(new Integer(i));
            blockDocumentHeaders.add(i);
            // System.out.println("HAL banner added as header block");
            break;
          }
        }

        // ACM publications
        // System.out.println("test ACM " + i);
        // System.out.println(localText);
        if (localText.startsWith("Permission to make digital or hard copies")) {
          blockFooters.add(i);
          break;
        }

        // arXiv, etc. put here
        // IOP

        if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) {
          blockDocumentHeaders.add(i);
          // System.out.println("IOP banner added as header block");
          break;
        }
        i++;
      }

      // we try to recognize here table and figure blocks
      // the idea is that the textual elements are not located as the normal text blocks
      // this is recognized by exploiting the cluster of blocks starting up and down front the block
      // containing a table or a figure marker
      // two different runs, one for figures and one for tables (everything could be done in one
      // step)
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher m = BasicStructureBuilder.figure.matcher(localText);
        Matcher m2 = BasicStructureBuilder.table.matcher(localText);

        double width = block.getWidth();
        boolean bold = block.getBold();

        // table
        // if ( (m2.find()) && (localText.length() < 200) ) {
        if ((m2.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadTables.contains(i)) {
            blockHeadTables.add(i);
          }
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        // figure
        // else if ( (m.find()) && (localText.length() < 200) ) {
        else if ((m.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i);
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          boolean imageFound = false;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);

            if (b.getText() != null) {
              String localText2 = b.getText().trim();
              // localText = localText.replace("\n", " ");
              localText2 = localText2.replace("  ", " ");
              localText2 = localText2.trim();

              if ((localText2.startsWith("@IMAGE")) && (!imageFound)) {
                // System.out.println(localText2);
                block.setText(block.getText() + " " + localText2);
                // System.out.println(block.getText());
                imageFound = true;
              }

              if ((localText2.length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().trim().length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        i++;
      }
    } finally {
      doc.setBlockHeaders(blockHeaders);
      doc.setBlockFooters(blockFooters);
      doc.setBlockSectionTitles(blockSectionTitles);
      doc.setAcknowledgementBlocks(acknowledgementBlocks);
      doc.setBlockTables(blockTables);
      doc.setBlockFigures(blockFigures);
      doc.setBlockHeadTables(blockHeadTables);
      doc.setBlockHeadFigures(blockHeadFigures);
      doc.setBlockDocumentHeaders(blockDocumentHeaders);
    }
  }