Java Document.getBlocks примеры использования

Язык программирования: Java

Класс/Тип: Document

Метод/Функция: getBlocks

Примеров на hotexamples.com: 5

Java Document.getBlocks - 5 примеров найдено. Это лучшие примеры Java кода для Document.getBlocks, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

createElement(30)

createTextNode(30)

select(30)

get(30)

insertString(30)

getTextLength(30)

getCharsSequence(30)

getText(30)

appendChild(30)

getRootElement(30)

getDocumentElement(30)

getLineStartOffset(30)

add(30)

getElementsByTagName(30)

getLineEndOffset(29)

getLength(23)

getLineNumber(22)

getLineCount(19)

createRangeMarker(18)

createElementNS(16)

getContent(15)

getElementById(15)

importNode(12)

text(12)

body(11)

save(11)

replaceString(10)

deleteString(10)

remove(10)

createDivElement(9)

getDefaultRootElement(8)

check(8)

getId(8)

replaceItemValue(7)

open(7)

getGamma(7)

close(7)

getNumberOfTypes(7)

getNumberOfTopics(7)

getNumberOfTokens(7)

getFirstChild(7)

getChildNodes(7)

getHtml(6)

html(6)

getBody(6)

getName(6)

getSentence(6)

annotate(6)

toString(6)

setRootElement(6)

Пример #1

0

Показать файл

Файл: BasicStructureBuilder.java Проект: Immortalin/grobid

  /**
   * Assigns the block at index {@code b} to every existing cluster whose font name, bold flag,
   * italic flag and font size all equal those of the block; when no cluster matches, a new
   * cluster carrying the block's features is created and appended to the document's cluster list.
   *
   * <p>A {@code null} font name (on the block or on a cluster) is treated as {@code "unknown"}.
   * The document's cluster list is created here if it is still {@code null}.
   *
   * <p>-> not used at this stage, but could be an interesting feature in the full text model in the
   * future
   *
   * @param b index of the block in {@code doc.getBlocks()}
   * @param doc a document
   */
  private static void addBlockToCluster(Integer b, Document doc) {
    // get block features
    Block block = doc.getBlocks().get(b);
    String font = block.getFont();
    boolean bold = block.getBold();
    boolean italic = block.getItalic();
    double fontSize = block.getFontSize();
    if (font == null) {
      font = "unknown";
    }

    if (doc.getClusters() == null) {
      doc.setClusters(new ArrayList<Cluster>());
    }

    boolean found = false;
    for (Cluster cluster : doc.getClusters()) {
      String clusterFont = cluster.getFont();
      if (clusterFont == null) {
        clusterFont = "unknown";
      }
      // Short-circuit && throughout (the previous mix of && and & evaluated every operand).
      // Font sizes are compared with exact double equality, as before.
      if (font.equals(clusterFont)
          && bold == cluster.getBold()
          && italic == cluster.getItalic()
          && fontSize == cluster.getFontSize()) {
        cluster.addBlock2(b);
        found = true;
      }
    }

    if (!found) {
      Cluster cluster = new Cluster();
      cluster.setFont(font);
      cluster.setBold(bold);
      cluster.setItalic(italic);
      cluster.setFontSize(fontSize);
      cluster.addBlock2(b);
      doc.getClusters().add(cluster);
    }
  }

Пример #2

0

Показать файл

Файл: BasicStructureBuilder.java Проект: Immortalin/grobid

  /**
   * Filter out line numbering possibly present in the document. This can be frequent for document
   * in a review/submission format and degrades strongly the machine learning extraction results.
   *
   * <p>Detection looks at no more than the first six blocks. The first number matched by
   * {@code startNum} (or, failing that, {@code endNum}) is remembered; numbering is considered
   * found as soon as a later inspected block yields that number plus one. When numbering is
   * found, every block whose matched number equals the running counter (starting at 1) has the
   * number cut from its text and the corresponding first/last token removed.
   *
   * <p>-> Not used !
   *
   * @param doc a document; block texts and token lists are modified in place when numbering is
   *     found
   * @return if found numbering
   */
  public boolean filterLineNumber(Document doc) {
    // we first test if we have a line numbering by checking if we have an increasing integer
    // at the begin or the end of each block
    boolean numberBeginLine = false;
    boolean numberEndLine = false;
    boolean foundNumbering = false;

    // currentNumber is deliberately not reset per block: a block without a number keeps the
    // value parsed from an earlier block (unchanged from the previous implementation).
    int currentNumber = -1;
    // lastNumber is only assigned while it is still -1, i.e. it holds the first number seen.
    int lastNumber = -1;
    int inspected = 0;
    for (Block block : doc.getBlocks()) {
      String localText = block.getText();
      List<LayoutToken> tokens = block.tokens;

      if ((localText != null) && (tokens != null) && !tokens.isEmpty()) {
        localText = localText.trim();

        Matcher ma1 = startNum.matcher(localText);
        Matcher ma2 = endNum.matcher(localText);

        if (ma1.find()) {
          try {
            currentNumber = Integer.parseInt(ma1.group(0));
            numberBeginLine = true;
          } catch (NumberFormatException e) {
            currentNumber = -1;
          }
        } else if (ma2.find()) {
          try {
            currentNumber = Integer.parseInt(ma2.group(0));
            numberEndLine = true;
          } catch (NumberFormatException e) {
            currentNumber = -1;
          }
        }

        if (lastNumber != -1) {
          if (currentNumber == lastNumber + 1) {
            foundNumbering = true;
            break;
          }
        } else {
          lastNumber = currentNumber;
        }
      }

      inspected++;
      if (inspected > 5) {
        break;
      }
    }

    if (!foundNumbering) {
      return false;
    }

    // we have a line numbering, so we filter them
    int counter = 1; // we start at 1, if the actual start is 0,
    // it will remain (as it is negligible)

    for (Block block : doc.getBlocks()) {
      String localText = block.getText();
      List<LayoutToken> tokens = block.tokens;

      // Same guards as in the detection pass; tokens was previously dereferenced unchecked here.
      if ((localText == null) || (tokens == null) || tokens.isEmpty()) {
        continue;
      }

      if (numberEndLine) {
        Matcher ma2 = endNum.matcher(localText);
        if (ma2.find()) {
          String groupStr = ma2.group(0);
          if (groupStr.trim().equals(String.valueOf(counter))) {
            localText = localText.substring(0, localText.length() - groupStr.length());
            block.setText(localText);
            tokens.remove(tokens.size() - 1);
            counter++;
          }
        }
      } else if (numberBeginLine) {
        // Leading numbers must be matched with startNum (endNum was used here by mistake).
        Matcher ma1 = startNum.matcher(localText);
        if (ma1.find()) {
          String groupStr = ma1.group(0);
          if (groupStr.trim().equals(String.valueOf(counter))) {
            // Drop only the leading number. The former end index of length() - 1 also cut the
            // last character and threw when the text consisted of the number alone.
            // NOTE(review): assumes startNum is anchored at the start of the text - confirm.
            localText = localText.substring(groupStr.length());
            block.setText(localText);
            tokens.remove(0);
            counter++;
          }
        }
      }
    }

    return true;
  }

Пример #3

0

Показать файл

Файл: BasicStructureBuilder.java Проект: Immortalin/grobid

  /**
   * Set the main segments of the document based on the full text parsing results.
   *
   * <p>{@code labeledResult} is read line by line (split on {@code "\n"}). Each line is split on
   * tabs: the first field is the token text, the last field is the tag, and the fields in between
   * are features. A leading {@code "I-"} on a tag is stripped to obtain the plain tag. Depending
   * on the plain tag, the index of the current block is recorded as document header, page
   * footnote, page header or section title, and {@code <references>} tokens are accumulated into
   * {@link BibDataSet} raw strings and {@link DocumentPiece} ranges.
   *
   * <p>Side effects on {@code doc}: its bibliographical data sets are replaced by a fresh list,
   * and its block headers, block footers, block document headers, block references and block
   * section titles are overwritten with the lists built here.
   *
   * @param doc a document
   * @param labeledResult string
   * @param tokenizations tokens
   * @return a document (the same instance as {@code doc})
   * @throws NullPointerException if {@code doc} or its block list is {@code null}
   */
  public static Document resultSegmentation(
      Document doc, String labeledResult, List<String> tokenizations) {
    if (doc == null) {
      throw new NullPointerException("Document is null");
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException("Blocks of the documents are null");
    }
    // System.out.println(tokenizations.toString());
    //        int i = 0;
    //        boolean first = true;
    // Block indices collected per segment type.
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();

    // Reference zones are kept as ranges of document pointers rather than block indices.
    SortedSet<DocumentPiece> blockReferences = new TreeSet<DocumentPiece>();

    doc.setBibDataSets(new ArrayList<BibDataSet>());

    //        StringTokenizer st = new StringTokenizer(labeledResult, "\n");

    String[] lines = labeledResult.split("\n");

    String currentTag = null; // tag of the current line, possibly prefixed with "I-"
    String s2 = null; // token text (first field) of the current line
    String lastTag = null;
    String lastPlainTag = null;

    int p = 0; // index in the results' tokenization (st)
    int blockIndex = 0;

    BibDataSet bib = null; // reference currently being accumulated

    DocumentPointer pointerA = null; // start pointer of the current reference range
    //        DocumentPointer pointerB = null;
    DocumentPointer currentPointer;
    DocumentPointer lastPointer = null;

    for (String line : lines) {
      //        while (st.hasMoreTokens()) {

      // Advance blockIndex to the first block whose end token is not before p; because of the
      // "size() - 1" bound it never moves past the last block.
      for (; blockIndex < doc.getBlocks().size() - 1; blockIndex++) {
        //                int startTok = doc.getBlocks().get(blockIndex).getStartToken();
        int endTok = doc.getBlocks().get(blockIndex).getEndToken();

        if (endTok >= p) {
          break;
        }
      }

      // NOTE(review): localFeatures is filled below but never read in this method.
      ArrayList<String> localFeatures = new ArrayList<String>();
      boolean addSpace = false;

      //            String tok = st.nextToken().trim();
      line = line.trim();

      StringTokenizer stt = new StringTokenizer(line, "\t");
      int j = 0;

      // NOTE(review): newLine is set below but never read in this method.
      boolean newLine = false;
      int ll = stt.countTokens();
      while (stt.hasMoreTokens()) {
        String s = stt.nextToken().trim();
        if (j == 0) {
          // First field: the token text. Skip whitespace and empty entries in tokenizations so
          // that p ends up on the next non-blank original token; addSpace records that at least
          // one whitespace token was skipped.
          s2 = s;
          boolean strop = false;
          while ((!strop) && (p < tokenizations.size())) {
            String tokOriginal = tokenizations.get(p);
            if (tokOriginal.equals(" ")
                | tokOriginal.equals("\n")
                | tokOriginal.equals("\r")
                | tokOriginal.equals("\t")) {
              addSpace = true;
              p++;
            } else if (tokOriginal.equals("")) {
              p++;
            } else // if (tokOriginal.equals(s))
            {
              strop = true;
            }
          }
        } else if (j == ll - 1) {
          // Last field (only reached when the line has at least two fields): the tag.
          currentTag = s; // current tag
        } else {
          if (s.equals("LINESTART")) {
            newLine = true;
          }
          localFeatures.add(s);
        }
        j++;
      }

      // Strip the "I-" prefix to get the plain tags.
      if (lastTag != null) {
        if (lastTag.startsWith("I-")) {
          lastPlainTag = lastTag.substring(2, lastTag.length());
        } else {
          lastPlainTag = lastTag;
        }
      }

      // NOTE(review): currentTag keeps its previous value when a line has fewer than two
      // fields; if that happens on the very first line it is still null, currentPlainTag stays
      // null and the equals() calls below throw a NullPointerException.
      String currentPlainTag = null;
      if (currentTag != null) {
        if (currentTag.startsWith("I-")) {
          currentPlainTag = currentTag.substring(2, currentTag.length());
        } else {
          currentPlainTag = currentTag;
        }
      }

      currentPointer = new DocumentPointer(doc, blockIndex, p);

      // Leaving a <references> zone: close the range opened at pointerA.
      if (lastPlainTag != null
          && !currentPlainTag.equals(lastPlainTag)
          && lastPlainTag.equals("<references>")) {
        blockReferences.add(new DocumentPiece(pointerA, lastPointer));
        pointerA = currentPointer;
      }

      if (currentPlainTag.equals("<header>")) {
        if (!blockDocumentHeaders.contains(blockIndex)) {
          blockDocumentHeaders.add(blockIndex);
          // System.out.println("add block header: " + blockIndexInteger.intValue());
        }

      } else if (currentPlainTag.equals(
          "<references>")) { //                    if (!blockReferences.contains(blockIndex)) {
        //                        blockReferences.add(blockIndex);
        //                        //System.out.println("add block reference: " +
        // blockIndexInteger.intValue());
        //                    }

        if (currentTag.equals("I-<references>")) {
          // Start of a new reference: flush the previous one if it has raw text, then start
          // the raw string with the current token.
          pointerA = new DocumentPointer(doc, blockIndex, p);
          if (bib != null) {
            if (bib.getRawBib() != null) {
              doc.getBibDataSets().add(bib);
              bib = new BibDataSet();
            }
          } else {
            bib = new BibDataSet();
          }
          bib.setRawBib(s2);
        } else {
          // Continuation of the current reference: append the token, preceded by a space when
          // whitespace was skipped in the original tokenization.
          if (addSpace) {
            if (bib == null) {
              bib = new BibDataSet();
              bib.setRawBib(" " + s2);
            } else {
              bib.setRawBib(bib.getRawBib() + " " + s2);
            }
          } else {
            if (bib == null) {
              bib = new BibDataSet();
              bib.setRawBib(s2);
            } else {
              bib.setRawBib(bib.getRawBib() + s2);
            }
          }
        }

        // Disabled handling of a separate <reference_marker> tag, kept for reference:
        //                case "<reference_marker>":
        //                    if (!blockReferences.contains(blockIndex)) {
        //                        blockReferences.add(blockIndex);
        //                        //System.out.println("add block reference: " +
        // blockIndexInteger.intValue());
        //                    }
        //
        //                    if (currentTag.equals("I-<reference_marker>")) {
        //                        if (bib != null) {
        //                            if (bib.getRefSymbol() != null) {
        //                                doc.getBibDataSets().add(bib);
        //                                bib = new BibDataSet();
        //                            }
        //                        } else {
        //                            bib = new BibDataSet();
        //                        }
        //                        bib.setRefSymbol(s2);
        //                    } else {
        //                        if (addSpace) {
        //                            if (bib == null) {
        //                                bib = new BibDataSet();
        //                                bib.setRefSymbol(s2);
        //                            } else {
        //                                bib.setRefSymbol(bib.getRefSymbol() + " " + s2);
        //                            }
        //                        } else {
        //                            if (bib == null) {
        //                                bib = new BibDataSet();
        //                                bib.setRefSymbol(s2);
        //                            } else {
        //                                bib.setRefSymbol(bib.getRefSymbol() + s2);
        //                            }
        //                        }
        //                    }
        //                    break;
      } else if (currentPlainTag.equals("<page_footnote>")) {
        if (!blockFooters.contains(blockIndex)) {
          blockFooters.add(blockIndex);
          // System.out.println("add block foot note: " + blockIndexInteger.intValue());
        }

      } else if (currentPlainTag.equals("<page_header>")) {
        if (!blockHeaders.contains(blockIndex)) {
          blockHeaders.add(blockIndex);
          // System.out.println("add block page header: " + blockIndexInteger.intValue());
        }

      } else if (currentPlainTag.equals("<section>")) {
        if (!blockSectionTitles.contains(blockIndex)) {
          blockSectionTitles.add(blockIndex);
          // System.out.println("add block page header: " + blockIndexInteger.intValue());
        }
      }

      // Prepare the next iteration: the current token has been consumed.
      lastTag = currentTag;
      p++;
      lastPointer = currentPointer;
    }

    // Flush the reference still being accumulated, if any.
    if (bib != null) {
      doc.getBibDataSets().add(bib);
    }

    // Close a reference range that is still open at the end of the input.
    // NOTE(review): lastPointer is null here when the loop never ran (String.split returns an
    // empty array for input made only of "\n"), which would throw a NullPointerException.
    if (!lastPointer.equals(pointerA)) {
      if (lastPlainTag.equals("<references>")) {
        blockReferences.add(new DocumentPiece(pointerA, lastPointer));
      }
    }

    doc.setBlockHeaders(blockHeaders);
    doc.setBlockFooters(blockFooters);
    doc.setBlockDocumentHeaders(blockDocumentHeaders);
    doc.setBlockReferences(blockReferences);
    doc.setBlockSectionTitles(blockSectionTitles);

    return doc;
  }

Пример #4

0

Показать файл

Файл: BasicStructureBuilder.java Проект: Immortalin/grobid

  /**
   * Builds the labeled segments of a document from a line-level labeling result.
   *
   * <p>Each (token, label) pair obtained from {@code labeledResult} is aligned with the next
   * non-trivial text line found in the document blocks. Whenever the plain label changes, the
   * range covered by the previous label is stored as a {@link DocumentPiece} under that label in
   * a sorted multimap, which is installed on the document via {@code setLabeledBlocks} before
   * the loop starts and filled in as the loop proceeds.
   *
   * @param doc the document whose blocks are walked and whose labeled blocks are set
   * @param labeledResult the labeling output, parsed by
   *     {@code GenericTaggerUtils.getTokensAndLabels}
   * @param documentTokens the layout tokens of the whole document, indexed by the start/end
   *     token positions of the blocks
   * @return the same {@code doc} instance
   */
  public static Document generalResultSegmentation(
      Document doc, String labeledResult, List<LayoutToken> documentTokens) {
    List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);

    SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create();
    doc.setLabeledBlocks(labeledBlocks);

    /*try {
          	FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult);
    	FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString());
    }
    catch(Exception e) {
    	e.printStackTrace();
    }*/

    List<Block> docBlocks = doc.getBlocks();
    int indexLine = 0; // index of the next line to read inside the current block's text
    int blockIndex = 0;
    // NOTE(review): p is incremented but never read anywhere else in this method.
    int p = 0; // position in the labeled result
    int currentLineEndPos = 0; // position in the global doc. tokenization of the last
    // token of the current line
    int currentLineStartPos = 0; // position in the global doc.
    // tokenization of the first token of the current line
    String line = null;

    DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
    DocumentPointer currentPointer = null;
    DocumentPointer lastPointer = null;

    String curLabel;
    String curPlainLabel = null;
    String lastPlainLabel = null;

    // End token of the last block that has one (-1 if no block has an end token other than -1).
    int lastTokenInd = -1;
    for (int i = docBlocks.size() - 1; i >= 0; i--) {
      int endToken = docBlocks.get(i).getEndToken();
      if (endToken != -1) {
        lastTokenInd = endToken;
        break;
      }
    }

    // we do this concatenation trick so that we don't have to process stuff after the main loop
    // no copying of lists happens because of this, so it's ok to concatenate
    String ignoredLabel = "@IGNORED_LABEL@";
    for (Pair<String, String> labeledTokenPair :
        Iterables.concat(
            labeledTokens,
            Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) {
      if (labeledTokenPair == null) {
        p++;
        continue;
      }

      // as we process the document segmentation line by line, we don't use the usual
      // tokenization to rebuild the text flow, but we get each line again from the
      // text stored in the document blocks (similarly as when generating the features)
      line = null;
      while ((line == null) && (blockIndex < docBlocks.size())) {
        Block block = docBlocks.get(blockIndex);
        List<LayoutToken> tokens = block.getTokens();
        String localText = block.getText();
        // Block without tokens or without text: move on to the next block.
        if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        }
        String[] lines = localText.split("[\\n\\r]");
        // All lines of this block consumed: move on to the next block.
        if ((lines.length == 0) || (indexLine >= lines.length)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        } else {
          line = lines[indexLine];
          indexLine++;
          // Blank or filtered lines are skipped; resetting line to null keeps the loop going.
          if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) {
            line = null;
            continue;
          }

          // line is non-null at this point, so this continue ends the while loop without
          // adjusting the positions.
          if (currentLineStartPos > lastTokenInd) continue;

          // adjust the start token position in documentTokens to this non trivial line
          // first skip possible space characters and tabs at the beginning of the line
          // NOTE(review): documentTokens.get(...) is not bounds-checked against
          // documentTokens.size() here or below; only lastTokenInd/getEndToken() limit it.
          while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                  || documentTokens.get(currentLineStartPos).t().equals("\t"))
              && (currentLineStartPos != lastTokenInd)) {
            currentLineStartPos++;
          }
          // The labeled token does not start with the token at the current position: scan
          // forward within the block, line start by line start, until one matches.
          if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) {
            while (currentLineStartPos < block.getEndToken()) {
              if (documentTokens.get(currentLineStartPos).t().equals("\n")
                  || documentTokens.get(currentLineStartPos).t().equals("\r")) {
                // move to the start of the next line, but ignore space characters and tabs
                currentLineStartPos++;
                while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                        || documentTokens.get(currentLineStartPos).t().equals("\t"))
                    && (currentLineStartPos != lastTokenInd)) {
                  currentLineStartPos++;
                }
                if ((currentLineStartPos != lastTokenInd)
                    && labeledTokenPair.a.startsWith(
                        documentTokens.get(currentLineStartPos).getText())) {
                  break;
                }
              }
              currentLineStartPos++;
            }
          }

          // what is then the position of the last token of this line?
          // (the token just before the next end-of-line token, or the block's end token)
          currentLineEndPos = currentLineStartPos;
          while (currentLineEndPos < block.getEndToken()) {
            if (documentTokens.get(currentLineEndPos).t().equals("\n")
                || documentTokens.get(currentLineEndPos).t().equals("\r")) {
              currentLineEndPos--;
              break;
            }
            currentLineEndPos++;
          }
        }
      }
      curLabel = labeledTokenPair.b;
      curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel);

      /*System.out.println("-------------------------------");
      System.out.println("block: " + blockIndex);
      System.out.println("line: " + line);
      System.out.println("token: " + labeledTokenPair.a);
      System.out.println("curPlainLabel: " + curPlainLabel);
      System.out.println("lastPlainLabel: " + lastPlainLabel);
      if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1))
      	System.out.println("currentLineStartPos: " + currentLineStartPos +
      								" (" + documentTokens.get(currentLineStartPos) + ")");
      if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1))
      	System.out.println("currentLineEndPos: " + currentLineEndPos +
      								" (" + documentTokens.get(currentLineEndPos) + ")");*/

      // All blocks have been consumed: stop; the pending segment is handled after the loop.
      if (blockIndex == docBlocks.size()) {
        break;
      }

      currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos);

      // either a new entity starts or a new beginning of the same type of entity
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        // Store the finished segment only if its start does not lie after its end and the
        // start position is set (not -1).
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
        }
        pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos);
        // System.out.println("add segment for: " + lastPlainLabel + ", until " +
        // (currentLineStartPos-2));
      }

      // updating stuff for next iteration
      lastPlainLabel = curPlainLabel;
      lastPointer = currentPointer;
      currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line
      p++;
    }

    // Reached when the loop above stopped because the blocks ran out.
    if (blockIndex == docBlocks.size()) {
      // the last labelled piece has still to be added
      // (only when the label of the pair being processed at the break differs from the
      // previous one)
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
          // System.out.println("add segment for: " + lastPlainLabel + ", until " +
          // (currentLineStartPos-2));
        }
      }
    }

    return doc;
  }

Пример #5

0

Показать файл

Файл: BasicStructureBuilder.java Проект: Immortalin/grobid

  /**
   * First pass to detect basic structures: remove page header/footer, identify section numbering,
   * identify Figure and table blocks.
   *
   * <p>-> to be removed at some point!
   *
   * @param doc a document
   */
  public static void firstPass(Document doc) {
    if (doc == null) {
      throw new NullPointerException();
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException();
    }

    int i = 0;
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();
    List<Integer> acknowledgementBlocks = new ArrayList<Integer>();
    List<Integer> blockTables = new ArrayList<Integer>();
    List<Integer> blockFigures = new ArrayList<Integer>();
    List<Integer> blockHeadTables = new ArrayList<Integer>();
    List<Integer> blockHeadFigures = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();

    doc.setTitleMatchNum(false);
    try {
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText);
        Matcher ma2 = BasicStructureBuilder.references.matcher(localText);

        if ((ma1.find()) || (ma2.find())) {
          if (((localText.startsWith("1.")) || (localText.startsWith("1 ")))
              || ((localText.startsWith("2.")) || (localText.startsWith("2 ")))
              || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true);
          // System.out.println("Title section identified: block " + i + ", " + localText);
          blockSectionTitles.add(i);
        } else {
          StringTokenizer st = new StringTokenizer(localText, "\n");
          while (st.hasMoreTokens()) {
            String token = st.nextToken();

            if (token.startsWith("@PAGE")) {
              // current block should give the header/footors
              if (i > 4) {
                if (doc.getBlocks().get(i - 5).getNbTokens() < 20) {
                  Integer i2 = i - 5;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 3) {
                if (doc.getBlocks().get(i - 4).getNbTokens() < 20) {
                  Integer i2 = i - 4;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 2) {
                if (doc.getBlocks().get(i - 3).getNbTokens() < 20) {
                  Integer i2 = i - 3;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 1) {
                if (doc.getBlocks().get(i - 2).getNbTokens() < 20) {
                  Integer i2 = i - 2;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 0) {
                if (doc.getBlocks().get(i - 1).getNbTokens() < 20) {
                  Integer i2 = i - 1;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              blockFooters.add(i);

              // page header candidates
              blockHeaders.add(i);
              if (i < doc.getBlocks().size() - 1) {
                if (doc.getBlocks().get(i + 1).getNbTokens() < 20) {
                  Integer i2 = i + 1;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1);
                }
              }
              if (i < doc.getBlocks().size() - 2) {
                if (doc.getBlocks().get(i + 2).getNbTokens() < 20) {
                  Integer i2 = i + 2;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2);
                }
              }
              if (i < doc.getBlocks().size() - 3) {
                if (doc.getBlocks().get(i + 3).getNbTokens() < 20) {
                  Integer i2 = i + 3;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3);
                }
              }
              if (i < doc.getBlocks().size() - 4) {
                if (doc.getBlocks().get(i + 4).getNbTokens() < 20) {
                  Integer i2 = i + 4;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4);
                }
              }
              // more ??
            }
          }
        }

        // clustering of blocks per font (for section header and figure/table detections)
        addBlockToCluster(i, doc);

        i++;
      }

      // try to find the cluster of section titles
      Cluster candidateCluster = null;
      // System.out.println("nb clusters: " + clusters.size());
      for (Cluster cluster : doc.getClusters()) {
        if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5))
            && (cluster.getNbBlocks() < 20)) {
          List<Integer> blo = cluster.getBlocks2();
          for (Integer b : blo) {
            if (blockSectionTitles.contains(b)) {
              if (candidateCluster == null) {
                candidateCluster = cluster;
                break;
              }
              // else if (cluster.getFontSize() >= candidateCluster.getFontSize())
              //	candidateCluster = cluster;
            }
          }
        }
      }
      if (candidateCluster != null) {
        List<Integer> newBlockSectionTitles = new ArrayList<Integer>();
        for (Integer bl : blockSectionTitles) {
          if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
        }

        List<Integer> blockClusterTitles = candidateCluster.getBlocks2();
        if (blockClusterTitles.size() < 20) {
          for (Integer bl : blockClusterTitles) {
            if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
          }
        }

        blockSectionTitles = newBlockSectionTitles;
      }

      // aknowledgement section recognition
      boolean ackn = false;
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // System.out.println(i + ": " + localText+"\n");

        Integer iii = i;
        Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText);
        if ((m3.find()) && (blockSectionTitles.contains(iii))) {
          acknowledgementBlocks.add(iii);
          ackn = true;
          // int index = blockSectionTitles.indexOf(iii);
          // blockSectionTitles.remove(index);
        } else if ((ackn) && (blockSectionTitles.contains(iii))) {
          ackn = false;
          break;
        } else if (ackn) {
          Matcher m4 = BasicStructureBuilder.references.matcher(localText);
          if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) {
            acknowledgementBlocks.add(iii);
          } else if (m4.find()) {
            ackn = false;
            break;
          }
        }
        i++;
      }

      // we remove references headers in blockSectionTitles
      int index = -1;
      for (Integer ii : blockSectionTitles) {
        Block block = doc.getBlocks().get(ii);
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();
        Matcher m4 = BasicStructureBuilder.references.matcher(localText);
        if (m4.find()) {
          index = blockSectionTitles.indexOf(ii);
          break;
        }
      }
      if (index != -1) {
        blockSectionTitles.remove(index);
      }

      // we check headers repetition from page to page to decide if it is an header or not
      ArrayList<Integer> toRemove = new ArrayList<Integer>();
      for (Integer ii : blockHeaders) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("header candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockHeaders) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                // System.out.println("dist with " + localText2 + " : " + dist);
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockHeaders.remove(ii);
      }

      // same for footers
      toRemove = new ArrayList<Integer>();
      for (Integer ii : blockFooters) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("footer candidate: " + localText);
          // evaluate distance with other potential footers
          boolean valid = false;
          for (Integer ii2 : blockFooters) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockFooters.remove(ii);
      }

      // a special step for banners added by repositories such as HAL
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // HAL
        if (localText.startsWith("Author manuscript, published in")) {
          Double y = block.getY();
          // System.out.println("HAL banner found, " + "block " + i + ", y = " + y);
          if (Math.abs(y - 12.538) < 2) { // reference position
            // blockHeaders.add(new Integer(i));
            blockDocumentHeaders.add(i);
            // System.out.println("HAL banner added as header block");
            break;
          }
        }

        // ACM publications
        // System.out.println("test ACM " + i);
        // System.out.println(localText);
        if (localText.startsWith("Permission to make digital or hard copies")) {
          blockFooters.add(i);
          break;
        }

        // arXiv, etc. put here
        // IOP

        if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) {
          blockDocumentHeaders.add(i);
          // System.out.println("IOP banner added as header block");
          break;
        }
        i++;
      }

      // we try to recognize here table and figure blocks
      // the idea is that the textual elements are not located as the normal text blocks
      // this is recognized by exploiting the cluster of blocks starting up and down from the block
      // containing a table or a figure marker
      // two different runs, one for figures and one for tables (everything could be done in one
      // step)
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher m = BasicStructureBuilder.figure.matcher(localText);
        Matcher m2 = BasicStructureBuilder.table.matcher(localText);

        double width = block.getWidth();
        boolean bold = block.getBold();

        // table
        // if ( (m2.find()) && (localText.length() < 200) ) {
        if ((m2.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadTables.contains(i)) {
            blockHeadTables.add(i);
          }
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        // figure
        // else if ( (m.find()) && (localText.length() < 200) ) {
        else if ((m.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i);
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          boolean imageFound = false;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);

            if (b.getText() != null) {
              String localText2 = b.getText().trim();
              // localText = localText.replace("\n", " ");
              localText2 = localText2.replace("  ", " ");
              localText2 = localText2.trim();

              if ((localText2.startsWith("@IMAGE")) && (!imageFound)) {
                // System.out.println(localText2);
                block.setText(block.getText() + " " + localText2);
                // System.out.println(block.getText());
                imageFound = true;
              }

              if ((localText2.length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().trim().length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        i++;
      }
    } finally {
      doc.setBlockHeaders(blockHeaders);
      doc.setBlockFooters(blockFooters);
      doc.setBlockSectionTitles(blockSectionTitles);
      doc.setAcknowledgementBlocks(acknowledgementBlocks);
      doc.setBlockTables(blockTables);
      doc.setBlockFigures(blockFigures);
      doc.setBlockHeadTables(blockHeadTables);
      doc.setBlockHeadFigures(blockHeadFigures);
      doc.setBlockDocumentHeaders(blockDocumentHeaders);
    }
  }