/**
   * First pass to detect basic structures: remove page header/footer, identify section numbering,
   * identify Figure and table blocks.
   *
   * <p>-> to be removed at some point!
   *
   * @param doc a document
   */
  public static void firstPass(Document doc) {
    if (doc == null) {
      throw new NullPointerException();
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException();
    }

    int i = 0;
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();
    List<Integer> acknowledgementBlocks = new ArrayList<Integer>();
    List<Integer> blockTables = new ArrayList<Integer>();
    List<Integer> blockFigures = new ArrayList<Integer>();
    List<Integer> blockHeadTables = new ArrayList<Integer>();
    List<Integer> blockHeadFigures = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();

    doc.setTitleMatchNum(false);
    try {
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText);
        Matcher ma2 = BasicStructureBuilder.references.matcher(localText);

        if ((ma1.find()) || (ma2.find())) {
          if (((localText.startsWith("1.")) || (localText.startsWith("1 ")))
              || ((localText.startsWith("2.")) || (localText.startsWith("2 ")))
              || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true);
          // System.out.println("Title section identified: block " + i + ", " + localText);
          blockSectionTitles.add(i);
        } else {
          StringTokenizer st = new StringTokenizer(localText, "\n");
          while (st.hasMoreTokens()) {
            String token = st.nextToken();

            if (token.startsWith("@PAGE")) {
              // current block should give the header/footors
              if (i > 4) {
                if (doc.getBlocks().get(i - 5).getNbTokens() < 20) {
                  Integer i2 = i - 5;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 3) {
                if (doc.getBlocks().get(i - 4).getNbTokens() < 20) {
                  Integer i2 = i - 4;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 2) {
                if (doc.getBlocks().get(i - 3).getNbTokens() < 20) {
                  Integer i2 = i - 3;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 1) {
                if (doc.getBlocks().get(i - 2).getNbTokens() < 20) {
                  Integer i2 = i - 2;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 0) {
                if (doc.getBlocks().get(i - 1).getNbTokens() < 20) {
                  Integer i2 = i - 1;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              blockFooters.add(i);

              // page header candidates
              blockHeaders.add(i);
              if (i < doc.getBlocks().size() - 1) {
                if (doc.getBlocks().get(i + 1).getNbTokens() < 20) {
                  Integer i2 = i + 1;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1);
                }
              }
              if (i < doc.getBlocks().size() - 2) {
                if (doc.getBlocks().get(i + 2).getNbTokens() < 20) {
                  Integer i2 = i + 2;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2);
                }
              }
              if (i < doc.getBlocks().size() - 3) {
                if (doc.getBlocks().get(i + 3).getNbTokens() < 20) {
                  Integer i2 = i + 3;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3);
                }
              }
              if (i < doc.getBlocks().size() - 4) {
                if (doc.getBlocks().get(i + 4).getNbTokens() < 20) {
                  Integer i2 = i + 4;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4);
                }
              }
              // more ??
            }
          }
        }

        // clustering of blocks per font (for section header and figure/table detections)
        addBlockToCluster(i, doc);

        i++;
      }

      // try to find the cluster of section titles
      Cluster candidateCluster = null;
      // System.out.println("nb clusters: " + clusters.size());
      for (Cluster cluster : doc.getClusters()) {
        if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5))
            && (cluster.getNbBlocks() < 20)) {
          List<Integer> blo = cluster.getBlocks2();
          for (Integer b : blo) {
            if (blockSectionTitles.contains(b)) {
              if (candidateCluster == null) {
                candidateCluster = cluster;
                break;
              }
              // else if (cluster.getFontSize() >= candidateCluster.getFontSize())
              //	candidateCluster = cluster;
            }
          }
        }
      }
      if (candidateCluster != null) {
        List<Integer> newBlockSectionTitles = new ArrayList<Integer>();
        for (Integer bl : blockSectionTitles) {
          if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
        }

        List<Integer> blockClusterTitles = candidateCluster.getBlocks2();
        if (blockClusterTitles.size() < 20) {
          for (Integer bl : blockClusterTitles) {
            if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
          }
        }

        blockSectionTitles = newBlockSectionTitles;
      }

      // aknowledgement section recognition
      boolean ackn = false;
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // System.out.println(i + ": " + localText+"\n");

        Integer iii = i;
        Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText);
        if ((m3.find()) && (blockSectionTitles.contains(iii))) {
          acknowledgementBlocks.add(iii);
          ackn = true;
          // int index = blockSectionTitles.indexOf(iii);
          // blockSectionTitles.remove(index);
        } else if ((ackn) && (blockSectionTitles.contains(iii))) {
          ackn = false;
          break;
        } else if (ackn) {
          Matcher m4 = BasicStructureBuilder.references.matcher(localText);
          if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) {
            acknowledgementBlocks.add(iii);
          } else if (m4.find()) {
            ackn = false;
            break;
          }
        }
        i++;
      }

      // we remove references headers in blockSectionTitles
      int index = -1;
      for (Integer ii : blockSectionTitles) {
        Block block = doc.getBlocks().get(ii);
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();
        Matcher m4 = BasicStructureBuilder.references.matcher(localText);
        if (m4.find()) {
          index = blockSectionTitles.indexOf(ii);
          break;
        }
      }
      if (index != -1) {
        blockSectionTitles.remove(index);
      }

      // we check headers repetition from page to page to decide if it is an header or not
      ArrayList<Integer> toRemove = new ArrayList<Integer>();
      for (Integer ii : blockHeaders) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("header candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockHeaders) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                // System.out.println("dist with " + localText2 + " : " + dist);
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockHeaders.remove(ii);
      }

      // same for footers
      toRemove = new ArrayList<Integer>();
      for (Integer ii : blockFooters) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("footer candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockFooters) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockFooters.remove(ii);
      }

      // a special step for added banner repositoryies such HAL
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // HAL
        if (localText.startsWith("Author manuscript, published in")) {
          Double y = block.getY();
          // System.out.println("HAL banner found, " + "block " + i + ", y = " + y);
          if (Math.abs(y - 12.538) < 2) { // reference position
            // blockHeaders.add(new Integer(i));
            blockDocumentHeaders.add(i);
            // System.out.println("HAL banner added as header block");
            break;
          }
        }

        // ACM publications
        // System.out.println("test ACM " + i);
        // System.out.println(localText);
        if (localText.startsWith("Permission to make digital or hard copies")) {
          blockFooters.add(i);
          break;
        }

        // arXiv, etc. put here
        // IOP

        if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) {
          blockDocumentHeaders.add(i);
          // System.out.println("IOP banner added as header block");
          break;
        }
        i++;
      }

      // we try to recognize here table and figure blocks
      // the idea is that the textual elements are not located as the normal text blocks
      // this is recognized by exploiting the cluster of blocks starting up and down front the block
      // containing a table or a figure marker
      // two different runs, one for figures and one for tables (everything could be done in one
      // step)
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher m = BasicStructureBuilder.figure.matcher(localText);
        Matcher m2 = BasicStructureBuilder.table.matcher(localText);

        double width = block.getWidth();
        boolean bold = block.getBold();

        // table
        // if ( (m2.find()) && (localText.length() < 200) ) {
        if ((m2.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadTables.contains(i)) {
            blockHeadTables.add(i);
          }
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        // figure
        // else if ( (m.find()) && (localText.length() < 200) ) {
        else if ((m.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i);
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          boolean imageFound = false;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);

            if (b.getText() != null) {
              String localText2 = b.getText().trim();
              // localText = localText.replace("\n", " ");
              localText2 = localText2.replace("  ", " ");
              localText2 = localText2.trim();

              if ((localText2.startsWith("@IMAGE")) && (!imageFound)) {
                // System.out.println(localText2);
                block.setText(block.getText() + " " + localText2);
                // System.out.println(block.getText());
                imageFound = true;
              }

              if ((localText2.length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().trim().length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        i++;
      }
    } finally {
      doc.setBlockHeaders(blockHeaders);
      doc.setBlockFooters(blockFooters);
      doc.setBlockSectionTitles(blockSectionTitles);
      doc.setAcknowledgementBlocks(acknowledgementBlocks);
      doc.setBlockTables(blockTables);
      doc.setBlockFigures(blockFigures);
      doc.setBlockHeadTables(blockHeadTables);
      doc.setBlockHeadFigures(blockHeadFigures);
      doc.setBlockDocumentHeaders(blockDocumentHeaders);
    }
  }
  /**
   * Filter out line numbering possibly present in the document. This can be frequent for document
   * in a review/submission format and degrades strongly the machine learning extraction results.
   *
   * <p>-> Not used !
   *
   * @param doc a document
   * @return if found numbering
   */
  public boolean filterLineNumber(Document doc) {
    // we first test if we have a line numbering by checking if we have an increasing integer
    // at the begin or the end of each block
    boolean numberBeginLine = false;
    boolean numberEndLine = false;

    boolean foundNumbering = false;

    int currentNumber = -1;
    int lastNumber = -1;
    int i = 0;
    for (Block block : doc.getBlocks()) {
      //            Integer ii = i;

      String localText = block.getText();
      List<LayoutToken> tokens = block.tokens;

      if ((localText != null) && (tokens != null)) {
        if (tokens.size() > 0) {
          // we get the first and last token iof the block
          // String tok1 = tokens.get(0).getText();
          // String tok2 = tokens.get(tokens.size()).getText();
          localText = localText.trim();

          Matcher ma1 = startNum.matcher(localText);
          Matcher ma2 = endNum.matcher(localText);

          if (ma1.find()) {
            String groupStr = ma1.group(0);
            try {
              currentNumber = Integer.parseInt(groupStr);
              numberBeginLine = true;
            } catch (NumberFormatException e) {
              currentNumber = -1;
            }
          } else if (ma2.find()) {
            String groupStr = ma2.group(0);
            try {
              currentNumber = Integer.parseInt(groupStr);
              numberEndLine = true;
            } catch (NumberFormatException e) {
              currentNumber = -1;
            }
          }

          if (lastNumber != -1) {
            if (currentNumber == lastNumber + 1) {
              foundNumbering = true;
              break;
            }
          } else lastNumber = currentNumber;
        }
      }
      i++;

      if (i > 5) {
        break;
      }
    }

    i = 0;
    if (foundNumbering) {
      // we have a line numbering, so we filter them
      int counter = 1; // we start at 1, if the actual start is 0,
      // it will remain (as it is negligeable)

      for (Block block : doc.getBlocks()) {

        String localText = block.getText();
        List<LayoutToken> tokens = block.tokens;

        if ((localText != null) && (tokens.size() > 0)) {

          if (numberEndLine) {
            Matcher ma2 = endNum.matcher(localText);

            if (ma2.find()) {
              String groupStr = ma2.group(0);
              if (groupStr.trim().equals("" + counter)) {
                localText = localText.substring(0, localText.length() - groupStr.length());
                block.setText(localText);
                tokens.remove(tokens.size() - 1);
                counter++;
              }
            }

          } else if (numberBeginLine) {
            Matcher ma1 = endNum.matcher(localText);

            if (ma1.find()) {
              String groupStr = ma1.group(0);
              if (groupStr.trim().equals("" + counter)) {
                localText = localText.substring(groupStr.length(), localText.length() - 1);
                block.setText(localText);
                tokens.remove(0);
                counter++;
              }
            }
          }
        }
        i++;
      }
    }

    return foundNumbering;
  }
示例#3
0
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") &&
    // !qName.equals("TEXT"))
    // System.out.println(qName);

    if (qName.equals("TEXT")) {
      blabla.append("\n");
      LayoutToken token = new LayoutToken();
      token.setText("\n");
      block.addToken(token);
      accumulator.setLength(0);
      tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
      accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
      String tok0 = TextUtilities.clean(getText());

      if (block.getStartToken() == -1) {
        block.setStartToken(tokenizations.size());
      }

      if (tok0.length() > 0) {
        StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
        boolean diaresis = false;
        boolean accent = false;
        boolean keepLast = false;
        while (st.hasMoreTokens()) {

          diaresis = false;
          accent = false;
          keepLast = false;

          String tok = st.nextToken();
          if (tok.length() > 0) {

            LayoutToken token = new LayoutToken();

            if ((previousToken != null)
                && (tok != null)
                && (previousToken.length() > 0)
                && (tok.length() > 0)
                && blabla.length() > 0) {

              Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
              Character rightChar = tok.charAt(0);

              ModifierClass leftClass = classifyChar(leftChar);
              ModifierClass rightClass = classifyChar(rightChar);
              ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;

              if (leftClass != ModifierClass.NOT_A_MODIFIER
                  || rightClass != ModifierClass.NOT_A_MODIFIER) {
                Character baseChar = null;
                Character modifierChar = null;

                if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                  if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                    // assert false;
                    // keeping characters, but setting class
                    // to not a modifier
                    baseChar = leftChar;
                    modifierChar = rightChar;
                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                  } else {
                    baseChar = rightChar;
                    modifierChar = leftChar;
                    modifierClass = leftClass;
                  }
                } else {
                  baseChar = leftChar;
                  modifierChar = rightChar;
                  modifierClass = rightClass;
                }

                String updatedChar = modifyCharacter(baseChar, modifierChar);

                tokenizations.remove(tokenizations.size() - 1);
                if (tokenizations.size() > 0) {
                  tokenizations.remove(tokenizations.size() - 1);
                }

                blabla.deleteCharAt(blabla.length() - 1);
                if (blabla.length() > 0) {
                  blabla.deleteCharAt(blabla.length() - 1);
                }

                removeLastCharacterIfPresent(previousTok);

                if (updatedChar != null) {
                  blabla.append(updatedChar);
                  previousTok.setText(previousTok.getText() + updatedChar);
                }

                blabla.append(tok.substring(1, tok.length()));
                previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                tokenizations.add(previousTok.getText());

                diaresis =
                    (modifierClass == ModifierClass.DIAERESIS
                        || modifierClass == ModifierClass.NORDIC_RING
                        || modifierClass == ModifierClass.CZECH_CARON
                        || modifierClass == ModifierClass.TILDE
                        || modifierClass == ModifierClass.CEDILLA);

                accent =
                    (modifierClass == ModifierClass.ACUTE_ACCENT
                        || modifierClass == ModifierClass.CIRCUMFLEX
                        || modifierClass == ModifierClass.GRAVE_ACCENT);

                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                  tok = ""; // resetting current token as it
                  // is a single-item
                }
              }
            }

            if (tok != null) {
              // actually in certain cases, the extracted string under token can be a chunk of text
              // with separators that need to be preserved
              // tok = tok.replace(" ", "");
            }

            if ((!diaresis) && (!accent)) {
              // blabla.append(" ");
              blabla.append(tok);
              token.setText(tok);

              tokenizations.add(tok);
            } else {
              tok = "";
              keepLast = true;
            }

            /*
             * StringTokenizer st0 = new StringTokenizer(tok0,
             * TextUtilities.fullPunctuations, true);
             * while(st0.hasMoreTokens()) { String tok =
             * st0.nextToken(); tokenizations.add(tok); }
             * tokenizations.add(" ");
             */

            /*
             * boolean punct1 = false; boolean punct2 = false;
             * boolean punct3 = false; String content = null; int i
             * = 0; for(; i<TextUtilities.punctuations.length();
             * i++) { if (tok.length() > 0) { if
             * (tok.charAt(tok.length()-1) ==
             * TextUtilities.punctuations.charAt(i)) { punct1 =
             * true; content = tok.substring(0, tok.length()-1); if
             * (tok.length() > 1) { int j = 0; for(;
             * j<TextUtilities.punctuations.length(); j++) { if
             * (tok.charAt(tok.length()-2) ==
             * TextUtilities.punctuations.charAt(j)) { punct3 =
             * true; content = tok.substring(0, tok.length()-2); } }
             * } break; } } } if (tok.length() > 0) { if (
             * (tok.startsWith("(")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("("); } else if (
             * (tok.startsWith("[")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("["); } else if (
             * (tok.startsWith("\"")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("\""); } }
             */
            if (currentRotation) currentFontSize = currentFontSize / 2;

            /*
             * if (punct2) { if (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * token = new LayoutToken(); token.setText(content); }
             * if (punct1) { token.setText(content); if (currentFont
             * != null) token.setFont(currentFont.toLowerCase());
             * else token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * if (punct3) { token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-2)); if
             * (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token); }
             *
             * token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-1)); }
             */
            if (currentFont != null) token.setFont(currentFont.toLowerCase());
            else token.setFont("default");
            token.setItalic(currentItalic);
            token.setBold(currentBold);
            token.setRotation(currentRotation);
            token.setColorFont(colorFont);
            token.setX(currentX);
            token.setY(currentY);
            token.setWidth(currentWidth);
            token.setHeight(currentHeight);
            token.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              block.addToken(token);
            }

            if (block.getFont() == null) {
              if (currentFont != null) block.setFont(currentFont.toLowerCase());
              else token.setFont("default");
            }
            if (nbTokens == 0) {
              block.setItalic(currentItalic);
              block.setBold(currentBold);
            }
            if (block.getColorFont() == null) block.setColorFont(colorFont);
            if (block.getX() == 0.0) block.setX(currentX);
            if (block.getY() == 0.0) block.setY(currentY);
            if (block.getWidth() == 0.0) block.setWidth(currentWidth);
            if (block.getHeight() == 0.0) block.setHeight(currentHeight);
            if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              previousToken = tok;
              previousTok = token;
            } else {
              previousToken = previousTok.getText();
            }

            nbTokens++;
            accumulator.setLength(0);
          }
        }
        if (tokenizations.size() > 0) {
          String justBefore = tokenizations.get(tokenizations.size() - 1);
          if (!justBefore.endsWith("-")) {
            tokenizations.add(" ");
            blabla.append(" ");
          }
        }
      }
      block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
      // page marker are usefull to detect headers (same first line(s)
      // appearing on each page)
      if (block != null) {
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      Block block0 = new Block();
      block0.setText("@PAGE\n");
      block0.setNbTokens(0);
      block0.setPage(currentPage);
      doc.addBlock(block0);
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      nbTokens = 0;
      // blabla.append("\n@block\n");
      tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      if (block.getX() == 0.0) block.setX(currentX);
      if (block.getY() == 0.0) block.setY(currentY);
      if (block.getWidth() == 0.0) block.setWidth(currentWidth);
      if (block.getHeight() == 0.0) block.setHeight(currentHeight);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }
    /*
     * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) {
     * blabla.append("\n"); block.setText(blabla.toString());
     * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new
     * Block(); block.setPage(currentPage); blabla = new StringBuffer();
     * blabla.append("@IMAGE " + "vectorial \n");
     * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if
     * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0)
     * block.setY(currentY); if (block.getWidth() == 0.0)
     * block.setWidth(currentWidth); if (block.getHeight() == 0.0)
     * block.setHeight(currentHeight); doc.addBlock(block); blabla = new
     * StringBuffer(); nbTokens = 0; block = new Block();
     * block.setPage(currentPage); }
     */
    else if (qName.equals("BLOCK")) {
      blabla.append("\n");
      tokenizations.add("\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);

      block.setWidth(currentX - block.getX() + currentWidth);
      block.setHeight(currentY - block.getY() + currentHeight);

      doc.addBlock(block);
      // blabla = new StringBuffer();
      nbTokens = 0;
      block = null;
    } else if (qName.equals("xi:include")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }

    /*
     * else if (qName.equals("DOCUMENT")) {
     * System.out.println(blabla.toString()); }
     */

  }