  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {

    try {
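      // flush the accumulated text when a paragraph or the description ends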
      if (qName.equals("p") || qName.equals("description")) {
        writer.write(getText());
        accumulator.setLength(0);
      }

      if (qName.equals("description")) {
        counting = false;
      }

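      // outside the description, echo the closing tag back to the output;
      // inside it, rows contribute a space and paragraphs a line break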
      if (!counting) {
        writer.write(getText());
        accumulator.setLength(0);
        writer.write("</" + qName + ">\n");
      } else {
        if (qName.equals("row")) {
          accumulator.append(" ");
        }
        if (qName.equals("p")) {
          writer.write("\n");
          accumulator.append(" ");
        }
      }
    } catch (Exception e) {
      // e.printStackTrace();
      throw new GrobidException("An exception occurred while running Grobid.", e);
    }
  }
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    try {
      // we output the remaining text and open the new element
      if (!counting) {
        writer.write(getText());
        accumulator.setLength(0);

        writer.write("<" + qName);

        int length = atts.getLength();

        // Process each attribute
        for (int i = 0; i < length; i++) {
          // Get names and values for each attribute
          String name = atts.getQName(i);
          String value = atts.getValue(i);

          if ((name != null) && (value != null)) {
            writer.write(" " + name + "=\"" + value + "\"");
          }
        }

        writer.write(">");
      }

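      // entering the description switches the handler into counting mode;
      // a new patent-document resets it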
      if (qName.equals("description")) {
        offset = 0;
        counting = true;
      } else if (qName.equals("patent-document")) {
        counting = false;
      }
    } catch (Exception e) {
      // e.printStackTrace();
      throw new GrobidException("An exception occurred while running Grobid.", e);
    }
  }
Example #3
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
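    // a new patent document: reset the reference counters and read its bibliographic attributes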
    if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      nbNPLRef = 0;
      nbPatentRef = 0;
      nbAllRef = 0;
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if (name != null) {
          if (name.equals("lang")) {
            // Global_Language_Code = value.toLowerCase();
          }
          if (name.equals("doc-number")) {
            PatentNumber = "EP" + value;
          }
          if (name.equals("kind")) {
            CodeType = value;
          }
          if (name.equals("date")) {
            PublicDate = value;
          }
        }
      }

      CitedPatentNumber = new ArrayList<String>();
      accumulatedText = new StringBuffer();
      allContent = new StringBuffer();
      accumulator.setLength(0);
    } else if (qName.equals("description")) {
      accumulator.setLength(0);
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      int length = atts.getLength();
      nbAllRef++;
      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if (name != null) {
          if (name.equals("type") || name.equals("typ")) {
            if (value.equals("npl") || value.equals("book") || value.equals("journal")) {
              String content = getText();

              // we output what has been read so far in the description
              // we tokenize the text
              // ArrayList<String> tokens =
              // StringTokenizer st = new StringTokenizer(content, delimiters, true);
              List<String> tokenizations = new ArrayList<String>();
              try {
                // TBD: pass a language object to the tokenize method call
                tokenizations = analyzer.tokenize(content);
              } catch (Exception e) {
                LOGGER.debug("Tokenization for XML patent document has failed.");
              }

              // int nbTokens = st.countTokens();
              int nbTokens = tokenizations.size();
              int j = 0;
              // while (st.hasMoreTokens()) {
              for (String token : tokenizations) {
                // String token = st.nextToken().trim();
                if ((token.trim().length() == 0)
                    || (token.equals(" "))
                    || (token.equals("\t"))
                    || (token.equals("\n"))
                    || (token.equals("\r"))) {
                  continue;
                }

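                // keep a window of N tokens around the reference as <other>;
                // everything else in the description context is labelled <ignore>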
                if (((j > (nbTokens - N)) && (N != -1)) || (refFound && (j < N) && (N != -1))) {
                  try {
                    accumulatedText.append(token + "\t" + "<other>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    // e.printStackTrace();
                    throw new GrobidException("An exception occurred while running Grobid.", e);
                  }
                } else {
                  try {
                    accumulatedText.append(token + "\t" + "<ignore>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    // e.printStackTrace();
                    throw new GrobidException("An exception occurred while running Grobid.", e);
                  }
                }
                j++;
              }

              accumulator.setLength(0);

              npl = true;
              ref = true;
            } else if (value.equals("patent") || value.equals("pl")) {
              String content = getText();

              // we output what has been read so far in the description
              // we tokenize the text
              // ArrayList<String> tokens =
              //	TextUtilities.segment(content,"[("+TextUtilities.punctuations);
              // StringTokenizer st = new StringTokenizer(content, delimiters, true);
              List<String> tokenizations = new ArrayList<String>();
              try {
                // TBD: pass a language object to the tokenize method call
                tokenizations = analyzer.tokenize(content);
              } catch (Exception e) {
                LOGGER.debug("Tokenization for XML patent document has failed.");
              }

              // int nbTokens = st.countTokens();
              int nbTokens = tokenizations.size();
              int j = 0;
              for (String token : tokenizations) {
                // while (st.hasMoreTokens()) {
                // String token = st.nextToken().trim();
                if ((token.trim().length() == 0)
                    || (token.equals(" "))
                    || (token.equals("\t"))
                    || (token.equals("\n"))
                    || (token.equals("\r"))) {
                  continue;
                }

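                // same windowing as for NPL references: N tokens around the
                // patent reference are <other>, the rest <ignore>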
                if ((j > (nbTokens - N)) || (refFound && (j < N))) {
                  try {
                    accumulatedText.append(token + "\t" + "<other>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    // e.printStackTrace();
                    throw new GrobidException("An exception occurred while running Grobid.", e);
                  }
                } else {
                  try {
                    accumulatedText.append(token + "\t" + "<ignore>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    // e.printStackTrace();
                    throw new GrobidException("An exception occurred while running Grobid.", e);
                  }
                }
                j++;
              }

              accumulator.setLength(0);
              npl = false;
              ref = true;
            } else {
              System.out.println("Warning: unknown attribute value for ref or bibl: " + value);
              ref = false;
              npl = false;
            }
          }
        }
      }

      accumulatorRef.setLength(0);
    } else if (qName.equals("claim")) {
      accumulator.setLength(0);
    } else if (qName.equals("invention-title")) {
      accumulator.setLength(0);
    } else if (qName.equals("patcit")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if (name != null) {
          if (name.equals("ucid")) {
            cited_number = value;
            // we would normally need to normalize this patent number a little
          }
        }
      }
    }
  }
Example #4
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    if (qName.equals("date")) {
      accumulator.setLength(0);
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      String refString = getRefText();
      refString = refString.replace("\n", " ");
      refString = refString.replace("\t", " ");
      refString = refString.replace("  ", " ");

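      // store the reference string, either as non-patent literature or as a
      // patent citation attached to the current file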
      if (npl && ref) {
        if (referencesNPL == null) referencesNPL = new ArrayList<String>();
        referencesNPL.add(refString);
        refFound = true;
        if (nplReferences) nbNPLRef++;
      } else if (ref) {
        if (referencesPatent == null) {
          referencesPatent = new HashMap<String, ArrayList<String>>();
        }
        ArrayList<String> refss = referencesPatent.get(currentFileName);

        if (refss == null) {
          refss = new ArrayList<String>();
        }

        refss.add(refString);
        referencesPatent.put(currentFileName, refss);
        refFound = true;
        if (patentReferences) {
          nbPatentRef++;
        }
      }

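      // re-emit the reference tokens with begin/inside labels
      // (I-<refNPL>/<refNPL> or I-<refPatent>/<refPatent>)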
      if (refFound) {
        // we tokenize the text
        // ArrayList<String> tokens = TextUtilities.segment(refString,
        // "[("+TextUtilities.punctuations);
        // StringTokenizer st = new StringTokenizer(refString, delimiters, true);
        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(refString);
        } catch (Exception e) {
          LOGGER.debug("Tokenization for XML patent document has failed.");
        }

        int i = 0;
        // String token = null;
        // for(String token : tokens) {
        // while (st.hasMoreTokens()) {
        for (String token : tokenizations) {
          // token = st.nextToken().trim();
          if ((token.trim().length() == 0)
              || (token.equals(" "))
              || (token.equals("\t"))
              || (token.equals("\n"))
              || (token.equals("\r"))) {
            continue;
          }
          try {
            accumulatedText.append(token + "\t");
            allContent.append(token + " ");
            if (npl) {
              if (nplReferences) {
                if (i == 0) {
                  // accumulatedText.append("refNPLBegin\n");
                  accumulatedText.append("I-<refNPL>\n");
                } else if (token == null) {
                  // accumulatedText.append("refNPLEnd\n");
                  accumulatedText.append("E-<refNPL>\n");
                } else {
                  accumulatedText.append("<refNPL>\n");
                }
              } else accumulatedText.append("<other>\n");
            } else {
              if (patentReferences) {
                if (i == 0) accumulatedText.append("I-<refPatent>\n");
                else if (token == null) accumulatedText.append("E-<refPatent>\n");
                else accumulatedText.append("<refPatent>\n");
              } else accumulatedText.append("<other>\n");
            }
          } catch (Exception e) {
            // e.printStackTrace();
            throw new GrobidException("An exception occurred while running Grobid.", e);
          }
          i++;
        }
      }
      ref = false;
    } else if (qName.equals("classification-ipcr")) {
      accumulator.setLength(0);
    } else if (qName.equals("classification-symbol")) {
      accumulator.setLength(0);
    } else if (qName.equals("abstract")) {
      accumulator.setLength(0);
    } else if (qName.equals("heading")) {
      accumulator.append(" ");
    } else if (qName.equals("description")) {
      if (refFound) {
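        // flush the description text that follows the last reference: the first
        // N tokens are kept as <other>, the remainder is labelled <ignore>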
        String content = getText();

        // we tokenize the text
        // ArrayList<String> tokens = TextUtilities.segment(content,
        // "[("+TextUtilities.punctuations);
        // StringTokenizer st = new StringTokenizer(content, delimiters, true);
        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(content);
        } catch (Exception e) {
          LOGGER.debug("Tokenization for XML patent document has failed.");
        }

        int i = 0;
        // String token = null;
        // for(String token : tokens) {
        // while (st.hasMoreTokens()) {
        for (String token : tokenizations) {
          // token = st.nextToken().trim();
          if ((token.trim().length() == 0)
              || (token.equals(" "))
              || (token.equals("\t"))
              || (token.equals("\n"))
              || (token.equals("\r"))) {
            continue;
          }
          // we print only a window of N words
          if ((i > N) && (N != -1)) {
            // break;
            token = token.trim();
            if (token.length() > 0) {
              accumulatedText.append(token + "\t" + "<ignore>\n");
              allContent.append(token + " ");
            }
          } else {
            try {
              token = token.trim();
              if (token.length() > 0) {
                accumulatedText.append(token + "\t" + "<other>\n");
                allContent.append(token + " ");
              }
            } catch (Exception e) {
              throw new GrobidException("An exception occurred while running Grobid.", e);
            }
          }
          i++;
        }

        accumulator.setLength(0);
        refFound = false;
      }
    } else if (qName.equals("patcit")) {
      // we register the citation, the citation context will be marked in a later stage
      if (citations == null) citations = new ArrayList<String>();
      citations.add(cited_number);
      accumulator.setLength(0);
    } else if (qName.equals("invention-title")) {
      accumulator.setLength(0);
    } else if (qName.equals("applicants")) {
      accumulator.setLength(0);
    } else if (qName.equals("inventors")) {
      accumulator.setLength(0);
    } else if (qName.equals("document-id")) {
      accumulator.setLength(0);
    } else if (qName.equals("legal-status")) {
      accumulator.setLength(0);
    } else if (qName.equals("bibliographic-data")) {
      accumulator.setLength(0);
    } else if (qName.equals("doc-number")) {
      accumulator.setLength(0);
    } else if (qName.equals("country")) {
      accumulator.setLength(0);
    } else if (qName.equals("kind")) {
      accumulator.setLength(0);
    } else if (qName.equals("classification-symbol")) {
      accumulator.setLength(0);
    } else if (qName.equals("classification-ecla")) {
      accumulator.setLength(0);
    } else if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      String allString = allContent.toString();
      journalsPositions = lexicon.inJournalNames(allString);
      abbrevJournalsPositions = lexicon.inAbbrevJournalNames(allString);
      conferencesPositions = lexicon.inConferenceNames(allString);
      publishersPositions = lexicon.inPublisherNames(allString);
      allContent = null;
      allString = null;
    } else if (qName.equals("row")) {
      accumulator.append(" ");
    } else if (qName.equals("p")) {
      accumulator.append("\n");
    }
  }
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") &&
    // !qName.equals("TEXT"))
    // System.out.println(qName);

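    // end of a TEXT element: emit a newline token and reset the accumulator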
    if (qName.equals("TEXT")) {
      blabla.append("\n");
      LayoutToken token = new LayoutToken();
      token.setText("\n");
      block.addToken(token);
      accumulator.setLength(0);
      tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
      accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
      String tok0 = TextUtilities.clean(getText());

      if (block.getStartToken() == -1) {
        block.setStartToken(tokenizations.size());
      }

      if (tok0.length() > 0) {
        StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
        boolean diaresis = false;
        boolean accent = false;
        boolean keepLast = false;
        while (st.hasMoreTokens()) {

          diaresis = false;
          accent = false;
          keepLast = false;

          String tok = st.nextToken();
          if (tok.length() > 0) {

            LayoutToken token = new LayoutToken();

            if ((previousToken != null)
                && (tok != null)
                && (previousToken.length() > 0)
                && (tok.length() > 0)
                && blabla.length() > 0) {

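              // if the boundary between the previous token and this one is a base
              // character plus a combining modifier (accent, diaeresis, ...),
              // merge them back into a single character on the previous token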
              Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
              Character rightChar = tok.charAt(0);

              ModifierClass leftClass = classifyChar(leftChar);
              ModifierClass rightClass = classifyChar(rightChar);
              ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;

              if (leftClass != ModifierClass.NOT_A_MODIFIER
                  || rightClass != ModifierClass.NOT_A_MODIFIER) {
                Character baseChar = null;
                Character modifierChar = null;

                if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                  if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                    // assert false;
                    // keeping characters, but setting class
                    // to not a modifier
                    baseChar = leftChar;
                    modifierChar = rightChar;
                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                  } else {
                    baseChar = rightChar;
                    modifierChar = leftChar;
                    modifierClass = leftClass;
                  }
                } else {
                  baseChar = leftChar;
                  modifierChar = rightChar;
                  modifierClass = rightClass;
                }

                String updatedChar = modifyCharacter(baseChar, modifierChar);

                tokenizations.remove(tokenizations.size() - 1);
                if (tokenizations.size() > 0) {
                  tokenizations.remove(tokenizations.size() - 1);
                }

                blabla.deleteCharAt(blabla.length() - 1);
                if (blabla.length() > 0) {
                  blabla.deleteCharAt(blabla.length() - 1);
                }

                removeLastCharacterIfPresent(previousTok);

                if (updatedChar != null) {
                  blabla.append(updatedChar);
                  previousTok.setText(previousTok.getText() + updatedChar);
                }

                blabla.append(tok.substring(1, tok.length()));
                previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                tokenizations.add(previousTok.getText());

                diaresis =
                    (modifierClass == ModifierClass.DIAERESIS
                        || modifierClass == ModifierClass.NORDIC_RING
                        || modifierClass == ModifierClass.CZECH_CARON
                        || modifierClass == ModifierClass.TILDE
                        || modifierClass == ModifierClass.CEDILLA);

                accent =
                    (modifierClass == ModifierClass.ACUTE_ACCENT
                        || modifierClass == ModifierClass.CIRCUMFLEX
                        || modifierClass == ModifierClass.GRAVE_ACCENT);

                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                  tok = ""; // resetting current token as it
                  // is a single-item
                }
              }
            }

            if (tok != null) {
              // actually in certain cases, the extracted string under token can be a chunk of text
              // with separators that need to be preserved
              // tok = tok.replace(" ", "");
            }

            if ((!diaresis) && (!accent)) {
              // blabla.append(" ");
              blabla.append(tok);
              token.setText(tok);

              tokenizations.add(tok);
            } else {
              tok = "";
              keepLast = true;
            }

            /*
             * StringTokenizer st0 = new StringTokenizer(tok0,
             * TextUtilities.fullPunctuations, true);
             * while(st0.hasMoreTokens()) { String tok =
             * st0.nextToken(); tokenizations.add(tok); }
             * tokenizations.add(" ");
             */

            /*
             * boolean punct1 = false; boolean punct2 = false;
             * boolean punct3 = false; String content = null; int i
             * = 0; for(; i<TextUtilities.punctuations.length();
             * i++) { if (tok.length() > 0) { if
             * (tok.charAt(tok.length()-1) ==
             * TextUtilities.punctuations.charAt(i)) { punct1 =
             * true; content = tok.substring(0, tok.length()-1); if
             * (tok.length() > 1) { int j = 0; for(;
             * j<TextUtilities.punctuations.length(); j++) { if
             * (tok.charAt(tok.length()-2) ==
             * TextUtilities.punctuations.charAt(j)) { punct3 =
             * true; content = tok.substring(0, tok.length()-2); } }
             * } break; } } } if (tok.length() > 0) { if (
             * (tok.startsWith("(")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("("); } else if (
             * (tok.startsWith("[")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("["); } else if (
             * (tok.startsWith("\"")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("\""); } }
             */
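            // rotated tokens: halve the reported font size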
            if (currentRotation) currentFontSize = currentFontSize / 2;

            /*
             * if (punct2) { if (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * token = new LayoutToken(); token.setText(content); }
             * if (punct1) { token.setText(content); if (currentFont
             * != null) token.setFont(currentFont.toLowerCase());
             * else token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * if (punct3) { token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-2)); if
             * (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token); }
             *
             * token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-1)); }
             */
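            // copy the current layout attributes (font, style, position, size) onto the token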
            if (currentFont != null) token.setFont(currentFont.toLowerCase());
            else token.setFont("default");
            token.setItalic(currentItalic);
            token.setBold(currentBold);
            token.setRotation(currentRotation);
            token.setColorFont(colorFont);
            token.setX(currentX);
            token.setY(currentY);
            token.setWidth(currentWidth);
            token.setHeight(currentHeight);
            token.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              block.addToken(token);
            }

            if (block.getFont() == null) {
              if (currentFont != null) block.setFont(currentFont.toLowerCase());
              else block.setFont("default");
            }
            if (nbTokens == 0) {
              block.setItalic(currentItalic);
              block.setBold(currentBold);
            }
            if (block.getColorFont() == null) block.setColorFont(colorFont);
            if (block.getX() == 0.0) block.setX(currentX);
            if (block.getY() == 0.0) block.setY(currentY);
            if (block.getWidth() == 0.0) block.setWidth(currentWidth);
            if (block.getHeight() == 0.0) block.setHeight(currentHeight);
            if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              previousToken = tok;
              previousTok = token;
            } else {
              previousToken = previousTok.getText();
            }

            nbTokens++;
            accumulator.setLength(0);
          }
        }
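        // add a separating space unless the previous token ends with a hyphen
        // (most likely an end-of-line hyphenation)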
        if (tokenizations.size() > 0) {
          String justBefore = tokenizations.get(tokenizations.size() - 1);
          if (!justBefore.endsWith("-")) {
            tokenizations.add(" ");
            blabla.append(" ");
          }
        }
      }
      block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
      // page markers are useful for detecting headers (the same first line(s)
      // appearing on each page)
      if (block != null) {
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      Block block0 = new Block();
      block0.setText("@PAGE\n");
      block0.setNbTokens(0);
      block0.setPage(currentPage);
      doc.addBlock(block0);
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      nbTokens = 0;
      // blabla.append("\n@block\n");
      tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      if (block.getX() == 0.0) block.setX(currentX);
      if (block.getY() == 0.0) block.setY(currentY);
      if (block.getWidth() == 0.0) block.setWidth(currentWidth);
      if (block.getHeight() == 0.0) block.setHeight(currentHeight);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }
    /*
     * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) {
     * blabla.append("\n"); block.setText(blabla.toString());
     * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new
     * Block(); block.setPage(currentPage); blabla = new StringBuffer();
     * blabla.append("@IMAGE " + "vectorial \n");
     * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if
     * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0)
     * block.setY(currentY); if (block.getWidth() == 0.0)
     * block.setWidth(currentWidth); if (block.getHeight() == 0.0)
     * block.setHeight(currentHeight); doc.addBlock(block); blabla = new
     * StringBuffer(); nbTokens = 0; block = new Block();
     * block.setPage(currentPage); }
     */
    else if (qName.equals("BLOCK")) {
      blabla.append("\n");
      tokenizations.add("\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);

      block.setWidth(currentX - block.getX() + currentWidth);
      block.setHeight(currentY - block.getY() + currentHeight);

      doc.addBlock(block);
      // blabla = new StringBuffer();
      nbTokens = 0;
      block = null;
    } else if (qName.equals("xi:include")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }

    /*
     * else if (qName.equals("DOCUMENT")) {
     * System.out.println(blabla.toString()); }
     */

  }