Ejemplo n.º 1
0
  /**
   * Collects every outline of the glyph body as an array of {@code XContour}. Used for TTF
   * building.
   *
   * <p>Local contours are converted through {@code toQuadratic()}; module invocations are converted
   * through {@code toContour(...)}, each one starting from a freshly constructed
   * {@code AffineTransform}.
   *
   * @return the collected contours (local contours first, then modules), or {@code null} when the
   *     glyph body has neither
   */
  private XContour[] toContours() {
    ArrayList<XContour> outlines = new ArrayList<>();

    for (XContour local : m_glyph.getBody().getContour()) {
      outlines.add(((EContour) local).toQuadratic());
    }

    for (XModule invoked : m_glyph.getBody().getModule()) {
      // push and pop happens inside toContour
      outlines.add(((EModuleInvoke) invoked).toContour(new AffineTransform()));
    }

    if (outlines.isEmpty()) {
      return null;
    }
    return outlines.toArray(new XContour[outlines.size()]);
  }
Ejemplo n.º 2
0
 /**
  * Parses an XML fragment by surrounding the caller's character stream with a synthetic
  * {@code <root>} start tag and {@code </root>} end tag before handing it to a SAX parser.
  *
  * <p>SAX events are delivered to a new {@code XMLFragmentReceiver} constructed around
  * {@code xmlReceiver}.
  *
  * @param reader source of the fragment text
  * @param xmlReceiver receiver passed to the fragment receiver
  * @throws SAXException if obtaining the reader or parsing fails
  * @throws OXFException wrapping any {@code IOException} raised while parsing
  */
 public static void parseDocumentFragment(Reader reader, XMLReceiver xmlReceiver)
     throws SAXException {
   try {
     final XMLReader saxReader = newSAXParser(XMLUtils.ParserConfiguration.PLAIN).getXMLReader();
     final XMLFragmentReceiver fragmentHandler = new XMLFragmentReceiver(xmlReceiver);
     saxReader.setContentHandler(fragmentHandler);

     // Concatenated in order: opening tag, caller's content, closing tag.
     final ArrayList<Reader> parts = new ArrayList<Reader>(3);
     parts.add(new StringReader("<root>"));
     parts.add(reader);
     parts.add(new StringReader("</root>"));

     final InputSource wrappedSource = new InputSource(new SequenceReader(parts.iterator()));
     saxReader.parse(wrappedSource);
   } catch (IOException ioFailure) {
     throw new OXFException(ioFailure);
   }
 }
  /**
   * Downloads {@code book.xml} for the current {@code cdn}/{@code crcod} pair and returns the
   * {@code path} attribute of every {@code <page>} element, in document order.
   *
   * <p>Side effect: {@code total} is set to the number of {@code <page>} elements found.
   *
   * <p>The document is fetched over plain HTTP from a remote host, so the DOM parser is hardened
   * against XXE: secure processing is enabled and external general/parameter entities are never
   * resolved.
   *
   * @return the page paths; an element without a {@code path} attribute contributes an empty string
   * @throws Exception if the handshake, the download, the parser configuration or the parsing fails
   */
  public ArrayList<String> parseXML() throws Exception {
    handshake();

    // rid is a random value in [0, 10000) appended to the query string
    URL url =
        new URL(
            "http://mangaonweb.com/page.do?cdn="
                + cdn
                + "&cpn=book.xml&crcod="
                + crcod
                + "&rid="
                + (int) (Math.random() * 10000));
    String page = DownloaderUtils.getPage(url.toString(), "UTF-8", cookies);

    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // Untrusted remote XML: do not let a DOCTYPE pull in local files or remote resources.
    factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);

    DocumentBuilder builder = factory.newDocumentBuilder();
    Document d = builder.parse(new InputSource(new StringReader(page)));
    Element doc = d.getDocumentElement();

    NodeList pages = doc.getElementsByTagName("page");
    int pageCount = pages.getLength();
    total = pageCount;

    ArrayList<String> ret = new ArrayList<String>(pageCount);
    for (int i = 0; i < pageCount; i++) {
      Element e = (Element) pages.item(i);
      ret.add(e.getAttribute("path"));
    }

    return ret;
  }
Ejemplo n.º 4
0
  /**
   * Builds a {@code TTGlyph} (flagged via {@code setSimple(true)}) from this glyph's contours.
   *
   * <p>Steps, in order: every contour point is flattened into one list and the index of each
   * contour's last point is registered as an end point; every point is loaded; grid-fit data is
   * loaded for rounded points (followed by the {@code IUP1} and {@code IUP0} instructions when at
   * least one point was rounded); finally hints are loaded for points that have any.
   *
   * @return {@code null} when there are no contours and the glyph is not required; a glyph carrying
   *     only the advance width when there are no contours but the glyph is required; otherwise the
   *     fully populated glyph
   */
  public TTGlyph toSimpleGlyph() {
    // flatten the glyph into an array of contours first
    XContour[] outlines = toContours();
    if ((outlines == null) && (!isRequiredGlyph())) {
      return null;
    }

    TTGlyph glyph = new TTGlyph();
    glyph.setSimple(true);
    glyph.setAdvanceWidth(getAdvanceWidth());

    if (outlines == null) {
      return glyph;
    }

    ArrayList<EContourPoint> allPoints = new ArrayList<>();
    for (XContour outline : outlines) {
      for (XContourPoint raw : outline.getContourPoint()) {
        allPoints.add((EContourPoint) raw);
      }
      // index of this contour's last point within the flattened list
      glyph.addEndPoint(allPoints.size() - 1);
    }

    for (EContourPoint each : allPoints) {
      loadContourPoint(glyph, each);
    }

    // The point index is part of the grid-fit call, hence the indexed loop.
    boolean anyRounded = false;
    for (int index = 0; index < allPoints.size(); index++) {
      EContourPoint candidate = allPoints.get(index);
      if (candidate.isRounded()) {
        anyRounded = true;
        loadGridfit(glyph, candidate, index);
      }
    }

    if (anyRounded) {
      glyph.addInstruction(TTGlyph.IUP1);
      glyph.addInstruction(TTGlyph.IUP0);
    }

    // Same here: the hint loader needs the point index.
    for (int index = 0; index < allPoints.size(); index++) {
      EContourPoint candidate = allPoints.get(index);
      if (candidate.getHint().length != 0) {
        loadHint(glyph, candidate, index);
      }
    }

    return glyph;
  }
  /**
   * Converts a SPARQL XML result document into a {@code QueryResult}.
   *
   * <p>The variable names are read from {@code <head>/<variable name="...">}. For every
   * {@code <result>} element one text entry is added to the query result: for each requested row
   * name, every {@code <binding>} with that name contributes a line {@code "name: value"}, where
   * the value is the normalized text of the binding's first child element; lines are joined with
   * {@code '\n'}.
   *
   * @param queryString the query text, passed through to the result object
   * @param queryNr the query number, passed through to the result object
   * @param sorted passed through to the result object
   * @param doc the parsed SPARQL result document
   * @param rows names of the bindings to extract, in output order
   * @return the populated query result
   */
  private QueryResult gatherResultInfoForSelectQuery(
      String queryString, int queryNr, boolean sorted, Document doc, String[] rows) {
    Namespace sparqlNs = Namespace.getNamespace("http://www.w3.org/2005/sparql-results#");
    Element root = doc.getRootElement();

    // <head>: collect the names of the result variables
    ArrayList<String> headList = new ArrayList<String>();
    for (Object variable : root.getChild("head", sparqlNs).getChildren("variable", sparqlNs)) {
      headList.add(((Element) variable).getAttributeValue("name"));
    }

    // <results>: one <result> element per row
    List resultChildren = root.getChild("results", sparqlNs).getChildren("result", sparqlNs);
    QueryResult queryResult =
        new QueryResult(queryNr, queryString, resultChildren.size(), sorted, headList);

    for (Object entry : resultChildren) {
      Element resultElement = (Element) entry;
      StringBuilder text = new StringBuilder();

      // get the row values and paste them together into one String
      for (String rowName : rows) {
        List bindings = resultElement.getChildren("binding", sparqlNs);
        for (Object candidate : bindings) {
          Element binding = (Element) candidate;
          if (!binding.getAttributeValue("name").equals(rowName)) {
            continue;
          }
          if (text.length() > 0) {
            text.append("\n");
          }
          text.append(rowName)
              .append(": ")
              .append(((Element) binding.getChildren().get(0)).getTextNormalize());
        }
      }

      queryResult.addResult(text.toString());
    }
    return queryResult;
  }
Ejemplo n.º 6
0
  /**
   * SAX end-of-element callback.
   *
   * <ul>
   *   <li>{@code PROPSTABLE}: nothing to do.
   *   <li>{@code PROPSROW}: the current row ({@code curProps}) is added to {@code propsAl} when no
   *       description filter ({@code PROPSROW_DESC}) is set, or when the row's description equals
   *       the filter.
   *   <li>Any other element: {@code textFlag} is cleared.
   * </ul>
   *
   * @param uri namespace URI (unused)
   * @param sName simple name (unused)
   * @param qName qualified name of the element that just ended
   */
  public void endElement(String uri, String sName, String qName) {
    if (qName.equals("PROPSTABLE")) {
      return;
    }

    if (!qName.equals("PROPSROW")) {
      // LogUtil.fine("/"+qName);
      textFlag = false;
      return;
    }

    boolean rowWanted =
        PROPSROW_DESC == null
            || (curPROPSROW_DESC != null && curPROPSROW_DESC.equals(PROPSROW_DESC));
    if (rowWanted) {
      propsAl.add(curProps);
    }
  }
Ejemplo n.º 7
0
 /**
  * Fills the {@code LOCALES} list from the given configuration nodes.
  *
  * <p>Each node is read for its {@code label}, {@code isoCode} and {@code dictionaryUrl}
  * attributes. A node whose dictionary URL is malformed is logged as a warning and skipped.
  *
  * @param list the configuration list
  */
 private static void parseLocales(NodeList list) {
   for (int index = 0; index < list.getLength(); ++index) {
     NamedNodeMap attrs = list.item(index).getAttributes();

     Attr labelAttr = (Attr) attrs.getNamedItem("label");
     String label = labelAttr.getValue();
     Attr codeAttr = (Attr) attrs.getNamedItem("isoCode");
     String code = codeAttr.getValue();
     Attr urlAttr = (Attr) attrs.getNamedItem("dictionaryUrl");
     String dictLocation = urlAttr.getValue();

     try {
       URL dictionary = new URL(dictLocation);
       LOCALES.add(new Locale(label, code, dictionary));
     } catch (MalformedURLException badUrl) {
       logger.warn(
           "Unable to parse dictionary location of " + label + " (" + dictLocation + ")", badUrl);
     }
   }
 }
  // Add/Insert
  /**
   * Inserts a script so that the list stays ordered by name, then notifies the table listeners.
   *
   * <p>The script is placed in front of the first existing entry whose name compares greater than
   * or equal to its own name, or at the end when there is no such entry.
   *
   * @param script the script to insert
   * @return the row index at which the script was inserted
   */
  public int add(Script script) {
    final int existingCount = scripts.size();
    int row = 0;

    // Skip every entry that sorts strictly before the new script.
    while (row < existingCount
        && ((Script) scripts.get(row)).getName().compareTo(script.getName()) < 0) {
      row++;
    }

    scripts.add(row, script);

    // Update the table
    fireTableRowsInserted(row, row);

    return row;
  }
Ejemplo n.º 9
0
    /**
     * SAX end-of-element callback for the record import.
     *
     * <p>When the closing element is the record element ({@code DataRecord.ENCODING_RECORD}) and a
     * record is open, the record is imported (counted when the import succeeds) and the per-record
     * state is cleared. Otherwise, when a record is open and a field value is available, the value
     * is stored in the record, either parsed from text or set directly depending on
     * {@code textImport}, and the field's equivalent fields are added to {@code fieldsInImport}.
     * Any exception raised while storing the value is logged as an import warning.
     *
     * @param uri namespace URI
     * @param localName local element name, compared against the record element name
     * @param qname qualified element name
     */
    public void endElement(String uri, String localName, String qname) {
      super.endElement(uri, localName, qname);

      boolean recordFinished = record != null && localName.equals(DataRecord.ENCODING_RECORD);
      if (recordFinished) {
        if (dataImport.importRecord(record, fieldsInImport)) {
          count++;
        }
        record = null;
        fieldsInImport = null;
      }

      String fieldValue = getFieldValue();
      if (record == null || fieldValue == null) {
        return;
      }

      // end of field
      try {
        if (!textImport) {
          record.set(fieldName, fieldValue.trim());
        } else if (!record.setFromText(fieldName, fieldValue.trim())) {
          dataImport.logImportWarning(
              record,
              "Value '"
                  + fieldValue
                  + "' for Field '"
                  + fieldName
                  + "' corrected to '"
                  + record.getAsText(fieldName)
                  + "'");
        }
        for (String equivalent : record.getEquivalentFields(fieldName)) {
          fieldsInImport.add(equivalent);
        }
      } catch (Exception esetvalue) {
        dataImport.logImportWarning(
            record,
            "Cannot set value '"
                + fieldValue
                + "' for Field '"
                + fieldName
                + "': "
                + esetvalue.toString());
      }
    }
Ejemplo n.º 10
0
 /**
  * Handles a tag start event of the XML document being parsed.
  *
  * <p>For this descriptor's own tag, the class name is read from the tag's attributes. For an
  * attribute tag ({@code AttributeDescriptor.TAG_NAME}), a new {@code AttributeDescriptor} is
  * registered, pushed as the current handler and given the same event. Any other tag is ignored.
  * Tag names are compared case-insensitively.
  *
  * @param uri namespace for tag being processed
  * @param localName tag name
  * @param qName fully qualified name for tag
  * @param attributes tag attributes
  * @throws SAXException if parsing fails
  */
 @Override
 public void startElement(String uri, String localName, String qName, Attributes attributes)
     throws SAXException {
   if (qName.equalsIgnoreCase(getTagName())) {
     // the class itself: remember its name
     mClassName = attributes.getValue(TAG_ATTRIBUTES.name.toString());
     return;
   }

   if (!qName.equalsIgnoreCase(AttributeDescriptor.TAG_NAME)) {
     return;
   }

   // an attribute of the class: register it, then let it parse itself
   AttributeDescriptor descriptor = new AttributeDescriptor();
   mAttributes.add(descriptor);
   mCtx.pushHandler(descriptor);
   descriptor.startElement(uri, localName, qName, attributes);
 }
Ejemplo n.º 11
0
 /**
  * Imports records from the CSV file {@code filename}, decoded with {@code encoding}.
  *
  * <p>Blank lines are skipped. The first remaining line is the header: every cell names a field,
  * and a cell of the form {@code #name#} has its markers stripped and additionally becomes
  * {@code overrideKeyField}. Every following line produces one new record; non-empty cells are
  * trimmed and set through {@code setFromText}, and a warning is logged when a value is corrected
  * or cannot be set. The record is then handed to {@code importRecord}.
  *
  * <p>Any exception aborts the import: it is logged, {@code errorCount} is incremented and, in a
  * GUI application, an error dialog is shown. The reader is closed in every case.
  *
  * @return the number of records for which {@code importRecord} returned {@code true}
  */
 public int runCsvImport() {
   int count = 0;
   // try-with-resources: the file is closed even when reading or importing a line fails
   try (BufferedReader f =
       new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding))) {
     int linecnt = 0;
     String[] header = null;
     ArrayList<String> fieldsInImport = new ArrayList<String>();
     String s;
     DataRecord dummyRecord = storageObject.createNewRecord();
     while ((s = f.readLine()) != null) {
       s = s.trim();
       if (s.length() == 0) {
         continue;
       }
       Vector<String> fields = splitFields(s);
       if (fields.size() > 0) {
         if (linecnt == 0) {
           // header
           header = new String[fields.size()];
           for (int i = 0; i < fields.size(); i++) {
             header[i] = fields.get(i);
             // "#name#" marks the key field. The length check is on the cell text (not on the
             // number of columns) so that a lone "#" cannot reach substring(1, 0).
             if (header[i].startsWith("#")
                 && header[i].endsWith("#")
                 && header[i].length() > 2) {
               header[i] = header[i].substring(1, header[i].length() - 1).trim();
               overrideKeyField = header[i];
             }
             String[] equivFields = dummyRecord.getEquivalentFields(header[i]);
             for (String ef : equivFields) {
               fieldsInImport.add(ef);
             }
           }
         } else {
           // fields
           DataRecord r = storageObject.createNewRecord();
           for (int i = 0; i < header.length; i++) {
             // lines shorter than the header simply leave the remaining fields unset
             String value = (fields.size() > i ? fields.get(i) : null);
             if (value != null && value.length() > 0) {
               try {
                 if (!r.setFromText(header[i], value.trim())) {
                   logImportWarning(
                       r,
                       "Value '"
                           + value
                           + "' for Field '"
                           + header[i]
                           + "' corrected to '"
                           + r.getAsText(header[i])
                           + "'");
                 }
               } catch (Exception esetvalue) {
                 logImportWarning(
                     r,
                     "Cannot set value '"
                         + value
                         + "' for Field '"
                         + header[i]
                         + "': "
                         + esetvalue.toString());
               }
             }
           }
           if (importRecord(r, fieldsInImport)) {
             count++;
           }
         }
       }
       linecnt++;
     }
   } catch (Exception e) {
     logInfo(e.toString());
     errorCount++;
     Logger.log(e);
     if (Daten.isGuiAppl()) {
       Dialog.error(e.toString());
     }
   }
   return count;
 }
Ejemplo n.º 12
0
  /**
   * SAX end-of-element callback; dispatches on {@code qName}.
   *
   * <ul>
   *   <li>{@code ref} / {@code bibl}: takes the reference text from {@code getRefText()}, replaces
   *       newlines and tabs by spaces, records it in {@code referencesNPL} or in
   *       {@code referencesPatent} (keyed by {@code currentFileName}) depending on the {@code npl}
   *       and {@code ref} flags, then tokenizes it and appends one "token TAB label" line per
   *       non-blank token to {@code accumulatedText} (and the token to {@code allContent}).
   *   <li>{@code description}: only when {@code refFound} is set, tokenizes {@code getText()} and
   *       appends every non-blank token labelled {@code <other>}, or {@code <ignore>} once the
   *       token index exceeds {@code N} ({@code N == -1} disables that window); then resets
   *       {@code accumulator} and {@code refFound}.
   *   <li>{@code patcit}: adds {@code cited_number} to {@code citations}.
   *   <li>{@code patent-document} / {@code fulltext-document}: runs the lexicon lookups over the
   *       text gathered in {@code allContent} and then sets {@code allContent} to {@code null}.
   *   <li>{@code heading}, {@code row}: append a space to {@code accumulator}; {@code p}: appends a
   *       newline.
   *   <li>The remaining recognised elements only reset {@code accumulator}.
   * </ul>
   *
   * @param uri namespace URI (not used)
   * @param localName local name (not used)
   * @param qName qualified name of the element that just ended
   * @throws SAXException declared for the SAX contract
   */
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    if (qName.equals("date")) {
      accumulator.setLength(0);
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      String refString = getRefText();
      refString = refString.replace("\n", " ");
      refString = refString.replace("\t", " ");
      // NOTE(review): single pass only - a run of three or more spaces is shortened, not collapsed
      // to one space.
      refString = refString.replace("  ", " ");

      if (npl && ref) {
        // non-patent-literature reference
        if (referencesNPL == null) referencesNPL = new ArrayList<String>();
        referencesNPL.add(refString);
        refFound = true;
        if (nplReferences) nbNPLRef++;
      } else if (ref) {
        // patent reference, grouped per file name
        if (referencesPatent == null) {
          referencesPatent = new HashMap<String, ArrayList<String>>();
        }
        ArrayList<String> refss = referencesPatent.get(currentFileName);

        if (refss == null) {
          refss = new ArrayList<String>();
        }

        refss.add(refString);
        referencesPatent.put(currentFileName, refss);
        refFound = true;
        if (patentReferences) {
          nbPatentRef++;
        }
      }

      if (refFound) {
        // we tokenize the text
        // ArrayList<String> tokens = TextUtilities.segment(refString,
        // "[("+TextUtilities.punctuations);
        // StringTokenizer st = new StringTokenizer(refString, delimiters, true);
        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(refString);
        } catch (Exception e) {
          // best effort: on failure the (empty) default list is used and nothing is emitted
          LOGGER.debug("Tokenization for XML patent document has failed.");
        }

        // i counts the non-blank tokens emitted so far; i == 0 selects the "I-" (begin) label
        int i = 0;
        // String token = null;
        // for(String token : tokens) {
        // while (st.hasMoreTokens()) {
        for (String token : tokenizations) {
          // token = st.nextToken().trim();
          if ((token.trim().length() == 0)
              || (token.equals(" "))
              || (token.equals("\t"))
              || (token.equals("\n"))
              || (token.equals("\r"))) {
            continue;
          }
          try {
            accumulatedText.append(token + "\t");
            allContent.append(token + " ");
            if (npl) {
              if (nplReferences) {
                if (i == 0) {
                  // accumulatedText.append("refNPLBegin\n");
                  accumulatedText.append("I-<refNPL>\n");
                } else if (token == null) {
                  // NOTE(review): token was already dereferenced by token.trim() above, so this
                  // branch cannot be taken and the "E-" label is never written.
                  // accumulatedText.append("refNPLEnd\n");
                  accumulatedText.append("E-<refNPL>\n");
                } else {
                  accumulatedText.append("<refNPL>\n");
                }
              } else accumulatedText.append("<other>\n");
            } else {
              if (patentReferences) {
                if (i == 0) accumulatedText.append("I-<refPatent>\n");
                // NOTE(review): same as above - token cannot be null here.
                else if (token == null) accumulatedText.append("E-<refPatent>\n");
                else accumulatedText.append("<refPatent>\n");
              } else accumulatedText.append("<other>\n");
            }
          } catch (Exception e) {
            // e.printStackTrace();
            throw new GrobidException("An exception occured while running Grobid.", e);
          }
          i++;
        }
      }
      ref = false;
    } else if (qName.equals("classification-ipcr")) {
      accumulator.setLength(0);
    } else if (qName.equals("classification-symbol")) {
      accumulator.setLength(0);
    } else if (qName.equals("abstract")) {
      accumulator.setLength(0);
    } else if (qName.equals("heading")) {
      accumulator.append(" ");
    } else if (qName.equals("description")) {
      if (refFound) {
        String content = getText();

        // we tokenize the text
        // ArrayList<String> tokens = TextUtilities.segment(content,
        // "[("+TextUtilities.punctuations);
        // StringTokenizer st = new StringTokenizer(content, delimiters, true);
        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(content);
        } catch (Exception e) {
          // best effort: on failure the (empty) default list is used and nothing is emitted
          LOGGER.debug("Tokenization for XML patent document has failed.");
        }

        // i counts the non-blank tokens seen so far
        int i = 0;
        // String token = null;
        // for(String token : tokens) {
        // while (st.hasMoreTokens()) {
        for (String token : tokenizations) {
          // token = st.nextToken().trim();
          if ((token.trim().length() == 0)
              || (token.equals(" "))
              || (token.equals("\t"))
              || (token.equals("\n"))
              || (token.equals("\r"))) {
            continue;
          }
          // we print only a window of N words
          if ((i > N) && (N != -1)) {
            // outside the window: the token is still written, but labelled <ignore>
            // break;
            token = token.trim();
            if (token.length() > 0) {
              accumulatedText.append(token + "\t" + "<ignore>\n");
              allContent.append(token + " ");
            }
          } else {
            try {
              token = token.trim();
              if (token.length() > 0) {
                accumulatedText.append(token + "\t" + "<other>\n");
                allContent.append(token + " ");
              }
            } catch (Exception e) {
              throw new GrobidException("An exception occured while running Grobid.", e);
            }
          }
          i++;
        }

        accumulator.setLength(0);
        refFound = false;
      }
    } else if (qName.equals("patcit")) {
      // we register the citation, the citation context will be marked in a later stage
      if (citations == null) citations = new ArrayList<String>();
      citations.add(cited_number);
      accumulator.setLength(0);
    } else if (qName.equals("invention-title")) {
      accumulator.setLength(0);
    } else if (qName.equals("applicants")) {
      accumulator.setLength(0);
    } else if (qName.equals("inventors")) {
      accumulator.setLength(0);
    } else if (qName.equals("document-id")) {
      accumulator.setLength(0);
    } else if (qName.equals("legal-status")) {
      accumulator.setLength(0);
    } else if (qName.equals("bibliographic-data")) {
      accumulator.setLength(0);
    } else if (qName.equals("doc-number")) {
      accumulator.setLength(0);
    } else if (qName.equals("country")) {
      accumulator.setLength(0);
    } else if (qName.equals("kind")) {
      accumulator.setLength(0);
    } else if (qName.equals("classification-symbol")) {
      // NOTE(review): unreachable - "classification-symbol" is already handled by an earlier
      // branch of this chain.
      accumulator.setLength(0);
    } else if (qName.equals("classification-ecla")) {
      accumulator.setLength(0);
    } else if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      // end of the document: look up lexicon entries in everything collected so far
      String allString = allContent.toString();
      journalsPositions = lexicon.inJournalNames(allString);
      abbrevJournalsPositions = lexicon.inAbbrevJournalNames(allString);
      conferencesPositions = lexicon.inConferenceNames(allString);
      publishersPositions = lexicon.inPublisherNames(allString);
      // NOTE(review): allContent is null from here on; the append calls above would fail on a
      // later element unless it is re-created elsewhere - confirm.
      allContent = null;
      allString = null;
    } else if (qName.equals("row")) {
      accumulator.append(" ");
    } else if (qName.equals("p")) {
      accumulator.append("\n");
    }
  }
Ejemplo n.º 13
0
  /**
   * SAX start-of-element callback; updates the parser state from the element and its attributes.
   *
   * <ul>
   *   <li>{@code PAGE}: increments {@code currentPage}; the page attributes are not used.
   *   <li>{@code BLOCK}: starts a new {@code Block} with an empty text buffer and a zero token
   *       count, and assigns it the current page.
   *   <li>{@code IMAGE}: adds {@code href} to {@code images} and updates the current
   *       x/y/width/height.
   *   <li>{@code TEXT}: nothing is done; the text attributes are not used.
   *   <li>{@code TOKEN}: updates the current font name, size and colour, the bold/italic/rotation
   *       flags and the current x/y/width/height; a change of font name or font size also appends
   *       a space to {@code blabla}.
   *   <li>{@code xi:include}: adds {@code href} to {@code images}.
   * </ul>
   *
   * <p>Attributes whose name or value is {@code null} are skipped.
   *
   * @param namespaceURI namespace URI (not used)
   * @param localName local name (not used)
   * @param qName qualified element name used for dispatch
   * @param atts attributes of the element
   * @throws SAXException declared for the SAX contract
   * @throws NumberFormatException if a numeric attribute cannot be parsed as a double
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    switch (qName) {
      case "PAGE":
        // id, number, width and height are available on PAGE but none of them is used.
        currentPage++;
        break;

      case "BLOCK":
        block = new Block();
        blabla = new StringBuffer();
        nbTokens = 0;
        block.setPage(currentPage);
        break;

      case "IMAGE": {
        final int attrCount = atts.getLength();
        for (int k = 0; k < attrCount; k++) {
          final String name = atts.getQName(k);
          final String value = atts.getValue(k);
          if (name == null || value == null) {
            continue;
          }
          switch (name) {
            case "href":
              images.add(value);
              break;
            case "x": {
              final double x = Double.parseDouble(value);
              if (x != currentX) {
                currentX = x;
              }
              break;
            }
            case "y": {
              final double y = Double.parseDouble(value);
              if (y != currentY) {
                currentY = y;
              }
              break;
            }
            case "width": {
              final double width = Double.parseDouble(value);
              if (width != currentWidth) {
                currentWidth = width;
              }
              break;
            }
            case "height": {
              final double height = Double.parseDouble(value);
              if (height != currentHeight) {
                currentHeight = height;
              }
              break;
            }
            default:
              break;
          }
        }
        break;
      }

      case "TEXT":
        // id, x, y, width and height are available on TEXT but none of them is used.
        break;

      case "TOKEN": {
        final int attrCount = atts.getLength();
        for (int k = 0; k < attrCount; k++) {
          final String name = atts.getQName(k);
          final String value = atts.getValue(k);
          if (name == null || value == null) {
            continue;
          }
          switch (name) {
            case "font-name":
              if (!value.equals(currentFont)) {
                currentFont = value;
                blabla.append(" ");
              }
              break;
            case "font-size": {
              final double fontSize = Double.parseDouble(value);
              if (fontSize != currentFontSize) {
                currentFontSize = fontSize;
                blabla.append(" ");
              }
              break;
            }
            case "bold":
              currentBold = value.equals("yes");
              break;
            case "italic":
              currentItalic = value.equals("yes");
              break;
            case "font-color":
              if (!value.equals(colorFont)) {
                colorFont = value;
              }
              break;
            case "rotation":
              // anything other than "0" counts as rotated
              currentRotation = !value.equals("0");
              break;
            case "x": {
              final double x = Double.parseDouble(value);
              if (x != currentX) {
                currentX = x;
              }
              break;
            }
            case "y": {
              final double y = Double.parseDouble(value);
              if (y != currentY) {
                currentY = y;
              }
              break;
            }
            case "base":
              // The result is discarded; a malformed number still raises NumberFormatException.
              Double.parseDouble(value);
              break;
            case "width": {
              final double width = Double.parseDouble(value);
              if (width != currentWidth) {
                currentWidth = width;
              }
              break;
            }
            case "height": {
              final double height = Double.parseDouble(value);
              if (height != currentHeight) {
                currentHeight = height;
              }
              break;
            }
            default:
              // "id" and unknown attributes are ignored
              break;
          }
        }
        break;
      }

      case "xi:include": {
        final int attrCount = atts.getLength();
        for (int k = 0; k < attrCount; k++) {
          final String name = atts.getQName(k);
          final String value = atts.getValue(k);
          if (name != null && value != null && name.equals("href")) {
            images.add(value);
          }
        }
        break;
      }

      default:
        break;
    }
    // accumulator.setLength(0);
  }
Ejemplo n.º 14
0
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") &&
    // !qName.equals("TEXT"))
    // System.out.println(qName);

    if (qName.equals("TEXT")) {
      blabla.append("\n");
      LayoutToken token = new LayoutToken();
      token.setText("\n");
      block.addToken(token);
      accumulator.setLength(0);
      tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
      accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
      String tok0 = TextUtilities.clean(getText());

      if (block.getStartToken() == -1) {
        block.setStartToken(tokenizations.size());
      }

      if (tok0.length() > 0) {
        StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
        boolean diaresis = false;
        boolean accent = false;
        boolean keepLast = false;
        while (st.hasMoreTokens()) {

          diaresis = false;
          accent = false;
          keepLast = false;

          String tok = st.nextToken();
          if (tok.length() > 0) {

            LayoutToken token = new LayoutToken();

            if ((previousToken != null)
                && (tok != null)
                && (previousToken.length() > 0)
                && (tok.length() > 0)
                && blabla.length() > 0) {

              Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
              Character rightChar = tok.charAt(0);

              ModifierClass leftClass = classifyChar(leftChar);
              ModifierClass rightClass = classifyChar(rightChar);
              ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;

              if (leftClass != ModifierClass.NOT_A_MODIFIER
                  || rightClass != ModifierClass.NOT_A_MODIFIER) {
                Character baseChar = null;
                Character modifierChar = null;

                if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                  if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                    // assert false;
                    // keeping characters, but setting class
                    // to not a modifier
                    baseChar = leftChar;
                    modifierChar = rightChar;
                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                  } else {
                    baseChar = rightChar;
                    modifierChar = leftChar;
                    modifierClass = leftClass;
                  }
                } else {
                  baseChar = leftChar;
                  modifierChar = rightChar;
                  modifierClass = rightClass;
                }

                String updatedChar = modifyCharacter(baseChar, modifierChar);

                tokenizations.remove(tokenizations.size() - 1);
                if (tokenizations.size() > 0) {
                  tokenizations.remove(tokenizations.size() - 1);
                }

                blabla.deleteCharAt(blabla.length() - 1);
                if (blabla.length() > 0) {
                  blabla.deleteCharAt(blabla.length() - 1);
                }

                removeLastCharacterIfPresent(previousTok);

                if (updatedChar != null) {
                  blabla.append(updatedChar);
                  previousTok.setText(previousTok.getText() + updatedChar);
                }

                blabla.append(tok.substring(1, tok.length()));
                previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                tokenizations.add(previousTok.getText());

                diaresis =
                    (modifierClass == ModifierClass.DIAERESIS
                        || modifierClass == ModifierClass.NORDIC_RING
                        || modifierClass == ModifierClass.CZECH_CARON
                        || modifierClass == ModifierClass.TILDE
                        || modifierClass == ModifierClass.CEDILLA);

                accent =
                    (modifierClass == ModifierClass.ACUTE_ACCENT
                        || modifierClass == ModifierClass.CIRCUMFLEX
                        || modifierClass == ModifierClass.GRAVE_ACCENT);

                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                  tok = ""; // resetting current token as it
                  // is a single-item
                }
              }
            }

            if (tok != null) {
              // actually in certain cases, the extracted string under token can be a chunk of text
              // with separators that need to be preserved
              // tok = tok.replace(" ", "");
            }

            if ((!diaresis) && (!accent)) {
              // blabla.append(" ");
              blabla.append(tok);
              token.setText(tok);

              tokenizations.add(tok);
            } else {
              tok = "";
              keepLast = true;
            }

            /*
             * StringTokenizer st0 = new StringTokenizer(tok0,
             * TextUtilities.fullPunctuations, true);
             * while(st0.hasMoreTokens()) { String tok =
             * st0.nextToken(); tokenizations.add(tok); }
             * tokenizations.add(" ");
             */

            /*
             * boolean punct1 = false; boolean punct2 = false;
             * boolean punct3 = false; String content = null; int i
             * = 0; for(; i<TextUtilities.punctuations.length();
             * i++) { if (tok.length() > 0) { if
             * (tok.charAt(tok.length()-1) ==
             * TextUtilities.punctuations.charAt(i)) { punct1 =
             * true; content = tok.substring(0, tok.length()-1); if
             * (tok.length() > 1) { int j = 0; for(;
             * j<TextUtilities.punctuations.length(); j++) { if
             * (tok.charAt(tok.length()-2) ==
             * TextUtilities.punctuations.charAt(j)) { punct3 =
             * true; content = tok.substring(0, tok.length()-2); } }
             * } break; } } } if (tok.length() > 0) { if (
             * (tok.startsWith("(")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("("); } else if (
             * (tok.startsWith("[")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("["); } else if (
             * (tok.startsWith("\"")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("\""); } }
             */
            if (currentRotation) currentFontSize = currentFontSize / 2;

            /*
             * if (punct2) { if (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * token = new LayoutToken(); token.setText(content); }
             * if (punct1) { token.setText(content); if (currentFont
             * != null) token.setFont(currentFont.toLowerCase());
             * else token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * if (punct3) { token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-2)); if
             * (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token); }
             *
             * token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-1)); }
             */
            if (currentFont != null) token.setFont(currentFont.toLowerCase());
            else token.setFont("default");
            token.setItalic(currentItalic);
            token.setBold(currentBold);
            token.setRotation(currentRotation);
            token.setColorFont(colorFont);
            token.setX(currentX);
            token.setY(currentY);
            token.setWidth(currentWidth);
            token.setHeight(currentHeight);
            token.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              block.addToken(token);
            }

            if (block.getFont() == null) {
              if (currentFont != null) block.setFont(currentFont.toLowerCase());
              else token.setFont("default");
            }
            if (nbTokens == 0) {
              block.setItalic(currentItalic);
              block.setBold(currentBold);
            }
            if (block.getColorFont() == null) block.setColorFont(colorFont);
            if (block.getX() == 0.0) block.setX(currentX);
            if (block.getY() == 0.0) block.setY(currentY);
            if (block.getWidth() == 0.0) block.setWidth(currentWidth);
            if (block.getHeight() == 0.0) block.setHeight(currentHeight);
            if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              previousToken = tok;
              previousTok = token;
            } else {
              previousToken = previousTok.getText();
            }

            nbTokens++;
            accumulator.setLength(0);
          }
        }
        if (tokenizations.size() > 0) {
          String justBefore = tokenizations.get(tokenizations.size() - 1);
          if (!justBefore.endsWith("-")) {
            tokenizations.add(" ");
            blabla.append(" ");
          }
        }
      }
      block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
      // page markers are useful for detecting headers (same first line(s)
      // appearing on each page)
      if (block != null) {
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      Block block0 = new Block();
      block0.setText("@PAGE\n");
      block0.setNbTokens(0);
      block0.setPage(currentPage);
      doc.addBlock(block0);
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      nbTokens = 0;
      // blabla.append("\n@block\n");
      tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      if (block.getX() == 0.0) block.setX(currentX);
      if (block.getY() == 0.0) block.setY(currentY);
      if (block.getWidth() == 0.0) block.setWidth(currentWidth);
      if (block.getHeight() == 0.0) block.setHeight(currentHeight);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }
    /*
     * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) {
     * blabla.append("\n"); block.setText(blabla.toString());
     * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new
     * Block(); block.setPage(currentPage); blabla = new StringBuffer();
     * blabla.append("@IMAGE " + "vectorial \n");
     * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if
     * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0)
     * block.setY(currentY); if (block.getWidth() == 0.0)
     * block.setWidth(currentWidth); if (block.getHeight() == 0.0)
     * block.setHeight(currentHeight); doc.addBlock(block); blabla = new
     * StringBuffer(); nbTokens = 0; block = new Block();
     * block.setPage(currentPage); }
     */
    else if (qName.equals("BLOCK")) {
      blabla.append("\n");
      tokenizations.add("\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);

      block.setWidth(currentX - block.getX() + currentWidth);
      block.setHeight(currentY - block.getY() + currentHeight);

      doc.addBlock(block);
      // blabla = new StringBuffer();
      nbTokens = 0;
      block = null;
    } else if (qName.equals("xi:include")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }

    /*
     * else if (qName.equals("DOCUMENT")) {
     * System.out.println(blabla.toString()); }
     */

  }