// Example n. 1
  /**
   * Create the set of training and evaluation data from the annotated examples with extraction of
   * citations in the patent description body. The generated data cover both patent and NPL
   * references (the "all" model).
   *
   * <p>Depending on the arguments, the result is written under {@code outputPath} as
   * {@code all.train} (plain training data), {@code all.<setName>}, or, for n-fold data
   * generation, {@code <setName>ing<rank>/all.<setName>}.
   *
   * @param setName name of the data set, null or empty for plain training data
   * @param rank rank associated to the set for n-fold data generation, may be null
   * @param corpusPath path of the directory containing the annotated patent XML files
   * @param outputPath path of the directory where the generated data set is written
   * @param type type of data to be created, 0 is training data, 1 is evaluation data
   */
  public void createDataSet(
      String setName, String rank, String corpusPath, String outputPath, int type) {
    int nbFiles = 0;
    int nbNPLRef = 0;
    int nbPatentRef = 0;
    int maxRef = 0;
    try {
      // Patent + NPL REF. textual data (the "all" model)
      // we use a SAX parser on the patent XML files
      MarecSaxParser sax = new MarecSaxParser();
      sax.patentReferences = true;
      sax.nplReferences = true;
      int srCitations = 0;
      int previousSrCitations = 0;
      int withSR = 0;

      List<OffsetPosition> journalsPositions = null;
      List<OffsetPosition> abbrevJournalsPositions = null;
      List<OffsetPosition> conferencesPositions = null;
      List<OffsetPosition> publishersPositions = null;

      if (type == 0) {
        // training set
        sax.setN(trainWindow);
      } else {
        // for the test set we enlarge the focus window to include all the document.
        sax.setN(-1);
      }
      // get a non-validating SAX parser factory
      SAXParserFactory spf = SAXParserFactory.newInstance();
      spf.setValidating(false);
      spf.setFeature("http://xml.org/sax/features/namespaces", false);
      spf.setFeature("http://xml.org/sax/features/validation", false);

      LinkedList<File> fileList = new LinkedList<File>();
      if ((setName == null) || (rank == null)) {
        fileList.add(new File(corpusPath));
      } else {
        // n-fold evaluation: the input files are under <setName>ing<rank>/
        fileList.add(
            new File(corpusPath + File.separator + setName + "ing" + rank + File.separator));
      }

      Writer writer = null;
      if ((setName == null) || (setName.length() == 0)) {
        writer =
            new OutputStreamWriter(
                new FileOutputStream(new File(outputPath + File.separator + "all.train"), false),
                "UTF-8");
      } else if (rank == null) {
        writer =
            new OutputStreamWriter(
                new FileOutputStream(
                    new File(outputPath + File.separator + "all." + setName), false),
                "UTF-8");
      } else {
        writer =
            new OutputStreamWriter(
                new FileOutputStream(
                    new File(
                        outputPath
                            + File.separator
                            + setName
                            + "ing"
                            + rank
                            + File.separator
                            + "all."
                            + setName),
                    false),
                "UTF-8");
      }
      try {
        // process recursively all the XML files under the selected path(s)
        while (fileList.size() > 0) {
          File file = fileList.removeFirst();
          if (file.isDirectory()) {
            for (File subFile : file.listFiles()) {
              fileList.addLast(subFile);
            }
          } else if (file.getName().endsWith(".xml")) {
            nbFiles++;
            try {
              // get a new instance of parser
              SAXParser p = spf.newSAXParser();
              FileInputStream in = new FileInputStream(file);
              try {
                sax.setFileName(file.toString());
                p.parse(in, sax);
              } finally {
                // the input stream was previously leaked; always release it
                in.close();
              }
              nbNPLRef += sax.getNbNPLRef();
              nbPatentRef += sax.getNbPatentRef();
              if (sax.nbAllRef > maxRef) {
                maxRef = sax.nbAllRef;
              }
              if (sax.citations != null) {
                if (sax.citations.size() > previousSrCitations) {
                  previousSrCitations = sax.citations.size();
                  withSR++;
                }
              }
              journalsPositions = sax.journalsPositions;
              abbrevJournalsPositions = sax.abbrevJournalsPositions;
              conferencesPositions = sax.conferencesPositions;
              publishersPositions = sax.publishersPositions;

              if (sax.accumulatedText != null) {
                String text = sax.accumulatedText.toString();
                // add features for patent+NPL
                addFeatures(
                    text,
                    writer,
                    journalsPositions,
                    abbrevJournalsPositions,
                    conferencesPositions,
                    publishersPositions);
                writer.write("\n");
              }
            } catch (Exception e) {
              throw new GrobidException("An exception occured while running Grobid.", e);
            }
          }
        }
      } finally {
        // the writer was previously never closed: flush and release it so the
        // generated data set is fully written to disk
        writer.close();
      }

      if (sax.citations != null) {
        srCitations += sax.citations.size();
      }
      if (setName != null) {
        System.out.println(setName + "ing on " + nbFiles + " files");
      } else {
        System.out.println("training on " + nbFiles + " files");
      }
      // System.out.println("Number of file with search report: " + withSR);
      System.out.println("Number of references: " + (nbNPLRef + nbPatentRef));
      System.out.println("Number of patent references: " + nbPatentRef);
      System.out.println("Number of NPL references: " + nbNPLRef);
      // System.out.println("Number of search report citations: " + srCitations);
      System.out.println(
          "Average number of references: "
              + TextUtilities.formatTwoDecimals((double) (nbNPLRef + nbPatentRef) / nbFiles));
      System.out.println("Max number of references in file: " + maxRef);

      if ((setName == null) || (setName.length() == 0)) {
        System.out.println("common data set under: " + outputPath + "/all.train");
      } else {
        System.out.println("common data set under: " + outputPath + "/all." + setName);
      }
    } catch (Exception e) {
      throw new GrobidException("An exception occurred while running Grobid.", e);
    }
  }
// Example n. 2
  /**
   * Extract results from a labelled full text in the training format without any string
   * modification.
   *
   * @param result the labelled result produced by the tagger, one token per line, the label being
   *     the last column of each line
   * @param tokenizations the original tokenization of the text, used to restore the spacing
   *     between tokens
   * @return the training data with TEI-like markup as a StringBuffer
   */
  private StringBuffer trainingExtraction(String result, ArrayList<String> tokenizations) {
    // this is the main buffer for the whole full text
    StringBuffer buffer = new StringBuffer();
    try {
      StringTokenizer st = new StringTokenizer(result, "\n");
      String s1 = null; // current label/tag (last column of the line)
      String s2 = null; // current lexical token (first column of the line)
      String lastTag = null;

      // current token position
      int p = 0;
      boolean start = true;
      // state flags for the <figure> environment currently being written
      boolean openFigure = false;
      boolean headFigure = false;
      boolean descFigure = false;
      boolean tableBlock = false;

      // process the labelled result line by line, one token per line
      while (st.hasMoreTokens()) {
        boolean addSpace = false;
        String tok = st.nextToken().trim();

        if (tok.length() == 0) {
          continue;
        }
        // columns: token, features..., label
        StringTokenizer stt = new StringTokenizer(tok, " \t");
        ArrayList<String> localFeatures = new ArrayList<String>();
        int i = 0;

        boolean newLine = false;
        int ll = stt.countTokens();
        while (stt.hasMoreTokens()) {
          String s = stt.nextToken().trim();
          if (i == 0) {
            s2 = TextUtilities.HTMLEncode(s); // lexical token

            // realign with the original tokenization to detect whether a space
            // preceded this token in the source text
            boolean strop = false;
            while ((!strop) && (p < tokenizations.size())) {
              String tokOriginal = tokenizations.get(p);
              if (tokOriginal.equals(" ")) {
                addSpace = true;
              } else if (tokOriginal.equals(s)) {
                strop = true;
              }
              p++;
            }
          } else if (i == ll - 1) {
            s1 = s; // current tag
          } else {
            // middle columns are features; LINESTART marks a layout line break
            if (s.equals("LINESTART")) newLine = true;
            localFeatures.add(s);
          }
          i++;
        }

        if (newLine && !start) {
          buffer.append("<lb/>");
        }

        // strip the "I-" (beginning-of-entity) prefix to get the plain label
        String lastTag0 = null;
        if (lastTag != null) {
          if (lastTag.startsWith("I-")) {
            lastTag0 = lastTag.substring(2, lastTag.length());
          } else {
            lastTag0 = lastTag;
          }
        }
        String currentTag0 = null;
        if (s1 != null) {
          if (s1.startsWith("I-")) {
            currentTag0 = s1.substring(2, s1.length());
          } else {
            currentTag0 = s1;
          }
        }

        boolean closeParagraph = false;
        if (lastTag != null) {
          closeParagraph = testClosingTag(buffer, currentTag0, lastTag0, s1);
        }

        boolean output = false;

        // when leaving figure/table-related labels, close any open <figure> environment
        // NOTE(review): currentTag0 is assumed non-null here (i.e. each labelled line has at
        // least two columns); a single-column line would cause an NPE — confirm input format
        if (!currentTag0.equals("<table>")
            && !currentTag0.equals("<trash>")
            && !currentTag0.equals("<figure_head>")
            && !currentTag0.equals("<label>")) {
          if (openFigure) {
            buffer.append("\n\t\t\t</figure>\n\n");
          }
          openFigure = false;
          headFigure = false;
          descFigure = false;
          tableBlock = false;
        }

        // try each known label in turn until one of the writeField calls emits the token
        output = writeField(buffer, s1, lastTag0, s2, "<header>", "<front>", addSpace, 3);
        if (!output) {
          output =
              writeField(buffer, s1, lastTag0, s2, "<other>", "<note type=\"other\">", addSpace, 3);
        }
        // for paragraph we must distinguish starting and closing tags
        if (!output) {
          if (closeParagraph) {
            output = writeFieldBeginEnd(buffer, s1, "", s2, "<paragraph>", "<p>", addSpace, 3);
          } else {
            output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<paragraph>", "<p>", addSpace, 3);
          }
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<page_header>",
                  "<note place=\"headnote\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<page_footnote>",
                  "<note place=\"footnote\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<page>", "<page>", addSpace, 3);
        }
        if (!output) {
          output =
              writeFieldBeginEnd(buffer, s1, lastTag0, s2, "<reference>", "<bibl>", addSpace, 3);
        }
        if (!output) {
          if (closeParagraph) {
            output = writeField(buffer, s1, "", s2, "<reference_marker>", "<label>", addSpace, 3);
          } else
            output =
                writeField(buffer, s1, lastTag0, s2, "<reference_marker>", "<label>", addSpace, 3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<citation_marker>",
                  "<ref type=\"biblio\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<section>", "<head>", addSpace, 3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<subsection>", "<head>", addSpace, 3);
        }
        if (!output) {
          if (openFigure) {
            output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 4);
          } else {
            // output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<figure>\n\t\t\t\t<trash>",
            output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 3);
            if (output) {
              openFigure = true;
            }
          }
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<equation>", "<formula>", addSpace, 3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<figure_marker>",
                  "<ref type=\"figure\">",
                  addSpace,
                  3);
        }
        // <table>: open a new figure environment or continue the current one
        if (!output) {
          if (openFigure) {
            if (tableBlock && (!lastTag0.equals("<table>")) && (currentTag0.equals("<table>"))) {
              // a second table starts inside an open figure: close it and open a fresh one
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<figure>\n\t\t\t\t<table>",
                      "<figure>",
                      addSpace,
                      3);
              if (output) {
                tableBlock = true;
                descFigure = false;
                headFigure = false;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<table>", "<table>", addSpace, 4);
              if (output) {
                tableBlock = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer, s1, lastTag0, s2, "<table>", "<figure>\n\t\t\t\t<table>", addSpace, 3);
            if (output) {
              openFigure = true;
              tableBlock = true;
            }
          }
        }
        // <label>: figure description, same open/continue pattern as <table>
        if (!output) {
          if (openFigure) {
            if (descFigure && (!lastTag0.equals("<label>")) && (currentTag0.equals("<label>"))) {
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<label>",
                      "<figure>\n\t\t\t\t<figDesc>",
                      addSpace,
                      3);
              if (output) {
                descFigure = true;
                tableBlock = false;
                headFigure = false;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<label>", "<figDesc>", addSpace, 4);
              if (output) {
                descFigure = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer,
                    s1,
                    lastTag0,
                    s2,
                    "<label>",
                    "<figure>\n\t\t\t\t<figDesc>",
                    addSpace,
                    3);
            if (output) {
              openFigure = true;
              descFigure = true;
            }
          }
        }
        // <figure_head>: figure heading, same open/continue pattern
        if (!output) {
          if (openFigure) {
            if (headFigure
                && (!lastTag0.equals("<figure_head>"))
                && (currentTag0.equals("<figure_head>"))) {
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<figure_head>",
                      "<figure>\n\t\t\t\t<head>",
                      addSpace,
                      3);
              if (output) {
                descFigure = false;
                tableBlock = false;
                headFigure = true;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<figure_head>", "<head>", addSpace, 4);
              if (output) {
                headFigure = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer,
                    s1,
                    lastTag0,
                    s2,
                    "<figure_head>",
                    "<figure>\n\t\t\t\t<head>",
                    addSpace,
                    3);
            if (output) {
              openFigure = true;
              headFigure = true;
            }
          }
        }
        // for item we must distinguish starting and closing tags
        if (!output) {
          output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<item>", "<item>", addSpace, 3);
        }

        lastTag = s1;

        // end of input: emit the pending closing tag(s)
        if (!st.hasMoreTokens()) {
          if (lastTag != null) {
            // NOTE(review): currentTag0 is passed in the lastTag0 position here, unlike the
            // earlier testClosingTag call — confirm this argument order is intended
            testClosingTag(buffer, "", currentTag0, s1);
          }
          if (openFigure) {
            buffer.append("\n\t\t\t</figure>\n\n");
          }
        }
        if (start) {
          start = false;
        }
      }

      return buffer;
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    }
  }
  /**
   * Map a labelled (tagged) result back onto the document layout: for each labelled segment, a
   * {@code DocumentPiece} delimited by two {@code DocumentPointer}s is registered in the document
   * under its plain label via {@code doc.setLabeledBlocks}.
   *
   * @param doc the document whose labeled blocks are set
   * @param labeledResult the labelled result produced by the tagger, one token per line
   * @param documentTokens the layout tokens of the whole document
   * @return the same document instance, with its labeled blocks updated
   */
  public static Document generalResultSegmentation(
      Document doc, String labeledResult, List<LayoutToken> documentTokens) {
    List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);

    SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create();
    doc.setLabeledBlocks(labeledBlocks);

    /*try {
          	FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult);
    	FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString());
    }
    catch(Exception e) {
    	e.printStackTrace();
    }*/

    List<Block> docBlocks = doc.getBlocks();
    int indexLine = 0;
    int blockIndex = 0;
    int p = 0; // position in the labeled result
    int currentLineEndPos = 0; // position in the global doc. tokenization of the last
    // token of the current line
    int currentLineStartPos = 0; // position in the global doc.
    // tokenization of the first token of the current line
    String line = null;

    DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
    DocumentPointer currentPointer = null;
    DocumentPointer lastPointer = null;

    String curLabel;
    String curPlainLabel = null;
    String lastPlainLabel = null;

    // index of the last real token of the document: the last block with a valid end token
    int lastTokenInd = -1;
    for (int i = docBlocks.size() - 1; i >= 0; i--) {
      int endToken = docBlocks.get(i).getEndToken();
      if (endToken != -1) {
        lastTokenInd = endToken;
        break;
      }
    }

    // we do this concatenation trick so that we don't have to process stuff after the main loop
    // no copying of lists happens because of this, so it's ok to concatenate
    String ignoredLabel = "@IGNORED_LABEL@";
    for (Pair<String, String> labeledTokenPair :
        Iterables.concat(
            labeledTokens,
            Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) {
      if (labeledTokenPair == null) {
        p++;
        continue;
      }

      // as we process the document segmentation line by line, we don't use the usual
      // tokenization to rebuild the text flow, but we get each line again from the
      // text stored in the document blocks (similarly as when generating the features)
      line = null;
      while ((line == null) && (blockIndex < docBlocks.size())) {
        Block block = docBlocks.get(blockIndex);
        List<LayoutToken> tokens = block.getTokens();
        String localText = block.getText();
        if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) {
          // empty block: skip to the next one and restart at its first token
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        }
        String[] lines = localText.split("[\\n\\r]");
        if ((lines.length == 0) || (indexLine >= lines.length)) {
          // all lines of this block consumed: move to the next block
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        } else {
          line = lines[indexLine];
          indexLine++;
          if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) {
            line = null;
            continue;
          }

          if (currentLineStartPos > lastTokenInd) continue;

          // adjust the start token position in documentTokens to this non trivial line
          // first skip possible space characters and tabs at the beginning of the line
          while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                  || documentTokens.get(currentLineStartPos).t().equals("\t"))
              && (currentLineStartPos != lastTokenInd)) {
            currentLineStartPos++;
          }
          if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) {
            // the labelled token does not match the current position: scan forward to the
            // start of a following line whose first token does match
            while (currentLineStartPos < block.getEndToken()) {
              if (documentTokens.get(currentLineStartPos).t().equals("\n")
                  || documentTokens.get(currentLineStartPos).t().equals("\r")) {
                // move to the start of the next line, but ignore space characters and tabs
                currentLineStartPos++;
                while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                        || documentTokens.get(currentLineStartPos).t().equals("\t"))
                    && (currentLineStartPos != lastTokenInd)) {
                  currentLineStartPos++;
                }
                if ((currentLineStartPos != lastTokenInd)
                    && labeledTokenPair.a.startsWith(
                        documentTokens.get(currentLineStartPos).getText())) {
                  break;
                }
              }
              currentLineStartPos++;
            }
          }

          // what is then the position of the last token of this line?
          currentLineEndPos = currentLineStartPos;
          while (currentLineEndPos < block.getEndToken()) {
            if (documentTokens.get(currentLineEndPos).t().equals("\n")
                || documentTokens.get(currentLineEndPos).t().equals("\r")) {
              currentLineEndPos--;
              break;
            }
            currentLineEndPos++;
          }
        }
      }
      curLabel = labeledTokenPair.b;
      curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel);

      /*System.out.println("-------------------------------");
      System.out.println("block: " + blockIndex);
      System.out.println("line: " + line);
      System.out.println("token: " + labeledTokenPair.a);
      System.out.println("curPlainLabel: " + curPlainLabel);
      System.out.println("lastPlainLabel: " + lastPlainLabel);
      if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1))
      	System.out.println("currentLineStartPos: " + currentLineStartPos +
      								" (" + documentTokens.get(currentLineStartPos) + ")");
      if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1))
      	System.out.println("currentLineEndPos: " + currentLineEndPos +
      								" (" + documentTokens.get(currentLineEndPos) + ")");*/

      // all blocks consumed: stop before registering a pointer past the document end
      if (blockIndex == docBlocks.size()) {
        break;
      }

      currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos);

      // either a new entity starts or a new beginning of the same type of entity
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
        }
        pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos);
        // System.out.println("add segment for: " + lastPlainLabel + ", until " +
        // (currentLineStartPos-2));
      }

      // updating stuff for next iteration
      lastPlainLabel = curPlainLabel;
      lastPointer = currentPointer;
      currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line
      p++;
    }

    // flush the last labelled piece when the loop exited via the break above
    // (curPlainLabel is non-null here: the loop always runs at least once thanks to the
    // appended @IGNORED_LABEL@ sentinel token)
    if (blockIndex == docBlocks.size()) {
      // the last labelled piece has still to be added
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
          // System.out.println("add segment for: " + lastPlainLabel + ", until " +
          // (currentLineStartPos-2));
        }
      }
    }

    return doc;
  }
  /**
   * First pass to detect basic structures: remove page header/footer, identify section numbering,
   * identify Figure and table blocks.
   *
   * <p>-> to be removed at some point!
   *
   * @param doc a document
   */
  public static void firstPass(Document doc) {
    // Fail fast: every heuristic below iterates over the document's block list.
    if (doc == null) {
      throw new NullPointerException();
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException();
    }

    int i = 0;
    // Candidate block indices for each structural role. They are filled by the
    // heuristics below and committed to the document in the finally clause, so
    // partial results survive an unexpected exception.
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();
    List<Integer> acknowledgementBlocks = new ArrayList<Integer>();
    List<Integer> blockTables = new ArrayList<Integer>();
    List<Integer> blockFigures = new ArrayList<Integer>();
    List<Integer> blockHeadTables = new ArrayList<Integer>();
    List<Integer> blockHeadFigures = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();

    doc.setTitleMatchNum(false);
    try {
      // Pass 1: spot section-title blocks (via intro/references patterns) and use
      // @PAGE markers to collect page header/footer candidates around page breaks.
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText);
        Matcher ma2 = BasicStructureBuilder.references.matcher(localText);

        if ((ma1.find()) || (ma2.find())) {
          // A numbered "1." / "2." / "Contents" prefix suggests the document uses
          // numbered section titles; remember that for later title matching.
          if (((localText.startsWith("1.")) || (localText.startsWith("1 ")))
              || ((localText.startsWith("2.")) || (localText.startsWith("2 ")))
              || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true);
          // System.out.println("Title section identified: block " + i + ", " + localText);
          blockSectionTitles.add(i);
        } else {
          StringTokenizer st = new StringTokenizer(localText, "\n");
          while (st.hasMoreTokens()) {
            String token = st.nextToken();

            if (token.startsWith("@PAGE")) {
              // The current block marks a page break: short blocks (< 20 tokens)
              // just before it are footer candidates for the ending page.
              if (i > 4) {
                if (doc.getBlocks().get(i - 5).getNbTokens() < 20) {
                  Integer i2 = i - 5;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 3) {
                if (doc.getBlocks().get(i - 4).getNbTokens() < 20) {
                  Integer i2 = i - 4;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 2) {
                if (doc.getBlocks().get(i - 3).getNbTokens() < 20) {
                  Integer i2 = i - 3;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 1) {
                if (doc.getBlocks().get(i - 2).getNbTokens() < 20) {
                  Integer i2 = i - 2;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 0) {
                if (doc.getBlocks().get(i - 1).getNbTokens() < 20) {
                  Integer i2 = i - 1;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              blockFooters.add(i);

              // Short blocks just after the page break are header candidates for
              // the starting page.
              blockHeaders.add(i);
              if (i < doc.getBlocks().size() - 1) {
                if (doc.getBlocks().get(i + 1).getNbTokens() < 20) {
                  Integer i2 = i + 1;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1);
                }
              }
              if (i < doc.getBlocks().size() - 2) {
                if (doc.getBlocks().get(i + 2).getNbTokens() < 20) {
                  Integer i2 = i + 2;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2);
                }
              }
              if (i < doc.getBlocks().size() - 3) {
                if (doc.getBlocks().get(i + 3).getNbTokens() < 20) {
                  Integer i2 = i + 3;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3);
                }
              }
              if (i < doc.getBlocks().size() - 4) {
                if (doc.getBlocks().get(i + 4).getNbTokens() < 20) {
                  Integer i2 = i + 4;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4);
                }
              }
              // more ??
            }
          }
        }

        // clustering of blocks per font (for section header and figure/table detections)
        addBlockToCluster(i, doc);

        i++;
      }

      // Pass 2: find the font cluster most likely holding section titles — a small
      // cluster (rare font, < 1/5 of blocks and < 20 blocks) that already contains
      // at least one block flagged as a section title above.
      Cluster candidateCluster = null;
      // System.out.println("nb clusters: " + clusters.size());
      for (Cluster cluster : doc.getClusters()) {
        if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5))
            && (cluster.getNbBlocks() < 20)) {
          List<Integer> blo = cluster.getBlocks2();
          for (Integer b : blo) {
            if (blockSectionTitles.contains(b)) {
              if (candidateCluster == null) {
                candidateCluster = cluster;
                break;
              }
              // else if (cluster.getFontSize() >= candidateCluster.getFontSize())
              //	candidateCluster = cluster;
            }
          }
        }
      }
      if (candidateCluster != null) {
        // Merge pattern-detected titles with the candidate cluster's blocks,
        // de-duplicating along the way.
        List<Integer> newBlockSectionTitles = new ArrayList<Integer>();
        for (Integer bl : blockSectionTitles) {
          if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
        }

        List<Integer> blockClusterTitles = candidateCluster.getBlocks2();
        if (blockClusterTitles.size() < 20) {
          for (Integer bl : blockClusterTitles) {
            if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
          }
        }

        blockSectionTitles = newBlockSectionTitles;
      }

      // Pass 3: acknowledgement section recognition — collect blocks from the
      // acknowledgement title until the next section title or the references.
      boolean ackn = false;
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // System.out.println(i + ": " + localText+"\n");

        Integer iii = i;
        Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText);
        if ((m3.find()) && (blockSectionTitles.contains(iii))) {
          acknowledgementBlocks.add(iii);
          ackn = true;
          // int index = blockSectionTitles.indexOf(iii);
          // blockSectionTitles.remove(index);
        } else if ((ackn) && (blockSectionTitles.contains(iii))) {
          // A new section title ends the acknowledgement section.
          ackn = false;
          break;
        } else if (ackn) {
          Matcher m4 = BasicStructureBuilder.references.matcher(localText);
          if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) {
            acknowledgementBlocks.add(iii);
          } else if (m4.find()) {
            // Reaching the references also ends the acknowledgement section.
            ackn = false;
            break;
          }
        }
        i++;
      }

      // Pass 4: remove the first "References" heading from the section titles
      // (only the first match is dropped).
      int index = -1;
      for (Integer ii : blockSectionTitles) {
        Block block = doc.getBlocks().get(ii);
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();
        Matcher m4 = BasicStructureBuilder.references.matcher(localText);
        if (m4.find()) {
          index = blockSectionTitles.indexOf(ii);
          break;
        }
      }
      if (index != -1) {
        blockSectionTitles.remove(index);
      }

      // Pass 5: keep a header candidate only if a near-duplicate appears on another
      // page (Levenshtein distance < 25% after masking digits, e.g. page numbers).
      ArrayList<Integer> toRemove = new ArrayList<Integer>();
      for (Integer ii : blockHeaders) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        // Overly long candidates (> 160 chars) are body text, not headers.
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("header candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockHeaders) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                // System.out.println("dist with " + localText2 + " : " + dist);
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockHeaders.remove(ii);
      }

      // Same repetition check for footer candidates.
      toRemove = new ArrayList<Integer>();
      for (Integer ii : blockFooters) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("footer candidate: " + localText);
          // evaluate distance with other potential footers
          boolean valid = false;
          for (Integer ii2 : blockFooters) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockFooters.remove(ii);
      }

      // Pass 6: special handling for banners added by repositories/publishers
      // (HAL, ACM, IOP); each match stops the scan.
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // HAL banner: only accepted near its known vertical position on the page.
        if (localText.startsWith("Author manuscript, published in")) {
          Double y = block.getY();
          // System.out.println("HAL banner found, " + "block " + i + ", y = " + y);
          if (Math.abs(y - 12.538) < 2) { // reference position
            // blockHeaders.add(new Integer(i));
            blockDocumentHeaders.add(i);
            // System.out.println("HAL banner added as header block");
            break;
          }
        }

        // ACM copyright notice treated as a footer.
        // System.out.println("test ACM " + i);
        // System.out.println(localText);
        if (localText.startsWith("Permission to make digital or hard copies")) {
          blockFooters.add(i);
          break;
        }

        // arXiv, etc. put here
        // IOP confidential banner treated as a document header.

        if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) {
          blockDocumentHeaders.add(i);
          // System.out.println("IOP banner added as header block");
          break;
        }
        i++;
      }

      // Pass 7: recognize table and figure blocks. The textual elements around a
      // table/figure caption marker are usually short, oddly-placed blocks, so we
      // spread the table/figure label to small neighbouring blocks (up to 15 in
      // each direction). Two branches, one for tables and one for figures.
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher m = BasicStructureBuilder.figure.matcher(localText);
        Matcher m2 = BasicStructureBuilder.table.matcher(localText);

        double width = block.getWidth();
        boolean bold = block.getBold();

        // table caption: a "Table ..." match that is bold or short enough
        // if ( (m2.find()) && (localText.length() < 200) ) {
        if ((m2.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadTables.contains(i)) {
            blockHeadTables.add(i);
          }
          // spread to the small blocks before the marker; a long block ends the scan
          int j = i - 1;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = 0; // long block: force loop exit
            }
            j--;
          }

          // spread to the small blocks after the marker, same stopping rule
          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = doc.getBlocks().size(); // long block: force loop exit
            }
            j++;
          }
        }
        // figure caption: same idea as tables, plus @IMAGE marker absorption
        // else if ( (m.find()) && (localText.length() < 200) ) {
        else if ((m.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i);
          // spread to the small blocks before the marker
          int j = i - 1;
          boolean imageFound = false;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);

            if (b.getText() != null) {
              String localText2 = b.getText().trim();
              // localText = localText.replace("\n", " ");
              localText2 = localText2.replace("  ", " ");
              localText2 = localText2.trim();

              // Attach the first preceding @IMAGE reference to the caption block.
              if ((localText2.startsWith("@IMAGE")) && (!imageFound)) {
                // System.out.println(localText2);
                block.setText(block.getText() + " " + localText2);
                // System.out.println(block.getText());
                imageFound = true;
              }

              if ((localText2.length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = 0; // long block: force loop exit
            }
            j--;
          }

          // spread to the small blocks after the marker
          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().trim().length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = doc.getBlocks().size(); // long block: force loop exit
            }
            j++;
          }
        }
        i++;
      }
    } finally {
      // Always commit whatever was collected, even on an exception mid-pass.
      doc.setBlockHeaders(blockHeaders);
      doc.setBlockFooters(blockFooters);
      doc.setBlockSectionTitles(blockSectionTitles);
      doc.setAcknowledgementBlocks(acknowledgementBlocks);
      doc.setBlockTables(blockTables);
      doc.setBlockFigures(blockFigures);
      doc.setBlockHeadTables(blockHeadTables);
      doc.setBlockHeadFigures(blockHeadFigures);
      doc.setBlockDocumentHeaders(blockDocumentHeaders);
    }
  }
  /**
   * Serializes this token's feature vector as one space-separated line, in the fixed
   * column order expected by the CRF training/labelling pipeline.
   *
   * <p>Column layout: token string, lowercased string, 4 prefixes, 4 suffixes, line
   * status, capitalisation class, digit class, single-char flag, 8 lexical flags,
   * 3 bibliographic flags (journal and abbreviated-journal titles share one column),
   * punctuation type, relative position, and the label ({@code 0} when absent).
   *
   * @return the feature line terminated by a newline, or {@code null} when no token
   *     string is available
   */
  public String printVector() {
    if (string == null) return null;
    if (string.length() == 0) return null;
    StringBuilder res = new StringBuilder();

    // token string (1)
    res.append(string);

    // lowercase string (1)
    res.append(' ').append(string.toLowerCase());

    // prefix (4)
    res.append(' ').append(TextUtilities.prefix(string, 1));
    res.append(' ').append(TextUtilities.prefix(string, 2));
    res.append(' ').append(TextUtilities.prefix(string, 3));
    res.append(' ').append(TextUtilities.prefix(string, 4));

    // suffix (4)
    res.append(' ').append(TextUtilities.suffix(string, 1));
    res.append(' ').append(TextUtilities.suffix(string, 2));
    res.append(' ').append(TextUtilities.suffix(string, 3));
    res.append(' ').append(TextUtilities.suffix(string, 4));

    // line information (1)
    res.append(' ').append(lineStatus);

    // capitalisation (1) - all-digit tokens carry no meaningful case information
    if (digit.equals("ALLDIGIT")) res.append(" NOCAPS");
    else res.append(' ').append(capitalisation);

    // digit information (1)
    res.append(' ').append(digit);

    // character information (1)
    appendFlag(res, singleChar);

    // lexical information (8)
    appendFlag(res, properName);
    appendFlag(res, commonName);
    appendFlag(res, firstName);
    appendFlag(res, locationName);
    appendFlag(res, year);
    appendFlag(res, month);
    appendFlag(res, email);
    appendFlag(res, http);

    // bibliographical information (3 columns; full and abbreviated journal titles
    // are folded into a single column)
    appendFlag(res, isKnownJournalTitle || isKnownAbbrevJournalTitle);
    appendFlag(res, isKnownConferenceTitle);
    appendFlag(res, isKnownPublisher);

    // punctuation information (1) - punctuation type, or NO for non-punctuation tokens
    res.append(' ').append(punctType);

    // relative position in the sequence (1)
    res.append(' ').append(relativePosition);

    // label - present only for training data (1)
    if (label != null) res.append(' ').append(label).append('\n');
    else res.append(" 0\n");

    return res.toString();
  }

  /** Appends a binary feature column as {@code " 1"} or {@code " 0"}. */
  private static void appendFlag(StringBuilder sb, boolean flag) {
    sb.append(flag ? " 1" : " 0");
  }
Esempio n. 6
0
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") &&
    // !qName.equals("TEXT"))
    // System.out.println(qName);

    if (qName.equals("TEXT")) {
      blabla.append("\n");
      LayoutToken token = new LayoutToken();
      token.setText("\n");
      block.addToken(token);
      accumulator.setLength(0);
      tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
      accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
      String tok0 = TextUtilities.clean(getText());

      if (block.getStartToken() == -1) {
        block.setStartToken(tokenizations.size());
      }

      if (tok0.length() > 0) {
        StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
        boolean diaresis = false;
        boolean accent = false;
        boolean keepLast = false;
        while (st.hasMoreTokens()) {

          diaresis = false;
          accent = false;
          keepLast = false;

          String tok = st.nextToken();
          if (tok.length() > 0) {

            LayoutToken token = new LayoutToken();

            if ((previousToken != null)
                && (tok != null)
                && (previousToken.length() > 0)
                && (tok.length() > 0)
                && blabla.length() > 0) {

              Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
              Character rightChar = tok.charAt(0);

              ModifierClass leftClass = classifyChar(leftChar);
              ModifierClass rightClass = classifyChar(rightChar);
              ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;

              if (leftClass != ModifierClass.NOT_A_MODIFIER
                  || rightClass != ModifierClass.NOT_A_MODIFIER) {
                Character baseChar = null;
                Character modifierChar = null;

                if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                  if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                    // assert false;
                    // keeping characters, but setting class
                    // to not a modifier
                    baseChar = leftChar;
                    modifierChar = rightChar;
                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                  } else {
                    baseChar = rightChar;
                    modifierChar = leftChar;
                    modifierClass = leftClass;
                  }
                } else {
                  baseChar = leftChar;
                  modifierChar = rightChar;
                  modifierClass = rightClass;
                }

                String updatedChar = modifyCharacter(baseChar, modifierChar);

                tokenizations.remove(tokenizations.size() - 1);
                if (tokenizations.size() > 0) {
                  tokenizations.remove(tokenizations.size() - 1);
                }

                blabla.deleteCharAt(blabla.length() - 1);
                if (blabla.length() > 0) {
                  blabla.deleteCharAt(blabla.length() - 1);
                }

                removeLastCharacterIfPresent(previousTok);

                if (updatedChar != null) {
                  blabla.append(updatedChar);
                  previousTok.setText(previousTok.getText() + updatedChar);
                }

                blabla.append(tok.substring(1, tok.length()));
                previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                tokenizations.add(previousTok.getText());

                diaresis =
                    (modifierClass == ModifierClass.DIAERESIS
                        || modifierClass == ModifierClass.NORDIC_RING
                        || modifierClass == ModifierClass.CZECH_CARON
                        || modifierClass == ModifierClass.TILDE
                        || modifierClass == ModifierClass.CEDILLA);

                accent =
                    (modifierClass == ModifierClass.ACUTE_ACCENT
                        || modifierClass == ModifierClass.CIRCUMFLEX
                        || modifierClass == ModifierClass.GRAVE_ACCENT);

                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                  tok = ""; // resetting current token as it
                  // is a single-item
                }
              }
            }

            if (tok != null) {
              // actually in certain cases, the extracted string under token can be a chunk of text
              // with separators that need to be preserved
              // tok = tok.replace(" ", "");
            }

            if ((!diaresis) && (!accent)) {
              // blabla.append(" ");
              blabla.append(tok);
              token.setText(tok);

              tokenizations.add(tok);
            } else {
              tok = "";
              keepLast = true;
            }

            /*
             * StringTokenizer st0 = new StringTokenizer(tok0,
             * TextUtilities.fullPunctuations, true);
             * while(st0.hasMoreTokens()) { String tok =
             * st0.nextToken(); tokenizations.add(tok); }
             * tokenizations.add(" ");
             */

            /*
             * boolean punct1 = false; boolean punct2 = false;
             * boolean punct3 = false; String content = null; int i
             * = 0; for(; i<TextUtilities.punctuations.length();
             * i++) { if (tok.length() > 0) { if
             * (tok.charAt(tok.length()-1) ==
             * TextUtilities.punctuations.charAt(i)) { punct1 =
             * true; content = tok.substring(0, tok.length()-1); if
             * (tok.length() > 1) { int j = 0; for(;
             * j<TextUtilities.punctuations.length(); j++) { if
             * (tok.charAt(tok.length()-2) ==
             * TextUtilities.punctuations.charAt(j)) { punct3 =
             * true; content = tok.substring(0, tok.length()-2); } }
             * } break; } } } if (tok.length() > 0) { if (
             * (tok.startsWith("(")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("("); } else if (
             * (tok.startsWith("[")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("["); } else if (
             * (tok.startsWith("\"")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("\""); } }
             */
            if (currentRotation) currentFontSize = currentFontSize / 2;

            /*
             * if (punct2) { if (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * token = new LayoutToken(); token.setText(content); }
             * if (punct1) { token.setText(content); if (currentFont
             * != null) token.setFont(currentFont.toLowerCase());
             * else token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * if (punct3) { token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-2)); if
             * (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token); }
             *
             * token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-1)); }
             */
            // NOTE(review): this is the tail of a SAX end-element handler; the
            // method header and the code that builds `token` / maintains
            // currentFont, diaresis, accent, nbTokens, blabla, etc. are above
            // this chunk and not visible here.

            // Stamp the freshly built token with the current layout state
            // (font, style flags, geometry) captured from the XML attributes.
            if (currentFont != null) token.setFont(currentFont.toLowerCase());
            else token.setFont("default");
            token.setItalic(currentItalic);
            token.setBold(currentBold);
            token.setRotation(currentRotation);
            token.setColorFont(colorFont);
            token.setX(currentX);
            token.setY(currentY);
            token.setWidth(currentWidth);
            token.setHeight(currentHeight);
            token.setFontSize(currentFontSize);

            // A token flagged as a diaeresis/accent mark is not added as its
            // own token — presumably it gets merged with its neighbor elsewhere.
            if (!diaresis && !accent) {
              block.addToken(token);
            }

            // First token of the block seeds the block-level attributes.
            if (block.getFont() == null) {
              if (currentFont != null) block.setFont(currentFont.toLowerCase());
              // NOTE(review): probable copy-paste bug — this branch sets the
              // TOKEN's font, not the block's; the symmetric code above uses
              // block.setFont(...). Should likely be block.setFont("default").
              else token.setFont("default");
            }
            if (nbTokens == 0) {
              block.setItalic(currentItalic);
              block.setBold(currentBold);
            }
            // Geometry/color: only captured once, from the first token that
            // carries a non-default value.
            if (block.getColorFont() == null) block.setColorFont(colorFont);
            if (block.getX() == 0.0) block.setX(currentX);
            if (block.getY() == 0.0) block.setY(currentY);
            if (block.getWidth() == 0.0) block.setWidth(currentWidth);
            if (block.getHeight() == 0.0) block.setHeight(currentHeight);
            if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize);

            // Track the previous token; for accent marks keep pointing at the
            // last *real* token so merging can target it.
            if (!diaresis && !accent) {
              previousToken = tok;
              previousTok = token;
            } else {
              previousToken = previousTok.getText();
            }

            nbTokens++;
            accumulator.setLength(0);
          }
        }
        // Re-insert the inter-token space lost by tokenization, except after a
        // hyphen (likely an end-of-line hyphenation to be undone later).
        if (tokenizations.size() > 0) {
          String justBefore = tokenizations.get(tokenizations.size() - 1);
          if (!justBefore.endsWith("-")) {
            tokenizations.add(" ");
            blabla.append(" ");
          }
        }
      }
      block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
      // page markers are useful to detect headers (same first line(s)
      // appearing on each page)
      // Close the current block (if any), emit a synthetic @PAGE marker block,
      // then start a fresh empty block for the new page.
      if (block != null) {
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      Block block0 = new Block();
      block0.setText("@PAGE\n");
      block0.setNbTokens(0);
      block0.setPage(currentPage);
      doc.addBlock(block0);
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      nbTokens = 0;
      // blabla.append("\n@block\n");
      tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
      // Close the current block and emit a one-off block holding an @IMAGE
      // reference to the most recently collected image path.
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      if (block.getX() == 0.0) block.setX(currentX);
      if (block.getY() == 0.0) block.setY(currentY);
      if (block.getWidth() == 0.0) block.setWidth(currentWidth);
      if (block.getHeight() == 0.0) block.setHeight(currentHeight);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }
    /*
     * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) {
     * blabla.append("\n"); block.setText(blabla.toString());
     * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new
     * Block(); block.setPage(currentPage); blabla = new StringBuffer();
     * blabla.append("@IMAGE " + "vectorial \n");
     * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if
     * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0)
     * block.setY(currentY); if (block.getWidth() == 0.0)
     * block.setWidth(currentWidth); if (block.getHeight() == 0.0)
     * block.setHeight(currentHeight); doc.addBlock(block); blabla = new
     * StringBuffer(); nbTokens = 0; block = new Block();
     * block.setPage(currentPage); }
     */
    else if (qName.equals("BLOCK")) {
      // End of a layout block: flush accumulated text/token count and derive
      // the block's extent from the last token's position + size.
      blabla.append("\n");
      tokenizations.add("\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);

      block.setWidth(currentX - block.getX() + currentWidth);
      block.setHeight(currentY - block.getY() + currentHeight);

      doc.addBlock(block);
      // blabla = new StringBuffer();
      nbTokens = 0;
      block = null;
    } else if (qName.equals("xi:include")) {
      // XInclude elements are treated as image references, same shape as the
      // IMAGE branch above.
      // NOTE(review): unlike the IMAGE branch, there is no images.size() > 0
      // guard here — images.get(images.size() - 1) throws
      // IndexOutOfBoundsException if no image was collected. Verify upstream
      // guarantees at least one image before an xi:include is seen.
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }

    /*
     * else if (qName.equals("DOCUMENT")) {
     * System.out.println(blabla.toString()); }
     */

  }