예제 #1
0
  /**
   * Converts a labelled full text into the training format without any string modification other
   * than passing each token through {@code TextUtilities.HTMLEncode}.
   *
   * <p>Every non-empty line of {@code result} is split on spaces and tabs. The first field is the
   * lexical token, the last field (when the line has more than one field) is the label of that
   * token, optionally prefixed with {@code I-}, and the fields in between are features. Of the
   * features only {@code LINESTART} is used here: it triggers a {@code <lb/>} before the token,
   * except on the very first line.
   *
   * <p>The labels {@code <table>}, {@code <label>}, {@code <figure_head>} and {@code <trash>} are
   * treated as parts of a figure: consecutive tokens carrying these labels are grouped, and the
   * group is terminated with {@code </figure>} as soon as a token with any other label is met or
   * the input ends.
   *
   * @param result the labelled sequence, one token per line
   * @param tokenizations the original tokens in document order; entries equal to {@code " "} are
   *     used to decide whether a space must precede the next written token
   * @return the buffer holding the generated training markup
   * @throws GrobidException wrapping any exception raised while building the output
   */
  private StringBuffer trainingExtraction(String result, ArrayList<String> tokenizations) {
    // this is the main buffer for the whole full text
    StringBuffer buffer = new StringBuffer();
    try {
      StringTokenizer st = new StringTokenizer(result, "\n");
      // s1 and s2 are declared outside the loop on purpose: a line that has a single field only
      // updates s2 and keeps the label (s1) read on a previous line.
      String s1 = null; // label of the current line, possibly prefixed with "I-"
      String s2 = null; // HTML-encoded lexical token of the current line
      String lastTag = null; // label of the previous line, as read (prefix included)

      // current token position in tokenizations
      int p = 0;
      boolean start = true;
      // state of the figure currently being written, if any
      boolean openFigure = false;
      boolean headFigure = false;
      boolean descFigure = false;
      boolean tableBlock = false;

      while (st.hasMoreTokens()) {
        boolean addSpace = false;
        String tok = st.nextToken().trim();

        if (tok.length() == 0) {
          continue;
        }
        StringTokenizer stt = new StringTokenizer(tok, " \t");
        int i = 0;

        boolean newLine = false;
        int ll = stt.countTokens();
        while (stt.hasMoreTokens()) {
          String s = stt.nextToken().trim();
          if (i == 0) {
            s2 = TextUtilities.HTMLEncode(s); // lexical token

            // Advance in the original tokenization up to and including the current token,
            // remembering whether a space was skipped on the way. If the token is never found,
            // the remaining tokenization is consumed entirely.
            boolean strop = false;
            while ((!strop) && (p < tokenizations.size())) {
              String tokOriginal = tokenizations.get(p);
              if (tokOriginal.equals(" ")) {
                addSpace = true;
              } else if (tokOriginal.equals(s)) {
                strop = true;
              }
              p++;
            }
          } else if (i == ll - 1) {
            s1 = s; // current tag
          } else if (s.equals("LINESTART")) {
            // the only feature of interest: the token starts a new line in the document
            newLine = true;
          }
          i++;
        }

        if (newLine && !start) {
          buffer.append("<lb/>");
        }

        // previous and current labels without the "I-" prefix
        String lastTag0 = null;
        if (lastTag != null) {
          lastTag0 = lastTag.startsWith("I-") ? lastTag.substring(2) : lastTag;
        }
        String currentTag0 = null;
        if (s1 != null) {
          currentTag0 = s1.startsWith("I-") ? s1.substring(2) : s1;
        }

        boolean closeParagraph = false;
        if (lastTag != null) {
          closeParagraph = testClosingTag(buffer, currentTag0, lastTag0, s1);
        }

        boolean output = false;

        // NOTE(review): currentTag0 is still null here when no label has been read so far (for
        // instance when the first line has a single field); the resulting NullPointerException
        // ends up wrapped by the catch block below.
        // Any label that cannot be part of a figure terminates the figure being written.
        if (!currentTag0.equals("<table>")
            && !currentTag0.equals("<trash>")
            && !currentTag0.equals("<figure_head>")
            && !currentTag0.equals("<label>")) {
          if (openFigure) {
            buffer.append("\n\t\t\t</figure>\n\n");
          }
          openFigure = false;
          headFigure = false;
          descFigure = false;
          tableBlock = false;
        }

        // The labels are tried one after the other; each attempt is skipped as soon as a
        // previous one has reported that it wrote the token.
        output = writeField(buffer, s1, lastTag0, s2, "<header>", "<front>", addSpace, 3);
        if (!output) {
          output =
              writeField(buffer, s1, lastTag0, s2, "<other>", "<note type=\"other\">", addSpace, 3);
        }
        // for paragraph we must distinguish starting and closing tags
        if (!output) {
          if (closeParagraph) {
            output = writeFieldBeginEnd(buffer, s1, "", s2, "<paragraph>", "<p>", addSpace, 3);
          } else {
            output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<paragraph>", "<p>", addSpace, 3);
          }
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<page_header>",
                  "<note place=\"headnote\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<page_footnote>",
                  "<note place=\"footnote\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<page>", "<page>", addSpace, 3);
        }
        if (!output) {
          output =
              writeFieldBeginEnd(buffer, s1, lastTag0, s2, "<reference>", "<bibl>", addSpace, 3);
        }
        if (!output) {
          if (closeParagraph) {
            output = writeField(buffer, s1, "", s2, "<reference_marker>", "<label>", addSpace, 3);
          } else {
            output =
                writeField(buffer, s1, lastTag0, s2, "<reference_marker>", "<label>", addSpace, 3);
          }
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<citation_marker>",
                  "<ref type=\"biblio\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<section>", "<head>", addSpace, 3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<subsection>", "<head>", addSpace, 3);
        }
        if (!output) {
          if (openFigure) {
            output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 4);
          } else {
            // NOTE(review): this branch marks a figure as open although no "<figure>" start tag
            // is written by this method, while a "</figure>" is appended later on. Confirm that
            // this is intended.
            output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 3);
            if (output) {
              openFigure = true;
            }
          }
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<equation>", "<formula>", addSpace, 3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<figure_marker>",
                  "<ref type=\"figure\">",
                  addSpace,
                  3);
        }
        if (!output) {
          if (openFigure) {
            if (tableBlock && (!lastTag0.equals("<table>")) && (currentTag0.equals("<table>"))) {
              // The open figure already holds a table and a new one starts: close the figure and
              // open another one. The label to match comes first and the markup to write second,
              // exactly as in the <label> and <figure_head> branches below.
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<table>",
                      "<figure>\n\t\t\t\t<table>",
                      addSpace,
                      3);
              if (output) {
                tableBlock = true;
                descFigure = false;
                headFigure = false;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<table>", "<table>", addSpace, 4);
              if (output) {
                tableBlock = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer, s1, lastTag0, s2, "<table>", "<figure>\n\t\t\t\t<table>", addSpace, 3);
            if (output) {
              openFigure = true;
              tableBlock = true;
            }
          }
        }
        if (!output) {
          if (openFigure) {
            if (descFigure && (!lastTag0.equals("<label>")) && (currentTag0.equals("<label>"))) {
              // the open figure already holds a description: start a new figure
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<label>",
                      "<figure>\n\t\t\t\t<figDesc>",
                      addSpace,
                      3);
              if (output) {
                descFigure = true;
                tableBlock = false;
                headFigure = false;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<label>", "<figDesc>", addSpace, 4);
              if (output) {
                descFigure = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer,
                    s1,
                    lastTag0,
                    s2,
                    "<label>",
                    "<figure>\n\t\t\t\t<figDesc>",
                    addSpace,
                    3);
            if (output) {
              openFigure = true;
              descFigure = true;
            }
          }
        }
        if (!output) {
          if (openFigure) {
            if (headFigure
                && (!lastTag0.equals("<figure_head>"))
                && (currentTag0.equals("<figure_head>"))) {
              // the open figure already holds a head: start a new figure
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<figure_head>",
                      "<figure>\n\t\t\t\t<head>",
                      addSpace,
                      3);
              if (output) {
                descFigure = false;
                tableBlock = false;
                headFigure = true;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<figure_head>", "<head>", addSpace, 4);
              if (output) {
                headFigure = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer,
                    s1,
                    lastTag0,
                    s2,
                    "<figure_head>",
                    "<figure>\n\t\t\t\t<head>",
                    addSpace,
                    3);
            if (output) {
              openFigure = true;
              headFigure = true;
            }
          }
        }
        // for item we must distinguish starting and closing tags
        if (!output) {
          output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<item>", "<item>", addSpace, 3);
        }

        lastTag = s1;

        // last line of the input: close whatever is still open
        if (!st.hasMoreTokens()) {
          if (lastTag != null) {
            testClosingTag(buffer, "", currentTag0, s1);
          }
          if (openFigure) {
            buffer.append("\n\t\t\t</figure>\n\n");
          }
        }
        start = false;
      }

      return buffer;
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    }
  }