コード例 #1
0
ファイル: FullTextParser.java プロジェクト: snpts/grobid
/** @author Patrice Lopez */
public class FullTextParser extends AbstractParser {
  private static final Logger LOGGER = LoggerFactory.getLogger(FullTextParser.class);

  private LanguageUtilities languageUtilities = LanguageUtilities.getInstance();

  private HeaderParser headerParser = null;
  private CitationParser citationParser = null;
  //	private String tmpPathName = null;
  private Document doc = null;
  private File tmpPath = null;
  private String pathXML = null;
  private BiblioItem resHeader = null;

  /**
   * Creates a parser bound to the {@code GrobidModels.FULLTEXT} model and stores the temporary
   * directory returned by {@code GrobidProperties}.
   */
  public FullTextParser() {
    super(GrobidModels.FULLTEXT);
    this.tmpPath = GrobidProperties.getInstance().getTempPath();
  }

  /**
   * Converts a PDF file to TEI by delegating the whole conversion to {@code Document.toTEI}.
   *
   * <p>The PDF is first turned into an intermediary XML file in the temp directory; {@code
   * cleanLxmlFile} is called on that file before leaving, whether or not the conversion succeeded.
   * The header extracted by the document is stored and can be read with {@link #getResHeader()}.
   *
   * @param input filename of pdf file
   * @param consolidateHeader header consolidation flag handed to {@code Document.toTEI}
   * @param consolidateCitations citation consolidation flag handed to {@code Document.toTEI}
   * @return the TEI produced by {@code Document.toTEI}
   * @throws GrobidResourceException if {@code input} is null, or the input file or the temp
   *     directory does not exist
   * @throws GrobidException wrapping any exception raised once the conversion has started
   */
  public String processing(String input, boolean consolidateHeader, boolean consolidateCitations) {
    // --- precondition checks, performed before any Document is created ---
    if (input == null) {
      throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    final File pdf = new File(input);
    if (!pdf.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because input file '"
              + pdf.getAbsolutePath()
              + "' does not exists.");
    }
    if (tmpPath == null) {
      throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because temp path '"
              + tmpPath.getAbsolutePath()
              + "' does not exists.");
    }

    final String tmpDir = tmpPath.getAbsolutePath();
    doc = new Document(input, tmpDir);
    try {
      // -1/-1: no page range restriction is requested.
      // Flags, per the original notes: with timeout, no forced pdf reloading, and (last flag)
      // images of the PDF are not extracted.
      pathXML = doc.pdf2xml(true, false, -1, -1, input, tmpDir, false);
      if (pathXML == null) {
        throw new GrobidResourceException(
            "PDF parsing fails, because path of where to store xml file is null.");
      }
      doc.setPathXML(pathXML);

      // The sub-parsers are created on first use and then reused by later calls.
      if (headerParser == null) {
        headerParser = new HeaderParser();
      }
      if (citationParser == null) {
        citationParser = new CitationParser();
      }

      final String teiResult =
          doc.toTEI(
              headerParser,
              citationParser,
              consolidateHeader,
              consolidateCitations,
              false,
              null,
              false,
              false);
      resHeader = doc.getResHeader();
      LOGGER.debug(teiResult);
      return teiResult;
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
      // remove the intermediary XML file in every case
      doc.cleanLxmlFile(pathXML, false);
    }
  }

  /**
   * Converts a PDF file to TEI by running the full text model itself: the PDF is converted to XML,
   * the featured full text is labelled by the tagger, the document is segmented from the labelled
   * result, the header and every reference are parsed, and everything is combined by {@code toTEI}.
   *
   * <p>{@code cleanLxmlFile} is called on the intermediary XML file before leaving, whether or not
   * the conversion succeeded.
   *
   * @param input filename of pdf file
   * @param consolidateHeader consolidation flag handed to the header parser
   * @param consolidateCitations consolidation flag handed to the citation parser
   * @return the TEI document, or null when {@code toTEI} finds no blocks in the document
   * @throws GrobidResourceException if {@code input} is null, or the input file or the temp
   *     directory does not exist
   * @throws GrobidException wrapping any exception raised once the conversion has started
   * @throws Exception declared for backward compatibility with existing callers
   */
  public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations)
      throws Exception {
    if (input == null) {
      throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because input file '"
              + inputFile.getAbsolutePath()
              + "' does not exists.");
    }
    if (tmpPath == null) {
      throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because temp path '"
              + tmpPath.getAbsolutePath()
              + "' does not exists.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
      int startPage = -1;
      int endPage = -1;
      // with timeout, no forced pdf reloading;
      // input is the pdf path, tmpPath is the directory for the temporary lxml file;
      // last flag true: images in the pdf file are processed
      pathXML =
          doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
      if (pathXML == null) {
        throw new Exception("PDF parsing fails");
      }
      doc.setPathXML(pathXML);
      ArrayList<String> tokenizations = doc.addFeaturesDocument();

      if (doc.getBlocks() == null) {
        throw new Exception("PDF parsing resulted in empty content");
      }

      String fulltext = doc.getFulltextFeatured(true, true);

      StringTokenizer st = new StringTokenizer(fulltext, "\n");
      feedTaggerAndParse(st);

      // rebuild the labelled result: one line per token, tab-separated features, then the label
      StringBuilder res = new StringBuilder();
      for (int i = 0; i < tagger.size(); i++) {
        for (int j = 0; j < tagger.xsize(); j++) {
          res.append(tagger.x(i, j)).append("\t");
        }
        res.append(tagger.y2(i));
        res.append("\n");
      }

      // buffer for the fulltext block
      String rese = res.toString();

      // set the different sections of the Document object
      doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);

      // header processing
      if (headerParser == null) {
        headerParser = new HeaderParser();
      }
      resHeader = new BiblioItem();
      headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);

      // The language identification is normally done during the header parsing, but only based
      // on header information; here it is redone on title + body. A missing title or body must
      // not contribute the literal text "null" to the sample, so each part is added only when
      // present.
      StringBuilder languageSample = new StringBuilder();
      String title = resHeader.getTitle();
      if (title != null) {
        languageSample.append(title);
      }
      languageSample.append("\n");
      String body = doc.getBody();
      if (body != null) {
        languageSample.append(body);
      }
      Language langu = languageUtilities.runLanguageId(languageSample.toString());
      if (langu != null) {
        String lang = langu.getLangId();
        doc.setLanguage(lang);
        resHeader.setLanguage(lang);
      }

      // citation processing
      if (citationParser == null) {
        citationParser = new CitationParser();
      }
      ArrayList<BibDataSet> resCitations = doc.bibDataSets;

      if (resCitations != null) {
        for (BibDataSet bds : resCitations) {
          // normalise the reference marker by dropping dots and spaces
          String marker = bds.getRefSymbol();
          if (marker != null) {
            marker = marker.replace(".", "");
            marker = marker.replace(" ", "");
            bds.setRefSymbol(marker);
          }
          BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
          bds.setResBib(bib);
        }
      }

      // final combination
      return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
      // keep it clean when leaving...
      doc.cleanLxmlFile(pathXML, false);
    }
  }

  /**
   * Process the full text of the specified pdf and format the result as training data.
   *
   * <p>Up to three files are produced, all named after the pdf file with its {@code .pdf} part
   * replaced:
   *
   * <ul>
   *   <li>{@code pathFullText/<name>.training.fulltext}: the featured full text, untagged;
   *   <li>{@code pathTEI/<name>.training.fulltext.tei.xml}: the labelled full text as TEI;
   *   <li>{@code pathTEI/<name>.training.references.xml}: the references found by the full text
   *       model, labelled by the citation parser (only written when at least one reference
   *       produced output).
   * </ul>
   *
   * @param inputFile path of the pdf file to process
   * @param pathFullText directory receiving the untagged full text file
   * @param pathTEI directory receiving the TEI and reference files
   * @param id value written as {@code xml:id} of the TEI {@code fileDesc} element
   * @throws GrobidResourceException if the temp directory is null or does not exist
   * @throws GrobidException wrapping any exception raised once the processing has started
   */
  public void createTrainingFullText(
      String inputFile, String pathFullText, String pathTEI, int id) {
    if (tmpPath == null) {
      throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because temp path '"
              + tmpPath.getAbsolutePath()
              + "' does not exists.");
    }
    doc = new Document(inputFile, tmpPath.getAbsolutePath());
    try {
      int startPage = -1;
      int endPage = -1;
      File file = new File(inputFile);
      if (!file.exists()) {
        throw new GrobidResourceException(
            "Cannot train for fulltext, becuase file '"
                + file.getAbsolutePath()
                + "' does not exists.");
      }
      String PDFFileName = file.getName();
      // with timeout, no forced pdf reloading;
      // inputFile is the pdf file, tmpPath is the directory for the temporary lxml file;
      // last flag true: same value processing2 uses to have the images of the pdf processed
      pathXML =
          doc.pdf2xml(true, false, startPage, endPage, inputFile, tmpPath.getAbsolutePath(), true);
      if (pathXML == null) {
        throw new Exception("PDF parsing fails");
      }
      doc.setPathXML(pathXML);
      doc.addFeaturesDocument();

      if (doc.getBlocks() == null) {
        throw new Exception("PDF parsing resulted in empty content");
      }

      String fulltext = doc.getFulltextFeatured(true, true);
      ArrayList<String> tokenizations = doc.getTokenizationsFulltext();

      // we write the full text untagged
      writeUtf8File(
          pathFullText + "/" + PDFFileName.replace(".pdf", ".training.fulltext"), fulltext + "\n");

      StringTokenizer st = new StringTokenizer(fulltext, "\n");
      feedTaggerAndParse(st);

      // rebuild the labelled result: one line per token, tab-separated features, then the label
      StringBuilder res = new StringBuilder();
      for (int i = 0; i < tagger.size(); i++) {
        for (int j = 0; j < tagger.xsize(); j++) {
          res.append(tagger.x(i, j)).append("\t");
        }
        res.append(tagger.y2(i));
        res.append("\n");
      }

      // buffer for the fulltext block
      String rese = res.toString();
      StringBuffer bufferFulltext = trainingExtraction(rese, tokenizations);

      // write the TEI file to reflect the extract layout of the text as extracted from the pdf
      writeUtf8File(
          pathTEI + "/" + PDFFileName.replace(".pdf", ".training.fulltext.tei.xml"),
          "<?xml version=\"1.0\" ?>\n<tei>\n\t<teiHeader>\n\t\t<fileDesc xml:id=\""
              + id
              + "\"/>\n\t</teiHeader>\n\t<text xml:lang=\"en\">\n",
          bufferFulltext.toString(),
          "\n\t</text>\n</tei>\n");

      // output of the identified citations as training data

      // we need to rebuild each found citation string as it appears in the original text:
      // every labelled line is aligned with the tokenization, whitespace tokens included
      String input = "";
      ArrayList<String> inputs = new ArrayList<String>();
      int q = 0;
      st = new StringTokenizer(rese, "\n");
      while (st.hasMoreTokens() && (q < tokenizations.size())) {
        String line = st.nextToken();
        String theTok = tokenizations.get(q);
        StringBuilder theTotalTok = new StringBuilder(theTok);
        // Swallow the whitespace tokens preceding the next real token. The bound check keeps a
        // tokenization that ends with whitespace from being read past its last element.
        while ((q + 1 < tokenizations.size())
            && (theTok.equals(" ") || theTok.equals("\t") || theTok.equals("\n"))) {
          q++;
          theTok = tokenizations.get(q);
          theTotalTok.append(theTok);
        }
        if (line.endsWith("I-<reference>")) {
          // a new reference starts: flush the one being accumulated
          if (input.trim().length() > 1) {
            inputs.add(input.trim());
            input = "";
          }
          input += "\n" + theTotalTok;
        } else if (line.endsWith("<reference>")) {
          input += theTotalTok;
        }
        q++;
      }
      if (input.trim().length() > 1) {
        inputs.add(input.trim());
      }

      // buffer for the reference block; filled whenever at least one reference was collected,
      // independently of the size of the last accumulated one
      StringBuilder allBufferReference = new StringBuilder();
      if (!inputs.isEmpty()) {
        if (citationParser == null) {
          citationParser = new CitationParser();
        }
        for (String inpu : inputs) {
          ArrayList<String> inpus = new ArrayList<String>();
          inpus.add(inpu);
          StringBuilder bufferReference = citationParser.trainingExtraction(inpus);
          if (bufferReference != null) {
            allBufferReference.append(bufferReference.toString()).append("\n");
          }
        }
      }

      if (allBufferReference.length() > 0) {
        writeUtf8File(
            pathTEI + "/" + PDFFileName.replace(".pdf", ".training.references.xml"),
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
            "<citations>\n",
            allBufferReference.toString(),
            "</citations>\n");
      }

    } catch (Exception e) {
      throw new GrobidException(
          "An exception occured while running Grobid training" + " data generation for full text.",
          e);
    } finally {
      doc.cleanLxmlFile(pathXML, true);
    }
  }

  /**
   * Writes the given parts, in order, to the file at {@code path} (created or overwritten) using
   * UTF-8, and closes the file even when a write fails.
   */
  private static void writeUtf8File(String path, String... parts) throws IOException {
    Writer writer = new OutputStreamWriter(new FileOutputStream(new File(path), false), "UTF-8");
    try {
      for (String part : parts) {
        writer.write(part);
      }
    } finally {
      writer.close();
    }
  }

  /**
   * Gives access to the document built by the most recent processing call.
   *
   * @return the current {@code Document}, or null when nothing has been processed yet
   */
  public Document getDoc() {
    return this.doc;
  }

  /**
   * Gives access to the header metadata obtained by the most recent processing call.
   *
   * @return the current header {@code BiblioItem}, or null when nothing has been processed yet
   */
  public BiblioItem getResHeader() {
    return this.resHeader;
  }

  /**
   * Extract results from a labelled full text in the training format without any string
   * modification.
   *
   * <p>Each non-empty line of {@code result} is split on spaces and tabs: the first element is the
   * lexical token, the last one is the label, and the elements in between are features. The token
   * is HTML-encoded and appended to the output inside the element associated with its label; the
   * original tokenization is walked in parallel to find out whether a space preceded the token.
   *
   * @param result labelled text, one token per line
   * @param tokenizations original tokens of the text, whitespace tokens included
   * @return buffer holding the generated markup
   * @throws GrobidException wrapping any exception raised during the extraction
   */
  private StringBuffer trainingExtraction(String result, ArrayList<String> tokenizations) {
    // this is the main buffer for the whole full text
    StringBuffer buffer = new StringBuffer();
    try {
      StringTokenizer st = new StringTokenizer(result, "\n");
      // s1: label of the current line, s2: HTML-encoded token of the current line.
      // Both are declared outside the loop, so a line that does not reassign one of them keeps
      // the value of the previous line.
      String s1 = null;
      String s2 = null;
      // label of the previous line, "I-" prefix included
      String lastTag = null;

      // current token position
      int p = 0;
      // true only while the first non-empty line is processed
      boolean start = true;
      // state of the <figure> element currently open in the output, if any
      boolean openFigure = false;
      boolean headFigure = false;
      boolean descFigure = false;
      boolean tableBlock = false;

      while (st.hasMoreTokens()) {
        boolean addSpace = false;
        String tok = st.nextToken().trim();

        if (tok.length() == 0) {
          continue;
        }
        StringTokenizer stt = new StringTokenizer(tok, " \t");
        // NOTE(review): filled below but never read in this method
        ArrayList<String> localFeatures = new ArrayList<String>();
        int i = 0;

        boolean newLine = false;
        int ll = stt.countTokens();
        while (stt.hasMoreTokens()) {
          String s = stt.nextToken().trim();
          if (i == 0) {
            s2 = TextUtilities.HTMLEncode(s); // lexical token

            // advance in the original tokenization up to the token matching s; a space token met
            // on the way means the token must be preceded by a space in the output
            boolean strop = false;
            while ((!strop) && (p < tokenizations.size())) {
              String tokOriginal = tokenizations.get(p);
              if (tokOriginal.equals(" ")) {
                addSpace = true;
              } else if (tokOriginal.equals(s)) {
                strop = true;
              }
              p++;
            }
          } else if (i == ll - 1) {
            s1 = s; // current tag
          } else {
            if (s.equals("LINESTART")) newLine = true;
            localFeatures.add(s);
          }
          i++;
        }
        // NOTE(review): a line made of a single element only takes the i == 0 branch above, so s1
        // is not updated for it (null on the very first line) — confirm such lines cannot occur.

        if (newLine && !start) {
          buffer.append("<lb/>");
        }

        // previous and current labels without their "I-" prefix
        String lastTag0 = null;
        if (lastTag != null) {
          if (lastTag.startsWith("I-")) {
            lastTag0 = lastTag.substring(2, lastTag.length());
          } else {
            lastTag0 = lastTag;
          }
        }
        String currentTag0 = null;
        if (s1 != null) {
          if (s1.startsWith("I-")) {
            currentTag0 = s1.substring(2, s1.length());
          } else {
            currentTag0 = s1;
          }
        }

        // close the element of the previous label when needed; the returned flag is true when a
        // paragraph or a reference has just been closed
        boolean closeParagraph = false;
        if (lastTag != null) {
          closeParagraph = testClosingTag(buffer, currentTag0, lastTag0, s1);
        }

        boolean output = false;

        // any label other than the four below ends the figure being built
        // NOTE(review): currentTag0 is null when s1 is null, which makes this test throw
        if (!currentTag0.equals("<table>")
            && !currentTag0.equals("<trash>")
            && !currentTag0.equals("<figure_head>")
            && !currentTag0.equals("<label>")) {
          if (openFigure) {
            buffer.append("\n\t\t\t</figure>\n\n");
          }
          openFigure = false;
          headFigure = false;
          descFigure = false;
          tableBlock = false;
        }

        // The calls below are tried in order until one of them recognises the label of the line
        // (each returns true only when s1 is its field, with or without the "I-" prefix).
        output = writeField(buffer, s1, lastTag0, s2, "<header>", "<front>", addSpace, 3);
        if (!output) {
          output =
              writeField(buffer, s1, lastTag0, s2, "<other>", "<note type=\"other\">", addSpace, 3);
        }
        // for paragraph we must distinguish starting and closing tags
        if (!output) {
          if (closeParagraph) {
            // an element was just closed: the empty previous tag forces a new <p> to be opened
            output = writeFieldBeginEnd(buffer, s1, "", s2, "<paragraph>", "<p>", addSpace, 3);
          } else {
            // NOTE(review): lastTag is null on the first line — confirm writeFieldBeginEnd copes
            output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<paragraph>", "<p>", addSpace, 3);
          }
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<page_header>",
                  "<note place=\"headnote\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<page_footnote>",
                  "<note place=\"footnote\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<page>", "<page>", addSpace, 3);
        }
        if (!output) {
          output =
              writeFieldBeginEnd(buffer, s1, lastTag0, s2, "<reference>", "<bibl>", addSpace, 3);
        }
        if (!output) {
          if (closeParagraph) {
            output = writeField(buffer, s1, "", s2, "<reference_marker>", "<label>", addSpace, 3);
          } else
            output =
                writeField(buffer, s1, lastTag0, s2, "<reference_marker>", "<label>", addSpace, 3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<citation_marker>",
                  "<ref type=\"biblio\">",
                  addSpace,
                  3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<section>", "<head>", addSpace, 3);
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<subsection>", "<head>", addSpace, 3);
        }
        if (!output) {
          // <trash> is written one indentation level deeper when a figure is already open;
          // otherwise it marks a figure as open without emitting a <figure> start tag
          if (openFigure) {
            output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 4);
          } else {
            // output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<figure>\n\t\t\t\t<trash>",
            output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 3);
            if (output) {
              openFigure = true;
            }
          }
        }
        if (!output) {
          output = writeField(buffer, s1, lastTag0, s2, "<equation>", "<formula>", addSpace, 3);
        }
        if (!output) {
          output =
              writeField(
                  buffer,
                  s1,
                  lastTag0,
                  s2,
                  "<figure_marker>",
                  "<ref type=\"figure\">",
                  addSpace,
                  3);
        }
        // <table>: a second table block inside the open figure closes that figure first
        if (!output) {
          if (openFigure) {
            if (tableBlock && (!lastTag0.equals("<table>")) && (currentTag0.equals("<table>"))) {
              buffer.append("\n\t\t\t</figure>\n\n");
              // NOTE(review): the field and outField arguments look swapped compared with the
              // other <table> calls; as written, s1 can never equal this field — confirm intent
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<figure>\n\t\t\t\t<table>",
                      "<figure>",
                      addSpace,
                      3);
              if (output) {
                tableBlock = true;
                descFigure = false;
                headFigure = false;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<table>", "<table>", addSpace, 4);
              if (output) {
                tableBlock = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer, s1, lastTag0, s2, "<table>", "<figure>\n\t\t\t\t<table>", addSpace, 3);
            if (output) {
              openFigure = true;
              tableBlock = true;
            }
          }
        }
        // <label> (figure description): a second description inside the open figure closes that
        // figure and opens a new one
        if (!output) {
          if (openFigure) {
            if (descFigure && (!lastTag0.equals("<label>")) && (currentTag0.equals("<label>"))) {
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<label>",
                      "<figure>\n\t\t\t\t<figDesc>",
                      addSpace,
                      3);
              if (output) {
                descFigure = true;
                tableBlock = false;
                headFigure = false;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<label>", "<figDesc>", addSpace, 4);
              if (output) {
                descFigure = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer,
                    s1,
                    lastTag0,
                    s2,
                    "<label>",
                    "<figure>\n\t\t\t\t<figDesc>",
                    addSpace,
                    3);
            if (output) {
              openFigure = true;
              descFigure = true;
            }
          }
        }
        // <figure_head>: a second head inside the open figure closes that figure and opens a new
        // one
        if (!output) {
          if (openFigure) {
            if (headFigure
                && (!lastTag0.equals("<figure_head>"))
                && (currentTag0.equals("<figure_head>"))) {
              buffer.append("\n\t\t\t</figure>\n\n");
              output =
                  writeField(
                      buffer,
                      s1,
                      lastTag0,
                      s2,
                      "<figure_head>",
                      "<figure>\n\t\t\t\t<head>",
                      addSpace,
                      3);
              if (output) {
                descFigure = false;
                tableBlock = false;
                headFigure = true;
              }
            } else {
              output = writeField(buffer, s1, lastTag0, s2, "<figure_head>", "<head>", addSpace, 4);
              if (output) {
                headFigure = true;
              }
            }
          } else {
            output =
                writeField(
                    buffer,
                    s1,
                    lastTag0,
                    s2,
                    "<figure_head>",
                    "<figure>\n\t\t\t\t<head>",
                    addSpace,
                    3);
            if (output) {
              openFigure = true;
              headFigure = true;
            }
          }
        }
        // for item we must distinguish starting and closing tags
        if (!output) {
          output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<item>", "<item>", addSpace, 3);
        }

        lastTag = s1;

        // last line: close the element of the current label and any figure still open
        if (!st.hasMoreTokens()) {
          if (lastTag != null) {
            testClosingTag(buffer, "", currentTag0, s1);
          }
          if (openFigure) {
            buffer.append("\n\t\t\t</figure>\n\n");
          }
        }
        if (start) {
          start = false;
        }
      }

      return buffer;
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    }
  }

  /**
   * Appends the token {@code s2} to {@code buffer} when the label {@code s1} designates {@code
   * field}, opening the output element {@code outField} if the token starts a new field.
   *
   * <p>The previous tag may be null (no token has been written yet); it is then treated as "no
   * previous field", so a field of any kind, reference markers included, can be the first one.
   *
   * @param buffer output buffer the markup is appended to
   * @param s1 label of the current token, with or without the "I-" prefix
   * @param lastTag0 label of the previous token without "I-" prefix; may be null or empty
   * @param s2 text of the current token, already encoded for the output
   * @param field label handled by this call, without "I-" prefix
   * @param outField start tag written when the field begins
   * @param addSpace whether a space precedes the token in the original text
   * @param nbIndent number of tabulations written before a start tag opened on a new line
   * @return true if {@code s1} designates {@code field} (something was written), false otherwise
   */
  private boolean writeField(
      StringBuffer buffer,
      String s1,
      String lastTag0,
      String s2,
      String field,
      String outField,
      boolean addSpace,
      int nbIndent) {
    if (!s1.equals(field) && !s1.equals("I-" + field)) {
      // not the field handled by this call
      return false;
    }

    if (s1.equals(lastTag0) || s1.equals("I-" + lastTag0)) {
      // same field as the previous token: simply continue it
      if (addSpace) buffer.append(" " + s2);
      else buffer.append(s2);
    } else if (field.equals("<citation_marker>") || field.equals("<figure_marker>")) {
      // inline markers are opened in place, without indentation
      if (addSpace) buffer.append(" " + outField + s2);
      else buffer.append(outField + s2);
    } else if (field.equals("<reference_marker>")) {
      // a marker which does not follow a reference or another marker starts a new <bibl>;
      // the constant-first comparisons also cover a null previous tag
      if (!"<reference>".equals(lastTag0) && !"<reference_marker>".equals(lastTag0)) {
        for (int i = 0; i < nbIndent; i++) {
          buffer.append("\t");
        }
        buffer.append("<bibl>");
      }
      if (addSpace) buffer.append(" " + outField + s2);
      else buffer.append(outField + s2);
    } else if (lastTag0 == null
        || (!lastTag0.equals("<citation_marker>")
            && !lastTag0.equals("<figure_marker>")
            && !lastTag0.equals("<figure>"))) {
      // start of a new block-level field
      for (int i = 0; i < nbIndent; i++) {
        buffer.append("\t");
      }
      buffer.append(outField + s2);
    } else {
      // the field resumes after an inline marker: no new start tag
      if (addSpace) buffer.append(" " + s2);
      else buffer.append(s2);
    }
    return true;
  }

  /**
   * This is for writing fields where begin and end of field matter, like paragraph or item: a
   * label carrying the "I-" prefix always starts a new element, even right after a token of the
   * same field.
   *
   * <p>The previous tag may be null (no token has been written yet); the token then opens a new
   * element.
   *
   * @param buffer output buffer the markup is appended to
   * @param s1 label of the current token, with or without the "I-" prefix
   * @param lastTag0 label of the previous token, as given by the caller (with or without "I-"
   *     prefix); may be null or empty
   * @param s2 text of the current token, already encoded for the output
   * @param field label handled by this call, without "I-" prefix
   * @param outField start tag written when the field begins
   * @param addSpace whether a space precedes the token in the original text
   * @param nbIndent number of tabulations written before the start tag
   * @return true if {@code s1} designates {@code field} (something was written), false otherwise
   */
  private boolean writeFieldBeginEnd(
      StringBuffer buffer,
      String s1,
      String lastTag0,
      String s2,
      String field,
      String outField,
      boolean addSpace,
      int nbIndent) {
    if (!s1.equals(field) && !s1.equals("I-" + field)) {
      // not the field handled by this call
      return false;
    }

    // All comparisons are made from the non-null side so that a null previous tag is accepted.
    // The field continues when the previous token started it, or when both the previous and the
    // current labels are the non-starting form of the field.
    boolean continuesField =
        ("I-" + field).equals(lastTag0) || (field.equals(lastTag0) && s1.equals(field));
    // The field resumes without a new start tag after an inline element.
    boolean followsInlineElement =
        "<citation_marker>".equals(lastTag0)
            || "<figure_marker>".equals(lastTag0)
            || "<figure>".equals(lastTag0)
            || "<reference_marker>".equals(lastTag0);

    if (continuesField || followsInlineElement) {
      if (addSpace) buffer.append(" " + s2);
      else buffer.append(s2);
    } else {
      // start of a new element
      for (int i = 0; i < nbIndent; i++) {
        buffer.append("\t");
      }
      buffer.append(outField + s2);
    }
    return true;
  }

  /**
   * Writes the end tag of the previous field when the current token does not continue it.
   *
   * <p>Nothing is written when the tag is unchanged (unless the current label starts a new
   * paragraph or item), nor when the current token is a citation or figure marker, since those
   * can be embedded in the previous field.
   *
   * @param buffer output buffer the end tag is appended to
   * @param currentTag0 label of the current token without "I-" prefix
   * @param lastTag0 label of the previous token without "I-" prefix
   * @param currentTag label of the current token, "I-" prefix included if present
   * @return true only when a reference or a paragraph has been closed
   */
  private boolean testClosingTag(
      StringBuffer buffer, String currentTag0, String lastTag0, String currentTag) {
    // same field going on, and no explicit restart of a paragraph or item: nothing to close
    if (currentTag0.equals(lastTag0)
        && !currentTag.equals("I-<paragraph>")
        && !currentTag.equals("I-<item>")) {
      return false;
    }
    // reference_marker and citation_marker are two exceptions because they can be embedded
    if (currentTag0.equals("<citation_marker>") || currentTag0.equals("<figure_marker>")) {
      return false;
    }

    // pick the end tag matching the field being left
    final String endTag;
    if (lastTag0.equals("<other>")
        || lastTag0.equals("<page_header>")
        || lastTag0.equals("<page_footnote>")) {
      endTag = "</note>\n\n";
    } else if (lastTag0.equals("<header>")) {
      endTag = "</front>\n\n";
    } else if (lastTag0.equals("<reference>")) {
      endTag = "</bibl>\n\n";
    } else if (lastTag0.equals("<paragraph>")) {
      endTag = "</p>\n\n";
    } else if (lastTag0.equals("<section>")
        || lastTag0.equals("<subsection>")
        || lastTag0.equals("<figure_head>")) {
      endTag = "</head>\n\n";
    } else if (lastTag0.equals("<equation>")) {
      endTag = "</formula>\n\n";
    } else if (lastTag0.equals("<table>")) {
      endTag = "</table>\n";
    } else if (lastTag0.equals("<label>")) {
      endTag = "</figDesc>\n";
    } else if (lastTag0.equals("<item>")) {
      endTag = "</item>\n\n";
    } else if (lastTag0.equals("<trash>")) {
      endTag = "</trash>\n\n";
    } else if (lastTag0.equals("<reference_marker>")) {
      endTag = "</label>";
    } else if (lastTag0.equals("<citation_marker>") || lastTag0.equals("<figure_marker>")) {
      endTag = "</ref>";
    } else if (lastTag0.equals("<page>")) {
      endTag = "</page>\n\n";
    } else {
      // unknown field: nothing to write
      endTag = null;
    }

    if (endTag != null) {
      buffer.append(endTag);
    }
    return lastTag0.equals("<reference>") || lastTag0.equals("<paragraph>");
  }

  /**
   * Assembles the TEI document from the parsed header, the labelled body and the references,
   * using a {@code TEIFormater} built on the given document.
   *
   * @param doc document whose blocks and bibliographical data sets are serialised
   * @param rese labelled full text
   * @param tokenizations original tokens of the full text
   * @param resHeader parsed header metadata
   * @param peer passed through to {@code TEIFormater.toTEIHeader}
   * @param catalogue not used by this method
   * @param withStyleSheet passed through to {@code TEIFormater.toTEIHeader}
   * @return the TEI document, or null when the document has no blocks
   * @throws GrobidException wrapping any exception raised while formatting
   */
  private String toTEI(
      Document doc,
      String rese,
      ArrayList<String> tokenizations,
      BiblioItem resHeader,
      boolean peer,
      BiblioItem catalogue,
      boolean withStyleSheet) {
    if (doc.blocks == null) {
      return null;
    }
    final TEIFormater formatter = new TEIFormater(doc);
    try {
      // header, then body, then references: each step extends the same buffer
      StringBuffer out = formatter.toTEIHeader(resHeader, peer, withStyleSheet, null);
      out = formatter.toTEIBodyML(out, rese, resHeader, doc.bibDataSets, tokenizations, doc);
      out = formatter.toTEIReferences(out, doc.bibDataSets);

      // close the elements left open by the formatter
      out.append("\t\t</back>\n").append("\t</text>\n").append("</TEI>\n");
      return out.toString();
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    }
  }

  /**
   * Closes this parser and the header and citation parsers it created, if any.
   *
   * <p>Both sub-parsers are created lazily, so either may still be null when this method is
   * called; a missing sub-parser is skipped. The citation parser is closed even when closing the
   * header parser fails.
   *
   * @throws IOException if closing this parser or one of its sub-parsers fails
   */
  @Override
  public void close() throws IOException {
    super.close();
    try {
      if (headerParser != null) {
        headerParser.close();
      }
    } finally {
      headerParser = null;
      try {
        if (citationParser != null) {
          citationParser.close();
        }
      } finally {
        citationParser = null;
      }
    }
  }
}
コード例 #2
0
ファイル: FullTextParser.java プロジェクト: snpts/grobid
  /**
   * Converts a PDF file to TEI by running the full text model itself: the PDF is converted to XML,
   * the featured full text is labelled by the tagger, the document is segmented from the labelled
   * result, the header and every reference are parsed, and everything is combined by {@code toTEI}.
   *
   * <p>{@code cleanLxmlFile} is called on the intermediary XML file before leaving, whether or not
   * the conversion succeeded.
   *
   * @param input filename of pdf file
   * @param consolidateHeader consolidation flag handed to the header parser
   * @param consolidateCitations consolidation flag handed to the citation parser
   * @return the TEI document as returned by {@code toTEI}
   * @throws GrobidResourceException if {@code input} is null, or the input file or the temp
   *     directory does not exist
   * @throws GrobidException wrapping any exception raised once the conversion has started
   * @throws Exception declared for backward compatibility with existing callers
   */
  public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations)
      throws Exception {
    if (input == null) {
      throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because input file '"
              + inputFile.getAbsolutePath()
              + "' does not exists.");
    }
    if (tmpPath == null) {
      throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because temp path '"
              + tmpPath.getAbsolutePath()
              + "' does not exists.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
      int startPage = -1;
      int endPage = -1;
      // with timeout, no forced pdf reloading;
      // input is the pdf path, tmpPath is the directory for the temporary lxml file;
      // last flag true: images in the pdf file are processed
      pathXML =
          doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
      if (pathXML == null) {
        throw new Exception("PDF parsing fails");
      }
      doc.setPathXML(pathXML);
      ArrayList<String> tokenizations = doc.addFeaturesDocument();

      if (doc.getBlocks() == null) {
        throw new Exception("PDF parsing resulted in empty content");
      }

      String fulltext = doc.getFulltextFeatured(true, true);

      StringTokenizer st = new StringTokenizer(fulltext, "\n");
      feedTaggerAndParse(st);

      // rebuild the labelled result: one line per token, tab-separated features, then the label
      StringBuilder res = new StringBuilder();
      for (int i = 0; i < tagger.size(); i++) {
        for (int j = 0; j < tagger.xsize(); j++) {
          res.append(tagger.x(i, j)).append("\t");
        }
        res.append(tagger.y2(i));
        res.append("\n");
      }

      // buffer for the fulltext block
      String rese = res.toString();

      // set the different sections of the Document object
      doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);

      // header processing
      if (headerParser == null) {
        headerParser = new HeaderParser();
      }
      resHeader = new BiblioItem();
      headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);

      // The language identification is normally done during the header parsing, but only based
      // on header information; here it is redone on title + body. A missing title or body must
      // not contribute the literal text "null" to the sample, so each part is added only when
      // present.
      StringBuilder languageSample = new StringBuilder();
      String title = resHeader.getTitle();
      if (title != null) {
        languageSample.append(title);
      }
      languageSample.append("\n");
      String body = doc.getBody();
      if (body != null) {
        languageSample.append(body);
      }
      Language langu = languageUtilities.runLanguageId(languageSample.toString());
      if (langu != null) {
        String lang = langu.getLangId();
        doc.setLanguage(lang);
        resHeader.setLanguage(lang);
      }

      // citation processing
      if (citationParser == null) {
        citationParser = new CitationParser();
      }
      ArrayList<BibDataSet> resCitations = doc.bibDataSets;

      if (resCitations != null) {
        for (BibDataSet bds : resCitations) {
          // normalise the reference marker by dropping dots and spaces
          String marker = bds.getRefSymbol();
          if (marker != null) {
            marker = marker.replace(".", "");
            marker = marker.replace(" ", "");
            bds.setRefSymbol(marker);
          }
          BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
          bds.setResBib(bib);
        }
      }

      // final combination
      return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
      // keep it clean when leaving...
      doc.cleanLxmlFile(pathXML, false);
    }
  }