예제 #1
0
 /**
  * Releases the resources held by this parser and its sub-parsers.
  *
  * <p>Null-guards make this method idempotent: calling it twice, or when a sub-parser was never
  * lazily created, no longer throws a {@link NullPointerException}.
  *
  * @throws IOException if closing the underlying resources fails
  */
 @Override
 public void close() throws IOException {
   super.close();
   if (headerParser != null) {
     headerParser.close();
     headerParser = null;
   }
   if (citationParser != null) {
     citationParser.close();
     citationParser = null;
   }
 }
예제 #2
0
  /**
   * Processes the full text of the specified PDF and writes the result as training data: the
   * untagged feature file, a TEI file reflecting the extracted full-text layout, and — when
   * reference strings are detected — a separate XML file with citation training data.
   *
   * @param inputFile path to the input PDF file
   * @param pathFullText directory where the untagged fulltext training file is written
   * @param pathTEI directory where the TEI training files are written
   * @param id identifier inserted into the generated TEI header
   * @throws GrobidResourceException if the temp path or the input file is missing
   * @throws GrobidException wrapping any failure during PDF parsing or training-data generation
   */
  public void createTrainingFullText(
      String inputFile, String pathFullText, String pathTEI, int id) {
    if (tmpPath == null)
      throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    if (!tmpPath.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because temp path '"
              + tmpPath.getAbsolutePath()
              + "' does not exist.");
    }
    doc = new Document(inputFile, tmpPath.getAbsolutePath());
    try {
      int startPage = -1;
      int endPage = -1;
      File file = new File(inputFile);
      if (!file.exists()) {
        throw new GrobidResourceException(
            "Cannot train for fulltext, because file '"
                + file.getAbsolutePath()
                + "' does not exist.");
      }
      String pdfFileName = file.getName();
      pathXML =
          doc.pdf2xml(true, false, startPage, endPage, inputFile, tmpPath.getAbsolutePath(), true);
      // with timeout,
      // no force pdf reloading
      // pathPDF is the pdf file, tmpPath is the tmp directory for the lxml file,
      // path is the resource path
      // and we don't extract images in the pdf file
      if (pathXML == null) {
        throw new Exception("PDF parsing fails");
      }
      doc.setPathXML(pathXML);
      doc.addFeaturesDocument();

      if (doc.getBlocks() == null) {
        throw new Exception("PDF parsing resulted in empty content");
      }

      String fulltext = doc.getFulltextFeatured(true, true);
      ArrayList<String> tokenizations = doc.getTokenizationsFulltext();

      // we write the full text untagged
      String outPathFulltext =
          pathFullText + "/" + pdfFileName.replace(".pdf", ".training.fulltext");
      Writer writer =
          new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), "UTF-8");
      try {
        writer.write(fulltext + "\n");
      } finally {
        // release the file handle even if the write fails
        writer.close();
      }

      StringTokenizer st = new StringTokenizer(fulltext, "\n");
      feedTaggerAndParse(st);

      // serialize the tagger output: one token per line, features tab-separated, label last
      StringBuilder res = new StringBuilder();
      for (int i = 0; i < tagger.size(); i++) {
        for (int j = 0; j < tagger.xsize(); j++) {
          res.append(tagger.x(i, j)).append("\t");
        }
        res.append(tagger.y2(i));
        res.append("\n");
      }

      // buffer for the fulltext block
      String rese = res.toString();
      StringBuffer bufferFulltext = trainingExtraction(rese, tokenizations);

      // write the TEI file to reflect the extract layout of the text as extracted from the pdf
      writer =
          new OutputStreamWriter(
              new FileOutputStream(
                  new File(
                      pathTEI + "/" + pdfFileName.replace(".pdf", ".training.fulltext.tei.xml")),
                  false),
              "UTF-8");
      try {
        writer.write(
            "<?xml version=\"1.0\" ?>\n<tei>\n\t<teiHeader>\n\t\t<fileDesc xml:id=\""
                + id
                + "\"/>\n\t</teiHeader>\n\t<text xml:lang=\"en\">\n");
        writer.write(bufferFulltext.toString());
        writer.write("\n\t</text>\n</tei>\n");
      } finally {
        writer.close();
      }

      // output of the identified citations as training data

      // buffer for the reference block
      StringBuilder allBufferReference = new StringBuilder();
      // we need to rebuild the found citation string as it appears
      String input = "";
      ArrayList<String> inputs = new ArrayList<String>();
      int q = 0;
      st = new StringTokenizer(rese, "\n");
      while (st.hasMoreTokens() && (q < tokenizations.size())) {
        String line = st.nextToken();
        String theTotalTok = tokenizations.get(q);
        String theTok = tokenizations.get(q);
        // skip pure-whitespace tokens; bounds check guards against a trailing whitespace run
        while ((theTok.equals(" ") || theTok.equals("\t") || theTok.equals("\n"))
            && (q + 1 < tokenizations.size())) {
          q++;
          theTok = tokenizations.get(q);
          theTotalTok += theTok;
        }
        if (line.endsWith("I-<reference>")) {
          // start of a new reference: flush the one accumulated so far
          if (input.trim().length() > 1) {
            inputs.add(input.trim());
            input = "";
          }
          input += "\n" + theTotalTok;
        } else if (line.endsWith("<reference>")) {
          input += theTotalTok;
        }
        q++;
      }
      if (input.trim().length() > 1) {
        inputs.add(input.trim());
        if (citationParser == null) {
          citationParser = new CitationParser();
        }
        for (String inpu : inputs) {
          ArrayList<String> inpus = new ArrayList<String>();
          inpus.add(inpu);
          StringBuilder bufferReference = citationParser.trainingExtraction(inpus);
          if (bufferReference != null) {
            allBufferReference.append(bufferReference.toString()).append("\n");
          }
        }
      }

      // allBufferReference is always non-null here; only write the file when it has content
      if (allBufferReference.length() > 0) {
        Writer writerReference =
            new OutputStreamWriter(
                new FileOutputStream(
                    new File(
                        pathTEI + "/" + pdfFileName.replace(".pdf", ".training.references.xml")),
                    false),
                "UTF-8");
        try {
          writerReference.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
          writerReference.write("<citations>\n");
          writerReference.write(allBufferReference.toString());
          writerReference.write("</citations>\n");
        } finally {
          writerReference.close();
        }
      }

    } catch (Exception e) {
      throw new GrobidException(
          "An exception occurred while running Grobid training"
              + " data generation for full text.",
          e);
    } finally {
      doc.cleanLxmlFile(pathXML, true);
    }
  }
예제 #3
0
  /**
   * Runs the full processing chain on a PDF file — segmentation, header parsing, language
   * identification and citation parsing — and returns the result serialized as TEI.
   *
   * @param input absolute path to the PDF file to process
   * @param consolidateHeader whether to consolidate the extracted header metadata
   * @param consolidateCitations whether to consolidate the extracted citations
   * @return the TEI representation of the processed document
   * @throws GrobidResourceException if the input file or the temp path is missing
   * @throws Exception any processing failure, wrapped as a GrobidException
   */
  public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations)
      throws Exception {
    if (input == null) {
      throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because input file '"
              + inputFile.getAbsolutePath()
              + "' does not exist.");
    }
    if (tmpPath == null) {
      throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because temp path '"
              + tmpPath.getAbsolutePath()
              + "' does not exist.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
      int startPage = -1;
      int endPage = -1;
      pathXML =
          doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
      // with timeout,
      // no force pdf reloading
      // input is the pdf absolute path, tmpPath is the temp. directory for the temp. lxml file,
      // path is the resource path
      // NOTE(review): this comment claimed images are processed, but the sibling training method
      // passes the same arguments and says images are NOT extracted — confirm which is accurate
      if (pathXML == null) {
        throw new Exception("PDF parsing fails");
      }
      doc.setPathXML(pathXML);
      ArrayList<String> tokenizations = doc.addFeaturesDocument();

      if (doc.getBlocks() == null) {
        throw new Exception("PDF parsing resulted in empty content");
      }

      String fulltext = doc.getFulltextFeatured(true, true);

      StringTokenizer st = new StringTokenizer(fulltext, "\n");
      feedTaggerAndParse(st);

      // serialize the tagger output: one token per line, features tab-separated, label last
      StringBuilder res = new StringBuilder();
      for (int i = 0; i < tagger.size(); i++) {
        for (int j = 0; j < tagger.xsize(); j++) {
          res.append(tagger.x(i, j)).append("\t");
        }
        res.append(tagger.y2(i));
        res.append("\n");
      }

      // buffer for the fulltext block
      String rese = res.toString();

      // set the different sections of the Document object
      doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);

      // header processing (lazily created, reused across calls)
      if (headerParser == null) {
        headerParser = new HeaderParser();
      }
      resHeader = new BiblioItem();
      headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);
      // the language identification is normally done during the header parsing, but only
      // based on header information; redo it here over title + body for a better signal
      Language langu = languageUtilities.runLanguageId(resHeader.getTitle() + "\n" + doc.getBody());
      if (langu != null) {
        String lang = langu.getLangId();
        doc.setLanguage(lang);
        resHeader.setLanguage(lang);
      }

      // citation processing (lazily created, reused across calls)
      if (citationParser == null) {
        citationParser = new CitationParser();
      }
      ArrayList<BibDataSet> resCitations = doc.bibDataSets;

      if (resCitations != null) {
        for (BibDataSet bds : resCitations) {
          // normalize the reference marker before resolving the raw bibliographic string
          String marker = bds.getRefSymbol();
          if (marker != null) {
            marker = marker.replace(".", "");
            marker = marker.replace(" ", "");
            bds.setRefSymbol(marker);
          }
          BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
          bds.setResBib(bib);
        }
      }

      // final combination
      return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
      throw new GrobidException("An exception occurred while running Grobid.", e);
    } finally {
      // keep it clean when leaving...
      doc.cleanLxmlFile(pathXML, false);
    }
  }