/**
 * Releases the resources held by this parser and its lazily-created sub-parsers.
 *
 * <p>Null-safe: both {@code headerParser} and {@code citationParser} are created
 * lazily elsewhere in this class, so either may still be {@code null} here. The
 * citation parser is released even when closing the header parser fails, via
 * try/finally.
 *
 * @throws IOException if closing the superclass or an underlying parser fails
 */
@Override
public void close() throws IOException {
    super.close();
    try {
        if (headerParser != null) {
            headerParser.close();
            headerParser = null;
        }
    } finally {
        if (citationParser != null) {
            citationParser.close();
            citationParser = null;
        }
    }
}
/**
 * Process the full text of the specified pdf and format the result as training data.
 *
 * <p>Produces up to three files: the raw featured full text
 * ({@code *.training.fulltext}), a TEI file reflecting the extracted layout
 * ({@code *.training.fulltext.tei.xml}), and — when references were detected —
 * the citation training data ({@code *.training.references.xml}).
 *
 * @param inputFile input pdf file (absolute path)
 * @param pathFullText output directory for the raw fulltext training file
 * @param pathTEI output directory for the TEI training files
 * @param id document id written into the TEI {@code fileDesc xml:id}
 * @throws GrobidResourceException if the temp path or the input file is missing
 * @throws GrobidException wrapping any failure during extraction
 */
public void createTrainingFullText(
        String inputFile, String pathFullText, String pathTEI, int id) {
    if (tmpPath == null)
        throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    if (!tmpPath.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because temp path '"
                        + tmpPath.getAbsolutePath()
                        + "' does not exists.");
    }
    doc = new Document(inputFile, tmpPath.getAbsolutePath());
    try {
        // -1 means "no page restriction" for pdf2xml
        int startPage = -1;
        int endPage = -1;
        File file = new File(inputFile);
        if (!file.exists()) {
            // NOTE(review): "becuase" typo in this user-facing message — fixing it
            // changes a runtime string, so it is only flagged here.
            throw new GrobidResourceException(
                    "Cannot train for fulltext, becuase file '"
                            + file.getAbsolutePath()
                            + "' does not exists.");
        }
        String PDFFileName = file.getName();
        pathXML =
                doc.pdf2xml(true, false, startPage, endPage, inputFile, tmpPath.getAbsolutePath(), true);
        // with timeout,
        // no force pdf reloading
        // pathPDF is the pdf file, tmpPath is the tmp directory for the lxml file,
        // path is the resource path
        // and we don't extract images in the pdf file
        if (pathXML == null) {
            throw new Exception("PDF parsing fails");
        }
        doc.setPathXML(pathXML);
        doc.addFeaturesDocument();
        if (doc.getBlocks() == null) {
            throw new Exception("PDF parsing resulted in empty content");
        }
        String fulltext = doc.getFulltextFeatured(true, true);
        ArrayList<String> tokenizations = doc.getTokenizationsFulltext();
        // we write the full text untagged
        String outPathFulltext =
                pathFullText + "/" + PDFFileName.replace(".pdf", ".training.fulltext");
        // NOTE(review): writer is not in try-with-resources / finally — if a later
        // step throws, the stream leaks. Same for the TEI writer below.
        Writer writer =
                new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), "UTF-8");
        writer.write(fulltext + "\n");
        writer.close();
        // run the CRF tagger over the featured text, then dump features + labels
        StringTokenizer st = new StringTokenizer(fulltext, "\n");
        feedTaggerAndParse(st);
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < tagger.size(); i++) {
            for (int j = 0; j < tagger.xsize(); j++) {
                res.append(tagger.x(i, j)).append("\t");
            }
            res.append(tagger.y2(i));
            res.append("\n");
        }
        // buffer for the fulltext block
        String rese = res.toString();
        StringBuffer bufferFulltext = trainingExtraction(rese, tokenizations);
        // write the TEI file to reflect the extract layout of the text as extracted from the pdf
        writer =
                new OutputStreamWriter(
                        new FileOutputStream(
                                new File(
                                        pathTEI
                                                + "/"
                                                + PDFFileName.replace(".pdf", ".training.fulltext.tei.xml")),
                                false),
                        "UTF-8");
        writer.write(
                "<?xml version=\"1.0\" ?>\n<tei>\n\t<teiHeader>\n\t\t<fileDesc xml:id=\""
                        + id
                        + "\"/>\n\t</teiHeader>\n\t<text xml:lang=\"en\">\n");
        writer.write(bufferFulltext.toString());
        writer.write("\n\t</text>\n</tei>\n");
        writer.close();
        // output of the identified citations as training data
        // buffer for the reference block
        StringBuilder allBufferReference = new StringBuilder();
        // we need to rebuild the found citation string as it appears:
        // walk the tagger output line by line in parallel with the tokenization,
        // concatenating tokens of consecutive <reference>-labelled lines.
        String input = "";
        ArrayList<String> inputs = new ArrayList<String>();
        int q = 0;
        st = new StringTokenizer(rese, "\n");
        while (st.hasMoreTokens() && (q < tokenizations.size())) {
            String line = st.nextToken();
            String theTotalTok = tokenizations.get(q);
            String theTok = tokenizations.get(q);
            // skip whitespace tokens, folding them into the rebuilt string
            // NOTE(review): if the tokenization ends with whitespace, q++ here can
            // run past tokenizations.size() and throw IndexOutOfBoundsException —
            // the outer guard only checks q at loop entry. TODO confirm upstream
            // tokenization never ends in whitespace.
            while (theTok.equals(" ") || theTok.equals("\t") || theTok.equals("\n")) {
                q++;
                theTok = tokenizations.get(q);
                theTotalTok += theTok;
            }
            if (line.endsWith("I-<reference>")) {
                // start of a new reference: flush the previous one if non-trivial
                if (input.trim().length() > 1) {
                    inputs.add(input.trim());
                    input = "";
                }
                input += "\n" + theTotalTok;
            } else if (line.endsWith("<reference>")) {
                // continuation of the current reference
                input += theTotalTok;
            }
            q++;
        }
        // NOTE(review): the citation-extraction loop below only runs when a trailing
        // reference chunk remains in `input`; references already flushed into
        // `inputs` are otherwise never processed. Looks like a brace-placement bug —
        // the for-loop was presumably meant to sit outside this if. Verify intent.
        if (input.trim().length() > 1) {
            inputs.add(input.trim());
            if (citationParser == null) {
                citationParser = new CitationParser();
            }
            for (String inpu : inputs) {
                // the citation parser takes a list of raw reference strings
                ArrayList<String> inpus = new ArrayList<String>();
                inpus.add(inpu);
                StringBuilder bufferReference = citationParser.trainingExtraction(inpus);
                if (bufferReference != null) {
                    allBufferReference.append(bufferReference.toString()).append("\n");
                }
            }
        }
        // NOTE(review): allBufferReference is never null here (initialized above) —
        // the null check is redundant but harmless.
        if (allBufferReference != null) {
            if (allBufferReference.length() > 0) {
                Writer writerReference =
                        new OutputStreamWriter(
                                new FileOutputStream(
                                        new File(
                                                pathTEI
                                                        + "/"
                                                        + PDFFileName.replace(".pdf", ".training.references.xml")),
                                        false),
                                "UTF-8");
                writerReference.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
                writerReference.write("<citations>\n");
                writerReference.write(allBufferReference.toString());
                writerReference.write("</citations>\n");
                writerReference.close();
            }
        }
    } catch (Exception e) {
        throw new GrobidException(
                "An exception occured while running Grobid training"
                        + " data generation for full text.",
                e);
    } finally {
        // always remove the temporary lxml file (true = also remove images)
        doc.cleanLxmlFile(pathXML, true);
    }
}
/**
 * Runs the full GROBID extraction pipeline on a pdf and returns the TEI result.
 *
 * <p>Pipeline: pdf2xml conversion → feature extraction → CRF full-text tagging →
 * document segmentation → header parsing → language identification → citation
 * parsing → final TEI serialization via {@code toTEI}.
 *
 * <p>Side effects: reassigns the {@code doc} field, lazily creates
 * {@code headerParser} / {@code citationParser}, and overwrites {@code resHeader}.
 * Not safe for concurrent use on the same instance.
 *
 * @param input absolute path of the pdf file to process
 * @param consolidateHeader whether to consolidate the parsed header metadata
 * @param consolidateCitations whether to consolidate each parsed citation
 * @return the TEI representation of the document
 * @throws GrobidResourceException if the input file or temp path is missing
 * @throws GrobidException wrapping any failure during processing
 */
public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations)
        throws Exception {
    if (input == null) {
        throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because input file '"
                        + inputFile.getAbsolutePath()
                        + "' does not exists.");
    }
    if (tmpPath == null) {
        throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because temp path '"
                        + tmpPath.getAbsolutePath()
                        + "' does not exists.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
        // -1 means "no page restriction" for pdf2xml
        int startPage = -1;
        int endPage = -1;
        pathXML =
                doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
        // with timeout,
        // no force pdf reloading
        // input is the pdf absolute path, tmpPath is the temp. directory for the temp. lxml file,
        // path is the resource path
        // and we process images in the pdf file
        if (pathXML == null) {
            throw new Exception("PDF parsing fails");
        }
        doc.setPathXML(pathXML);
        ArrayList<String> tokenizations = doc.addFeaturesDocument();
        if (doc.getBlocks() == null) {
            throw new Exception("PDF parsing resulted in empty content");
        }
        String fulltext = doc.getFulltextFeatured(true, true);
        // run the CRF tagger over the featured text, then rebuild the
        // feature+label table it produced
        StringTokenizer st = new StringTokenizer(fulltext, "\n");
        feedTaggerAndParse(st);
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < tagger.size(); i++) {
            for (int j = 0; j < tagger.xsize(); j++) {
                res.append(tagger.x(i, j)).append("\t");
            }
            res.append(tagger.y2(i));
            res.append("\n");
        }
        // buffer for the fulltext block
        String rese = res.toString();
        // set the different sections of the Document object
        doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);
        // header processing
        if (headerParser == null) {
            headerParser = new HeaderParser();
        }
        resHeader = new BiblioItem();
        headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);
        // the language identification is normally done during the header parsing, but only
        // based on header information; redo it here on title + body for better accuracy.
        // LanguageUtilities languageUtilities = LanguageUtilities.getInstance();
        Language langu = languageUtilities.runLanguageId(resHeader.getTitle() + "\n" + doc.getBody());
        if (langu != null) {
            String lang = langu.getLangId();
            doc.setLanguage(lang);
            resHeader.setLanguage(lang);
        }
        // citation processing
        if (citationParser == null) {
            citationParser = new CitationParser();
        }
        ArrayList<BibDataSet> resCitations;
        // ArrayList<String> tokenizationsRef = doc.getTokenizationsReferences();
        // System.out.println(tokenizationsRef.toString());
        // resCitations = BasicStructureBuilder.giveReferenceSegments(doc);
        resCitations = doc.bibDataSets;
        if (resCitations != null) {
            for (BibDataSet bds : resCitations) {
                // normalize the reference marker (e.g. "[1]." -> "[1]") before parsing
                String marker = bds.getRefSymbol();
                if (marker != null) {
                    marker = marker.replace(".", "");
                    marker = marker.replace(" ", "");
                    bds.setRefSymbol(marker);
                }
                BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
                bds.setResBib(bib);
            }
        }
        // final combination
        return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
        // keep it clean when leaving... (false = keep extracted images)
        doc.cleanLxmlFile(pathXML, false);
    }
}