@Override
public void close() throws IOException {
    // Release the sub-parsers and null the references so they can be garbage
    // collected and so a second close() is a harmless no-op (the original code
    // threw NullPointerException on double close, and leaked citationParser
    // whenever super.close() or headerParser.close() threw).
    try {
        super.close();
    } finally {
        try {
            if (headerParser != null) {
                headerParser.close();
                headerParser = null;
            }
        } finally {
            // Always attempt to close the citation parser, even if the
            // header parser's close failed.
            if (citationParser != null) {
                citationParser.close();
                citationParser = null;
            }
        }
    }
}
/**
 * Full processing of a PDF document: converts the PDF to the intermediate XML
 * representation, runs the CRF segmentation, parses the header and the
 * citations, and returns the result serialized as a TEI string.
 *
 * @param input absolute path of the PDF file to process
 * @param consolidateHeader passed to the header parser to enable metadata
 *        consolidation — exact semantics defined by HeaderParser
 * @param consolidateCitations passed to the citation parser for each raw
 *        bibliographical reference — exact semantics defined by CitationParser
 * @return the TEI representation of the processed document
 * @throws GrobidResourceException if the input file or the temp path is
 *         null or does not exist
 * @throws GrobidException wrapping any failure during processing
 * @throws Exception declared for caller compatibility; internal failures are
 *         wrapped in GrobidException before leaving this method
 */
public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations) throws Exception {
    // --- precondition checks on the input file and the temp directory ---
    if (input == null) {
        throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because input file '" + inputFile.getAbsolutePath() + "' does not exists.");
    }
    if (tmpPath == null) {
        throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because temp path '" + tmpPath.getAbsolutePath() + "' does not exists.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
        // -1 means no page restriction: process the whole document.
        int startPage = -1;
        int endPage = -1;
        // Convert the PDF to the intermediate lxml representation: with timeout,
        // no forced pdf reloading. 'input' is the pdf absolute path, tmpPath is
        // the temp directory for the lxml file, and images in the pdf are
        // processed as well (last 'true' argument).
        pathXML = doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
        if (pathXML == null) {
            throw new Exception("PDF parsing fails");
        }
        doc.setPathXML(pathXML);
        ArrayList<String> tokenizations = doc.addFeaturesDocument();
        if (doc.getBlocks() == null) {
            throw new Exception("PDF parsing resulted in empty content");
        }
        // Feed the featured full text to the CRF tagger, line by line.
        String fulltext = doc.getFulltextFeatured(true, true);
        StringTokenizer st = new StringTokenizer(fulltext, "\n");
        feedTaggerAndParse(st);
        // Rebuild the labelled sequence: for each token row, the tab-separated
        // feature vector followed by the predicted label.
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < tagger.size(); i++) {
            for (int j = 0; j < tagger.xsize(); j++) {
                res.append(tagger.x(i, j)).append("\t");
            }
            res.append(tagger.y2(i));
            res.append("\n");
        }
        // buffer for the fulltext block
        String rese = res.toString();
        // set the different sections of the Document object
        doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);
        // header processing (parser is created lazily and cached on the field)
        if (headerParser == null) {
            headerParser = new HeaderParser();
        }
        resHeader = new BiblioItem();
        headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);
        // The language identification is normally done during the header parsing,
        // but only based on header information; here it is re-run on the title
        // plus the document body for a more reliable result.
        // LanguageUtilities languageUtilities = LanguageUtilities.getInstance();
        Language langu = languageUtilities.runLanguageId(resHeader.getTitle() + "\n" + doc.getBody());
        if (langu != null) {
            String lang = langu.getLangId();
            doc.setLanguage(lang);
            resHeader.setLanguage(lang);
        }
        // citation processing (parser is created lazily and cached on the field)
        if (citationParser == null) {
            citationParser = new CitationParser();
        }
        ArrayList<BibDataSet> resCitations;
        // ArrayList<String> tokenizationsRef = doc.getTokenizationsReferences();
        // System.out.println(tokenizationsRef.toString());
        // resCitations = BasicStructureBuilder.giveReferenceSegments(doc);
        resCitations = doc.bibDataSets;
        if (resCitations != null) {
            for (BibDataSet bds : resCitations) {
                // Normalize the reference marker (strip dots and spaces,
                // e.g. "[1. ]" -> "[1]") before parsing the raw reference.
                String marker = bds.getRefSymbol();
                if (marker != null) {
                    marker = marker.replace(".", "");
                    marker = marker.replace(" ", "");
                    bds.setRefSymbol(marker);
                }
                BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
                bds.setResBib(bib);
            }
        }
        // final combination: serialize everything to TEI
        return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
        // keep it clean when leaving...
        // NOTE(review): if pdf2xml itself failed, pathXML may still be null
        // here — presumably cleanLxmlFile tolerates a null path; confirm.
        doc.cleanLxmlFile(pathXML, false);
    }
}