@Override
public void close() throws IOException {
    // Release the sub-parsers and null the references so they can be garbage
    // collected and so a second close() is a harmless no-op (the original code
    // threw NullPointerException on double close, and leaked citationParser
    // whenever super.close() or headerParser.close() threw).
    try {
        super.close();
    } finally {
        try {
            if (headerParser != null) {
                headerParser.close();
                headerParser = null;
            }
        } finally {
            // Always attempt to close the citation parser, even if the
            // header parser's close failed.
            if (citationParser != null) {
                citationParser.close();
                citationParser = null;
            }
        }
    }
}
/**
 * Full processing of a PDF document: converts the PDF to the intermediate XML
 * representation, runs the CRF segmentation, parses the header and the
 * citations, and returns the result serialized as a TEI string.
 *
 * @param input absolute path of the PDF file to process
 * @param consolidateHeader passed to the header parser to enable metadata
 *        consolidation — exact semantics defined by HeaderParser
 * @param consolidateCitations passed to the citation parser for each raw
 *        bibliographical reference — exact semantics defined by CitationParser
 * @return the TEI representation of the processed document
 * @throws GrobidResourceException if the input file or the temp path is
 *         null or does not exist
 * @throws GrobidException wrapping any failure during processing
 * @throws Exception declared for caller compatibility; internal failures are
 *         wrapped in GrobidException before leaving this method
 */
public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations) throws Exception {
    // --- precondition checks on the input file and the temp directory ---
    if (input == null) {
        throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because input file '" + inputFile.getAbsolutePath() + "' does not exists.");
    }
    if (tmpPath == null) {
        throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because temp path '" + tmpPath.getAbsolutePath() + "' does not exists.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
        // -1 means no page restriction: process the whole document.
        int startPage = -1;
        int endPage = -1;
        // Convert the PDF to the intermediate lxml representation: with timeout,
        // no forced pdf reloading. 'input' is the pdf absolute path, tmpPath is
        // the temp directory for the lxml file, and images in the pdf are
        // processed as well (last 'true' argument).
        pathXML = doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
        if (pathXML == null) {
            throw new Exception("PDF parsing fails");
        }
        doc.setPathXML(pathXML);
        ArrayList<String> tokenizations = doc.addFeaturesDocument();
        if (doc.getBlocks() == null) {
            throw new Exception("PDF parsing resulted in empty content");
        }
        // Feed the featured full text to the CRF tagger, line by line.
        String fulltext = doc.getFulltextFeatured(true, true);
        StringTokenizer st = new StringTokenizer(fulltext, "\n");
        feedTaggerAndParse(st);
        // Rebuild the labelled sequence: for each token row, the tab-separated
        // feature vector followed by the predicted label.
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < tagger.size(); i++) {
            for (int j = 0; j < tagger.xsize(); j++) {
                res.append(tagger.x(i, j)).append("\t");
            }
            res.append(tagger.y2(i));
            res.append("\n");
        }
        // buffer for the fulltext block
        String rese = res.toString();
        // set the different sections of the Document object
        doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);
        // header processing (parser is created lazily and cached on the field)
        if (headerParser == null) {
            headerParser = new HeaderParser();
        }
        resHeader = new BiblioItem();
        headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);
        // The language identification is normally done during the header parsing,
        // but only based on header information; here it is re-run on the title
        // plus the document body for a more reliable result.
        // LanguageUtilities languageUtilities = LanguageUtilities.getInstance();
        Language langu = languageUtilities.runLanguageId(resHeader.getTitle() + "\n" + doc.getBody());
        if (langu != null) {
            String lang = langu.getLangId();
            doc.setLanguage(lang);
            resHeader.setLanguage(lang);
        }
        // citation processing (parser is created lazily and cached on the field)
        if (citationParser == null) {
            citationParser = new CitationParser();
        }
        ArrayList<BibDataSet> resCitations;
        // ArrayList<String> tokenizationsRef = doc.getTokenizationsReferences();
        // System.out.println(tokenizationsRef.toString());
        // resCitations = BasicStructureBuilder.giveReferenceSegments(doc);
        resCitations = doc.bibDataSets;
        if (resCitations != null) {
            for (BibDataSet bds : resCitations) {
                // Normalize the reference marker (strip dots and spaces,
                // e.g. "[1. ]" -> "[1]") before parsing the raw reference.
                String marker = bds.getRefSymbol();
                if (marker != null) {
                    marker = marker.replace(".", "");
                    marker = marker.replace(" ", "");
                    bds.setRefSymbol(marker);
                }
                BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
                bds.setResBib(bib);
            }
        }
        // final combination: serialize everything to TEI
        return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
        // keep it clean when leaving...
        // NOTE(review): if pdf2xml itself failed, pathXML may still be null
        // here — presumably cleanLxmlFile tolerates a null path; confirm.
        doc.cleanLxmlFile(pathXML, false);
    }
}