/**
 * Releases the resources held by this parser and its lazily-created sub-parsers.
 *
 * <p>Null-safe: both {@code headerParser} and {@code citationParser} are created
 * lazily elsewhere in this class, so either may still be {@code null} here. The
 * citation parser is released even when closing the header parser fails, via
 * try/finally.
 *
 * @throws IOException if closing the superclass or an underlying parser fails
 */
@Override
public void close() throws IOException {
    super.close();
    try {
        if (headerParser != null) {
            headerParser.close();
            headerParser = null;
        }
    } finally {
        if (citationParser != null) {
            citationParser.close();
            citationParser = null;
        }
    }
}
/**
 * Process the full text of the specified pdf and format the result as training data.
 *
 * <p>Produces up to three files: the raw featured full text
 * ({@code *.training.fulltext}), a TEI file reflecting the extracted layout
 * ({@code *.training.fulltext.tei.xml}), and — when references were detected —
 * the citation training data ({@code *.training.references.xml}).
 *
 * @param inputFile input pdf file (absolute path)
 * @param pathFullText output directory for the raw fulltext training file
 * @param pathTEI output directory for the TEI training files
 * @param id document id written into the TEI {@code fileDesc xml:id}
 * @throws GrobidResourceException if the temp path or the input file is missing
 * @throws GrobidException wrapping any failure during extraction
 */
public void createTrainingFullText(
        String inputFile, String pathFullText, String pathTEI, int id) {
    if (tmpPath == null)
        throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    if (!tmpPath.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because temp path '"
                        + tmpPath.getAbsolutePath()
                        + "' does not exists.");
    }
    doc = new Document(inputFile, tmpPath.getAbsolutePath());
    try {
        // -1 means "no page restriction" for pdf2xml
        int startPage = -1;
        int endPage = -1;
        File file = new File(inputFile);
        if (!file.exists()) {
            // NOTE(review): "becuase" typo in this user-facing message — fixing it
            // changes a runtime string, so it is only flagged here.
            throw new GrobidResourceException(
                    "Cannot train for fulltext, becuase file '"
                            + file.getAbsolutePath()
                            + "' does not exists.");
        }
        String PDFFileName = file.getName();
        pathXML =
                doc.pdf2xml(true, false, startPage, endPage, inputFile, tmpPath.getAbsolutePath(), true);
        // with timeout,
        // no force pdf reloading
        // pathPDF is the pdf file, tmpPath is the tmp directory for the lxml file,
        // path is the resource path
        // and we don't extract images in the pdf file
        if (pathXML == null) {
            throw new Exception("PDF parsing fails");
        }
        doc.setPathXML(pathXML);
        doc.addFeaturesDocument();
        if (doc.getBlocks() == null) {
            throw new Exception("PDF parsing resulted in empty content");
        }
        String fulltext = doc.getFulltextFeatured(true, true);
        ArrayList<String> tokenizations = doc.getTokenizationsFulltext();
        // we write the full text untagged
        String outPathFulltext =
                pathFullText + "/" + PDFFileName.replace(".pdf", ".training.fulltext");
        // NOTE(review): writer is not in try-with-resources / finally — if a later
        // step throws, the stream leaks. Same for the TEI writer below.
        Writer writer =
                new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), "UTF-8");
        writer.write(fulltext + "\n");
        writer.close();
        // run the CRF tagger over the featured text, then dump features + labels
        StringTokenizer st = new StringTokenizer(fulltext, "\n");
        feedTaggerAndParse(st);
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < tagger.size(); i++) {
            for (int j = 0; j < tagger.xsize(); j++) {
                res.append(tagger.x(i, j)).append("\t");
            }
            res.append(tagger.y2(i));
            res.append("\n");
        }
        // buffer for the fulltext block
        String rese = res.toString();
        StringBuffer bufferFulltext = trainingExtraction(rese, tokenizations);
        // write the TEI file to reflect the extract layout of the text as extracted from the pdf
        writer =
                new OutputStreamWriter(
                        new FileOutputStream(
                                new File(
                                        pathTEI
                                                + "/"
                                                + PDFFileName.replace(".pdf", ".training.fulltext.tei.xml")),
                                false),
                        "UTF-8");
        writer.write(
                "<?xml version=\"1.0\" ?>\n<tei>\n\t<teiHeader>\n\t\t<fileDesc xml:id=\""
                        + id
                        + "\"/>\n\t</teiHeader>\n\t<text xml:lang=\"en\">\n");
        writer.write(bufferFulltext.toString());
        writer.write("\n\t</text>\n</tei>\n");
        writer.close();
        // output of the identified citations as training data
        // buffer for the reference block
        StringBuilder allBufferReference = new StringBuilder();
        // we need to rebuild the found citation string as it appears:
        // walk the tagger output line by line in parallel with the tokenization,
        // concatenating tokens of consecutive <reference>-labelled lines.
        String input = "";
        ArrayList<String> inputs = new ArrayList<String>();
        int q = 0;
        st = new StringTokenizer(rese, "\n");
        while (st.hasMoreTokens() && (q < tokenizations.size())) {
            String line = st.nextToken();
            String theTotalTok = tokenizations.get(q);
            String theTok = tokenizations.get(q);
            // skip whitespace tokens, folding them into the rebuilt string
            // NOTE(review): if the tokenization ends with whitespace, q++ here can
            // run past tokenizations.size() and throw IndexOutOfBoundsException —
            // the outer guard only checks q at loop entry. TODO confirm upstream
            // tokenization never ends in whitespace.
            while (theTok.equals(" ") || theTok.equals("\t") || theTok.equals("\n")) {
                q++;
                theTok = tokenizations.get(q);
                theTotalTok += theTok;
            }
            if (line.endsWith("I-<reference>")) {
                // start of a new reference: flush the previous one if non-trivial
                if (input.trim().length() > 1) {
                    inputs.add(input.trim());
                    input = "";
                }
                input += "\n" + theTotalTok;
            } else if (line.endsWith("<reference>")) {
                // continuation of the current reference
                input += theTotalTok;
            }
            q++;
        }
        // NOTE(review): the citation-extraction loop below only runs when a trailing
        // reference chunk remains in `input`; references already flushed into
        // `inputs` are otherwise never processed. Looks like a brace-placement bug —
        // the for-loop was presumably meant to sit outside this if. Verify intent.
        if (input.trim().length() > 1) {
            inputs.add(input.trim());
            if (citationParser == null) {
                citationParser = new CitationParser();
            }
            for (String inpu : inputs) {
                // the citation parser takes a list of raw reference strings
                ArrayList<String> inpus = new ArrayList<String>();
                inpus.add(inpu);
                StringBuilder bufferReference = citationParser.trainingExtraction(inpus);
                if (bufferReference != null) {
                    allBufferReference.append(bufferReference.toString()).append("\n");
                }
            }
        }
        // NOTE(review): allBufferReference is never null here (initialized above) —
        // the null check is redundant but harmless.
        if (allBufferReference != null) {
            if (allBufferReference.length() > 0) {
                Writer writerReference =
                        new OutputStreamWriter(
                                new FileOutputStream(
                                        new File(
                                                pathTEI
                                                        + "/"
                                                        + PDFFileName.replace(".pdf", ".training.references.xml")),
                                        false),
                                "UTF-8");
                writerReference.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
                writerReference.write("<citations>\n");
                writerReference.write(allBufferReference.toString());
                writerReference.write("</citations>\n");
                writerReference.close();
            }
        }
    } catch (Exception e) {
        throw new GrobidException(
                "An exception occured while running Grobid training"
                        + " data generation for full text.",
                e);
    } finally {
        // always remove the temporary lxml file (true = also remove images)
        doc.cleanLxmlFile(pathXML, true);
    }
}
/**
 * Runs the full GROBID extraction pipeline on a pdf and returns the TEI result.
 *
 * <p>Pipeline: pdf2xml conversion → feature extraction → CRF full-text tagging →
 * document segmentation → header parsing → language identification → citation
 * parsing → final TEI serialization via {@code toTEI}.
 *
 * <p>Side effects: reassigns the {@code doc} field, lazily creates
 * {@code headerParser} / {@code citationParser}, and overwrites {@code resHeader}.
 * Not safe for concurrent use on the same instance.
 *
 * @param input absolute path of the pdf file to process
 * @param consolidateHeader whether to consolidate the parsed header metadata
 * @param consolidateCitations whether to consolidate each parsed citation
 * @return the TEI representation of the document
 * @throws GrobidResourceException if the input file or temp path is missing
 * @throws GrobidException wrapping any failure during processing
 */
public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations)
        throws Exception {
    if (input == null) {
        throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because input file '"
                        + inputFile.getAbsolutePath()
                        + "' does not exists.");
    }
    if (tmpPath == null) {
        throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
        throw new GrobidResourceException(
                "Cannot process pdf file, because temp path '"
                        + tmpPath.getAbsolutePath()
                        + "' does not exists.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
        // -1 means "no page restriction" for pdf2xml
        int startPage = -1;
        int endPage = -1;
        pathXML =
                doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
        // with timeout,
        // no force pdf reloading
        // input is the pdf absolute path, tmpPath is the temp. directory for the temp. lxml file,
        // path is the resource path
        // and we process images in the pdf file
        if (pathXML == null) {
            throw new Exception("PDF parsing fails");
        }
        doc.setPathXML(pathXML);
        ArrayList<String> tokenizations = doc.addFeaturesDocument();
        if (doc.getBlocks() == null) {
            throw new Exception("PDF parsing resulted in empty content");
        }
        String fulltext = doc.getFulltextFeatured(true, true);
        // run the CRF tagger over the featured text, then rebuild the
        // feature+label table it produced
        StringTokenizer st = new StringTokenizer(fulltext, "\n");
        feedTaggerAndParse(st);
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < tagger.size(); i++) {
            for (int j = 0; j < tagger.xsize(); j++) {
                res.append(tagger.x(i, j)).append("\t");
            }
            res.append(tagger.y2(i));
            res.append("\n");
        }
        // buffer for the fulltext block
        String rese = res.toString();
        // set the different sections of the Document object
        doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);
        // header processing
        if (headerParser == null) {
            headerParser = new HeaderParser();
        }
        resHeader = new BiblioItem();
        headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);
        // the language identification is normally done during the header parsing, but only
        // based on header information; redo it here on title + body for better accuracy.
        // LanguageUtilities languageUtilities = LanguageUtilities.getInstance();
        Language langu = languageUtilities.runLanguageId(resHeader.getTitle() + "\n" + doc.getBody());
        if (langu != null) {
            String lang = langu.getLangId();
            doc.setLanguage(lang);
            resHeader.setLanguage(lang);
        }
        // citation processing
        if (citationParser == null) {
            citationParser = new CitationParser();
        }
        ArrayList<BibDataSet> resCitations;
        // ArrayList<String> tokenizationsRef = doc.getTokenizationsReferences();
        // System.out.println(tokenizationsRef.toString());
        // resCitations = BasicStructureBuilder.giveReferenceSegments(doc);
        resCitations = doc.bibDataSets;
        if (resCitations != null) {
            for (BibDataSet bds : resCitations) {
                // normalize the reference marker (e.g. "[1]." -> "[1]") before parsing
                String marker = bds.getRefSymbol();
                if (marker != null) {
                    marker = marker.replace(".", "");
                    marker = marker.replace(" ", "");
                    bds.setRefSymbol(marker);
                }
                BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
                bds.setResBib(bib);
            }
        }
        // final combination
        return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
        // keep it clean when leaving... (false = keep extracted images)
        doc.cleanLxmlFile(pathXML, false);
    }
}