public int accept(RxDocument rdoc) { // File corpusPath = (File) rdoc.getScope("document").get("corpus.path"); File targetRootDirectory = (File) rdoc.getScope("session").get("target.root.directory"); File statusFile0 = FileUtils.generateCorpusResourceFile(targetRootDirectory, corpusPath, ".tag.0"); File statusFile1 = FileUtils.generateCorpusResourceFile(targetRootDirectory, corpusPath, ".tag.1"); String corpusBaseFilename = FileUtils.getCorpusFilename(statusFile0); boolean doReprocess = ((Boolean) rdoc.getScope("session").get("reprocess.boolean")).booleanValue(); // APD if (!doReprocess) { if (statusFile0.exists() || statusFile1.exists()) { getLogger(rdoc).warn("skipping '" + corpusBaseFilename + "': already tagged"); // APD report an error message // rdoc.docErrorString("Already tagged"); rdoc.docInfoString("Already tagged"); return ReturnCode.ABORT_PAPER; } } if (!statusFile0.exists()) { statusFile0.getParentFile().mkdirs(); try { if (!statusFile0.createNewFile()) { getLogger(rdoc) .error("aborting '" + corpusBaseFilename + "'; couldn't create status file"); rdoc.docInfoString("STATUS FILE CREATION ERROR"); return ReturnCode.ABORT_PAPER; } } catch (IOException e) { getLogger(rdoc).error("aborting '" + corpusBaseFilename + "'; " + e.getMessage()); rdoc.docInfoString("STATUS FILE CREATION ERROR2"); return ReturnCode.ABORT_PAPER; } } // TODO : need to grab a file lock for this to be safe incrementIntAttribute(rdoc, "session", "metric.documents.attempted.integer"); // System.out.println( corpusPath.getName() + "..." ); // APD System.out.println("Processing file: " + corpusPath.toString() + " ..."); return ReturnCode.OK; }
public int accept(RxDocument rdoc) { Iterator i; // Get the document's error list LinkedList docErrors = (LinkedList) rdoc.getScope("document").get("error.list"); // No error strings registered for this document if (docErrors == null) { /* docErrors = new LinkedList(); docErrors.add("unreported error"); */ return ReturnCode.OK; } URI fileURI = (URI) rdoc.getScope("document").get("corpus.absolute.uri"); URI relURI = (URI) rdoc.getScope("document").get("corpus.relative.uri"); // Get the relative path String tmp = relURI.getPath(); // Chop off the final '.xml' tmp = tmp.substring(0, tmp.length() - 4); // Extract the original URL URL u; File f; try { f = new File(tmp); u = URLMangler.demangle(f); } catch (MalformedURLException e) { getLogger(rdoc).error("malformedURL: " + e); return ReturnCode.OK; } String xmllink = "<A href=\"" + fileURI.toString() + "\">" + fileURI.getPath() + "</A>"; String pslink; // Hack -- add a hint about the file type for .ps.gz files. This // is useful for getting mozilla to fire up ghostview properly. Mileage // with other browsers may vary. if (tmp.endsWith(".ps.gz")) { pslink = "<A href=\"" + u.toString() + "\" type=application/postscript>" + u.toString() + "</A>"; } else { pslink = "<A href=\"" + u.toString() + "\">" + u.toString() + "</A>"; } for (i = docErrors.iterator(); i.hasNext(); ) { String errorType = (String) i.next(); String logFileName = errorType; String logFileExt = ".err.html"; // if the first error of this type we've seen... if (!_errorTypes.contains(errorType)) { _errorTypes.add(errorType); clearLogFile(rdoc, logFileName, logFileExt); htmlHeader(rdoc, logFileName, logFileExt); } appendLogFile(rdoc, logFileName, logFileExt, pslink + "<br>"); appendLogFile(rdoc, logFileName, logFileExt, xmllink + "<br><br>"); } return ReturnCode.OK; }
/** @param rdoc */ private boolean doExtraction(RxDocument rdoc) { NewHtmlTokenization tokenization = rdoc.getTokenization(); ArrayList referenceElements = new ArrayList(); Map segmentations = (Map) rdoc.getScope("document").get("segmentation"); if (tokenization == null) { getLogger(rdoc).error("Partitioner found nothing to partition..."); rdoc.docErrorString("Partitioner found nothing to partition"); return false; } // Markup header if (_headersExtractor != null) { NewHtmlTokenization header = (NewHtmlTokenization) segmentations.get("headerTokenization"); if (header != null) { log.info("running crf on header"); if (header.clearTokenFeatures()) { log.warn("header tokens had features set before crf extraction"); } Extraction extraction = _headersExtractor.extract(header); // log.info("done."); Sequence predictedLabels = extraction.getDocumentExtraction(0).getPredictedLabels(); CRFOutputFormatter crfOutputFormatter = new CRFOutputFormatter(); Element element = crfOutputFormatter.toXmlElement(header, predictedLabels, "headers"); // Get the first token in the document PropertyHolder firstHeaderToken = header.getToken(0); // Get the token's position double llx = firstHeaderToken.getNumericProperty("llx"); double lly = firstHeaderToken.getNumericProperty("lly"); int pageNum = (int) firstHeaderToken.getNumericProperty("pageNum"); String persistentMentionID = "p" + pageNum + "x" + llx + "y" + lly; element.setAttribute("headerID", persistentMentionID); segmentations.put("headerElement", element); } } // Markup references if (_referencesExtractor != null) { List refList = (List) segmentations.get("referenceList"); if (refList == null) { getLogger(rdoc).error("no biblio to extract"); rdoc.docErrorString("no biblio to extract"); return false; } Iterator referenceIterator = refList.iterator(); // For outputing full file paths in the reference warnings int refNum = 1; while (referenceIterator.hasNext()) { // Extract reference NewHtmlTokenization reference = (NewHtmlTokenization) referenceIterator.next(); if (reference.clearTokenFeatures()) { log.warn("reference tokens had features set before crf extraction"); } log.info("running crf on reference " + refNum + " of " + refList.size()); Extraction extraction = _referencesExtractor.extract(reference); Sequence predictedLabels = extraction.getDocumentExtraction(0).getPredictedLabels(); // Check extracted reference for validity String warning = checkReference(reference, predictedLabels); if (!warning.equals("")) { log.error("Suspicous reference (" + refNum + "):" + warning); } CRFOutputFormatter crfOutputFormatter = new CRFOutputFormatter(); Element element = crfOutputFormatter.toXmlElement(reference, predictedLabels, "reference"); // Get the first token in the reference PropertyHolder firstRefToken = (PropertyHolder) reference.getToken(0); // Get the token's position double llx = firstRefToken.getNumericProperty("llx"); double lly = firstRefToken.getNumericProperty("lly"); int pageNum = (int) firstRefToken.getNumericProperty("pageNum"); String persistentMentionID = "p" + pageNum + "x" + llx + "y" + lly; element.setAttribute("refID", persistentMentionID); referenceElements.add(element); refNum++; } } segmentations.put("referenceElements", referenceElements); return true; }