// Analyze the predicted labels for a reference in an attempt to find improperly segmented // (i.e. too long) references. If a suspect reference is found, return a truncated // NewHtmlTokenization; // otherwise return null. // private NewHtmlTokenization fixReference(NewHtmlTokenization tokens, Sequence predictedTags) { private String checkReference(NewHtmlTokenization tokens, Sequence predictedTags) { assert tokens.size() == predictedTags.size(); boolean seenMarker = false; boolean seenAuthors = false; boolean seenTitle = false; String warning = ""; String previousTag = ""; for (int i = 0; i < predictedTags.size(); i++) { String tag = predictedTags.get(i).toString(); boolean truncateHere = false; if (previousTag.startsWith("ref-marker") && !tag.startsWith("ref-marker")) { seenMarker = true; } if (previousTag.startsWith("author") && !tag.startsWith("author")) { seenAuthors = true; } if (previousTag.startsWith("title") && !tag.startsWith("title")) { seenTitle = true; } // FIXME: should we really truncate on duplicate ref-marker if we haven't seen authors or // title yet? boolean newMarker = (tag.startsWith("ref-marker") && !previousTag.startsWith("ref-marker")); if ((seenMarker || seenAuthors || seenTitle) && newMarker) { truncateHere = true; warning = warning + "duplicate ref-marker;"; } boolean newAuthor = (tag.startsWith("author") && !previousTag.startsWith("author")); if (seenAuthors && newAuthor) { truncateHere = true; warning = warning + "duplicate authors;"; } boolean newTitle = (tag.startsWith("title") && !previousTag.startsWith("title")); if (seenTitle && newTitle) { truncateHere = true; warning = warning + "duplicate title;"; } previousTag = tag; } return warning; }
/** @param rdoc */ private boolean doExtraction(RxDocument rdoc) { NewHtmlTokenization tokenization = rdoc.getTokenization(); ArrayList referenceElements = new ArrayList(); Map segmentations = (Map) rdoc.getScope("document").get("segmentation"); if (tokenization == null) { getLogger(rdoc).error("Partitioner found nothing to partition..."); rdoc.docErrorString("Partitioner found nothing to partition"); return false; } // Markup header if (_headersExtractor != null) { NewHtmlTokenization header = (NewHtmlTokenization) segmentations.get("headerTokenization"); if (header != null) { log.info("running crf on header"); if (header.clearTokenFeatures()) { log.warn("header tokens had features set before crf extraction"); } Extraction extraction = _headersExtractor.extract(header); // log.info("done."); Sequence predictedLabels = extraction.getDocumentExtraction(0).getPredictedLabels(); CRFOutputFormatter crfOutputFormatter = new CRFOutputFormatter(); Element element = crfOutputFormatter.toXmlElement(header, predictedLabels, "headers"); // Get the first token in the document PropertyHolder firstHeaderToken = header.getToken(0); // Get the token's position double llx = firstHeaderToken.getNumericProperty("llx"); double lly = firstHeaderToken.getNumericProperty("lly"); int pageNum = (int) firstHeaderToken.getNumericProperty("pageNum"); String persistentMentionID = "p" + pageNum + "x" + llx + "y" + lly; element.setAttribute("headerID", persistentMentionID); segmentations.put("headerElement", element); } } // Markup references if (_referencesExtractor != null) { List refList = (List) segmentations.get("referenceList"); if (refList == null) { getLogger(rdoc).error("no biblio to extract"); rdoc.docErrorString("no biblio to extract"); return false; } Iterator referenceIterator = refList.iterator(); // For outputing full file paths in the reference warnings int refNum = 1; while (referenceIterator.hasNext()) { // Extract reference NewHtmlTokenization reference = (NewHtmlTokenization) referenceIterator.next(); if (reference.clearTokenFeatures()) { log.warn("reference tokens had features set before crf extraction"); } log.info("running crf on reference " + refNum + " of " + refList.size()); Extraction extraction = _referencesExtractor.extract(reference); Sequence predictedLabels = extraction.getDocumentExtraction(0).getPredictedLabels(); // Check extracted reference for validity String warning = checkReference(reference, predictedLabels); if (!warning.equals("")) { log.error("Suspicous reference (" + refNum + "):" + warning); } CRFOutputFormatter crfOutputFormatter = new CRFOutputFormatter(); Element element = crfOutputFormatter.toXmlElement(reference, predictedLabels, "reference"); // Get the first token in the reference PropertyHolder firstRefToken = (PropertyHolder) reference.getToken(0); // Get the token's position double llx = firstRefToken.getNumericProperty("llx"); double lly = firstRefToken.getNumericProperty("lly"); int pageNum = (int) firstRefToken.getNumericProperty("pageNum"); String persistentMentionID = "p" + pageNum + "x" + llx + "y" + lly; element.setAttribute("refID", persistentMentionID); referenceElements.add(element); refNum++; } } segmentations.put("referenceElements", referenceElements); return true; }