コード例 #1
0
  /**
   * Analyzes the labels predicted for one reference in an attempt to find improperly segmented
   * (i.e. too long) references, and reports what looks suspicious.
   *
   * <p>The labels are scanned left to right. A field (ref-marker, authors, title) counts as "seen"
   * once a run of tags starting with its prefix has ended. A warning is recorded whenever
   *
   * <ul>
   *   <li>a new run of {@code ref-marker} tags starts after any of the three fields was seen,
   *   <li>a new run of {@code author} tags starts after authors were seen, or
   *   <li>a new run of {@code title} tags starts after a title was seen.
   * </ul>
   *
   * <p>This method only reports; it neither modifies nor truncates {@code tokens}.
   *
   * @param tokens the tokens of the reference; only its size is used, to assert that it matches
   *     the number of predicted tags
   * @param predictedTags the predicted labels, one per token; each label is compared through its
   *     {@code toString()} value
   * @return the concatenation of all warnings, each terminated by {@code ';'}, or the empty string
   *     if nothing suspicious was found
   */
  private String checkReference(NewHtmlTokenization tokens, Sequence predictedTags) {
    assert tokens.size() == predictedTags.size();

    boolean seenMarker = false;
    boolean seenAuthors = false;
    boolean seenTitle = false;
    StringBuilder warning = new StringBuilder();

    String previousTag = "";

    for (int i = 0; i < predictedTags.size(); i++) {
      String tag = predictedTags.get(i).toString();

      boolean inMarker = tag.startsWith("ref-marker");
      boolean inAuthors = tag.startsWith("author");
      boolean inTitle = tag.startsWith("title");
      boolean wasMarker = previousTag.startsWith("ref-marker");
      boolean wasAuthors = previousTag.startsWith("author");
      boolean wasTitle = previousTag.startsWith("title");

      // A field is complete as soon as its run of tags has just ended. These flags must be
      // updated before the duplicate checks below, so that e.g. authors directly followed by a
      // ref-marker is reported.
      if (wasMarker && !inMarker) {
        seenMarker = true;
      }
      if (wasAuthors && !inAuthors) {
        seenAuthors = true;
      }
      if (wasTitle && !inTitle) {
        seenTitle = true;
      }

      // FIXME: should we really flag a duplicate ref-marker if we haven't seen authors or
      // title yet?
      if (inMarker && !wasMarker && (seenMarker || seenAuthors || seenTitle)) {
        warning.append("duplicate ref-marker;");
      }
      if (inAuthors && !wasAuthors && seenAuthors) {
        warning.append("duplicate authors;");
      }
      if (inTitle && !wasTitle && seenTitle) {
        warning.append("duplicate title;");
      }

      previousTag = tag;
    }
    return warning.toString();
  }
コード例 #2
0
  /**
   * Runs the configured CRF extractors over the header and the references of a document and
   * stores the resulting XML elements in the document's "segmentation" map.
   *
   * <ul>
   *   <li>If a headers extractor is configured and the map contains a "headerTokenization", the
   *       labelled header is stored under "headerElement", carrying a "headerID" attribute.
   *   <li>If a references extractor is configured, every entry of "referenceList" is labelled,
   *       checked with {@code checkReference} (suspicious references are only logged) and turned
   *       into an element carrying a "refID" attribute.
   *   <li>The list of reference elements is stored under "referenceElements"; it is empty when no
   *       references extractor is configured.
   * </ul>
   *
   * <p>Both IDs are derived from the position of the first token of the header/reference, so
   * each tokenization is expected to contain at least one token.
   *
   * @param rdoc the document to process; its tokenization and its "document" scope are read
   * @return {@code false} if the document has no tokenization, or if a references extractor is
   *     configured but the segmentation has no "referenceList" (in that case a header element may
   *     already have been stored, but "referenceElements" is not); {@code true} otherwise
   */
  private boolean doExtraction(RxDocument rdoc) {
    NewHtmlTokenization tokenization = rdoc.getTokenization();
    ArrayList referenceElements = new ArrayList();
    Map segmentations = (Map) rdoc.getScope("document").get("segmentation");

    if (tokenization == null) {
      getLogger(rdoc).error("Partitioner found nothing to partition...");
      rdoc.docErrorString("Partitioner found nothing to partition");
      return false;
    }

    // Markup header
    if (_headersExtractor != null) {
      NewHtmlTokenization header = (NewHtmlTokenization) segmentations.get("headerTokenization");
      if (header != null) {
        log.info("running crf on header");
        if (header.clearTokenFeatures()) {
          log.warn("header tokens had features set before crf extraction");
        }

        Extraction extraction = _headersExtractor.extract(header);
        Sequence predictedLabels = extraction.getDocumentExtraction(0).getPredictedLabels();
        CRFOutputFormatter crfOutputFormatter = new CRFOutputFormatter();
        Element element = crfOutputFormatter.toXmlElement(header, predictedLabels, "headers");

        // Identify the header by the position of its first token
        element.setAttribute("headerID", buildPersistentMentionID(header.getToken(0)));
        segmentations.put("headerElement", element);
      }
    }

    // Markup references
    if (_referencesExtractor != null) {
      List refList = (List) segmentations.get("referenceList");
      if (refList == null) {
        getLogger(rdoc).error("no biblio to extract");
        rdoc.docErrorString("no biblio to extract");
        return false;
      }

      // 1-based position of the current reference, used in log messages only
      int refNum = 1;
      Iterator referenceIterator = refList.iterator();
      while (referenceIterator.hasNext()) {
        // Extract reference
        NewHtmlTokenization reference = (NewHtmlTokenization) referenceIterator.next();
        if (reference.clearTokenFeatures()) {
          log.warn("reference tokens had features set before crf extraction");
        }
        log.info("running crf on reference " + refNum + " of " + refList.size());
        Extraction extraction = _referencesExtractor.extract(reference);
        Sequence predictedLabels = extraction.getDocumentExtraction(0).getPredictedLabels();

        // Check extracted reference for validity; a suspicious reference is logged but still kept
        String warning = checkReference(reference, predictedLabels);
        if (!warning.equals("")) {
          log.error("Suspicous reference (" + refNum + "):" + warning);
        }
        CRFOutputFormatter crfOutputFormatter = new CRFOutputFormatter();
        Element element = crfOutputFormatter.toXmlElement(reference, predictedLabels, "reference");

        // Identify the reference by the position of its first token
        element.setAttribute("refID", buildPersistentMentionID(reference.getToken(0)));

        referenceElements.add(element);
        refNum++;
      }
    }
    segmentations.put("referenceElements", referenceElements);

    return true;
  }

  /**
   * Builds the identifier of a mention from the position of a token, in the form
   * {@code "p" + pageNum + "x" + llx + "y" + lly}, where the three values are the token's numeric
   * properties of the same names and the page number is truncated to an int.
   *
   * @param token the token whose "llx", "lly" and "pageNum" properties are read
   * @return the position-based identifier
   */
  private String buildPersistentMentionID(PropertyHolder token) {
    double llx = token.getNumericProperty("llx");
    double lly = token.getNumericProperty("lly");
    int pageNum = (int) token.getNumericProperty("pageNum");
    return "p" + pageNum + "x" + llx + "y" + lly;
  }