Java RxDocument 예제들

프로그래밍 언어: Java

네임스페이스/패키지 이름: org.rexo.pipeline.components

클래스/타입: RxDocument

hotexamples.com에서의 예제들: 3

Java RxDocument - 3개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Java org.rexo.pipeline.components.RxDocument의 실제 사용 예제 중 가장 높은 평가를 받은 것들입니다. 예제를 평가해 주시면 예제의 품질 향상에 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

getScope(3)

docErrorString(1)

docInfoString(1)

getTokenization(1)

예제 #1

파일 보기

파일: MetatagPreconditionTestFilter.java 프로젝트: allenai/rexa1-metatagger

  /**
   * Decides whether a document should be processed, based on its ".tag.0" / ".tag.1" status files.
   *
   * <p>Unless the session's "reprocess.boolean" flag is set, a document for which either status
   * file already exists is skipped. Otherwise the ".tag.0" status file is created if it is missing,
   * the session's "metric.documents.attempted.integer" attribute is incremented, and a progress
   * line is printed to standard output.
   *
   * @param rdoc document whose "document" scope supplies "corpus.path" and whose "session" scope
   *     supplies "target.root.directory" and "reprocess.boolean"
   * @return {@code ReturnCode.ABORT_PAPER} when the document is already tagged or the status file
   *     could not be created; {@code ReturnCode.OK} otherwise
   */
  public int accept(RxDocument rdoc) {
    File corpusPath = (File) rdoc.getScope("document").get("corpus.path");
    File targetRoot = (File) rdoc.getScope("session").get("target.root.directory");

    // Status markers that live alongside the corpus resource under the target root.
    File tagFile0 = FileUtils.generateCorpusResourceFile(targetRoot, corpusPath, ".tag.0");
    File tagFile1 = FileUtils.generateCorpusResourceFile(targetRoot, corpusPath, ".tag.1");
    String baseName = FileUtils.getCorpusFilename(tagFile0);

    Boolean reprocessFlag = (Boolean) rdoc.getScope("session").get("reprocess.boolean");
    boolean reprocess = reprocessFlag.booleanValue();

    // Without an explicit reprocess request, an existing marker means the work is already done.
    if (!reprocess && (tagFile0.exists() || tagFile1.exists())) {
      getLogger(rdoc).warn("skipping '" + baseName + "': already tagged");
      // Reported as an informational message rather than as a document error.
      rdoc.docInfoString("Already tagged");
      return ReturnCode.ABORT_PAPER;
    }

    if (!tagFile0.exists()) {
      tagFile0.getParentFile().mkdirs();
      try {
        boolean created = tagFile0.createNewFile();
        if (!created) {
          getLogger(rdoc).error("aborting '" + baseName + "'; couldn't create status file");
          rdoc.docInfoString("STATUS FILE CREATION ERROR");
          return ReturnCode.ABORT_PAPER;
        }
      } catch (IOException e) {
        getLogger(rdoc).error("aborting '" + baseName + "'; " + e.getMessage());
        rdoc.docInfoString("STATUS FILE CREATION ERROR2");
        return ReturnCode.ABORT_PAPER;
      }
    }
    // TODO : need to grab a file lock for this to be safe

    incrementIntAttribute(rdoc, "session", "metric.documents.attempted.integer");
    System.out.println("Processing file: " + corpusPath.toString() + " ...");

    return ReturnCode.OK;
  }

예제 #2

파일 보기

파일: HtmlErrorLogFilter.java 프로젝트: allenai/rexa1-metatagger

  /**
   * Appends an HTML entry for this document to one error log per registered error type.
   *
   * <p>The error types are read from the "error.list" entry of the "document" scope. For each one,
   * two links are appended to the log named after the error type (extension ".err.html"): a link
   * to the URL obtained by demangling the document's relative path, and a link to the document's
   * absolute URI. The first time an error type is seen by this filter its log file is cleared and
   * an HTML header is written.
   *
   * @param rdoc document whose "document" scope supplies "error.list", "corpus.absolute.uri" and
   *     "corpus.relative.uri"
   * @return always {@code ReturnCode.OK}, including when there are no errors or the URL cannot be
   *     demangled
   */
  public int accept(RxDocument rdoc) {
    // Get the document's error list
    LinkedList docErrors = (LinkedList) rdoc.getScope("document").get("error.list");

    // No error strings registered for this document: nothing to log
    if (docErrors == null) {
      return ReturnCode.OK;
    }

    URI fileURI = (URI) rdoc.getScope("document").get("corpus.absolute.uri");
    URI relURI = (URI) rdoc.getScope("document").get("corpus.relative.uri");

    // Get the relative path
    String tmp = relURI.getPath();

    // Chop off the final '.xml' -- but only when it is really there. Blindly removing four
    // characters threw StringIndexOutOfBoundsException on short paths and mangled any path
    // that did not carry the suffix.
    final String xmlSuffix = ".xml";
    if (tmp.endsWith(xmlSuffix)) {
      tmp = tmp.substring(0, tmp.length() - xmlSuffix.length());
    }

    // Extract the original URL
    URL u;
    try {
      u = URLMangler.demangle(new File(tmp));
    } catch (MalformedURLException e) {
      getLogger(rdoc).error("malformedURL: " + e);
      return ReturnCode.OK;
    }

    String originalUrl = u.toString();
    String xmllink = "<A href=\"" + fileURI.toString() + "\">" + fileURI.getPath() + "</A>";
    String pslink;

    // Hack -- add a hint about the file type for .ps.gz files.  This
    // is useful for getting mozilla to fire up ghostview properly.  Mileage
    // with other browsers may vary.
    if (tmp.endsWith(".ps.gz")) {
      pslink =
          "<A href=\"" + originalUrl + "\" type=application/postscript>" + originalUrl + "</A>";
    } else {
      pslink = "<A href=\"" + originalUrl + "\">" + originalUrl + "</A>";
    }

    String logFileExt = ".err.html";
    for (Iterator i = docErrors.iterator(); i.hasNext(); ) {
      String errorType = (String) i.next();
      // Each error type gets its own log file, named after the type
      String logFileName = errorType;

      // if the first error of this type we've seen...
      if (!_errorTypes.contains(errorType)) {
        _errorTypes.add(errorType);
        clearLogFile(rdoc, logFileName, logFileExt);
        htmlHeader(rdoc, logFileName, logFileExt);
      }

      appendLogFile(rdoc, logFileName, logFileExt, pslink + "<br>");
      appendLogFile(rdoc, logFileName, logFileExt, xmllink + "<br><br>");
    }

    return ReturnCode.OK;
  }

예제 #3

파일 보기

파일: ReferenceExtractionFilter.java 프로젝트: allenai/rexa1-metatagger

  /**
   * Runs the configured header and reference extractors over the document's segmentation and
   * stores the resulting XML elements back into the segmentation map.
   *
   * <p>When a headers extractor is set and the map holds a "headerTokenization", the extracted
   * header element is stored under "headerElement". When a references extractor is set, every
   * entry of "referenceList" is extracted and the resulting elements are stored, as a list, under
   * "referenceElements". Each element is tagged with an ID built from the page number and the
   * "llx"/"lly" properties of the segment's first token.
   *
   * @param rdoc document providing the tokenization and the "segmentation" map of its "document"
   *     scope
   * @return {@code false} when the document has no tokenization, or when a references extractor
   *     is set but the segmentation has no "referenceList" (a header element may already have been
   *     stored by then); {@code true} otherwise
   */
  private boolean doExtraction(RxDocument rdoc) {
    NewHtmlTokenization tokenization = rdoc.getTokenization();
    Map segmentations = (Map) rdoc.getScope("document").get("segmentation");

    if (tokenization == null) {
      getLogger(rdoc).error("Partitioner found nothing to partition...");
      rdoc.docErrorString("Partitioner found nothing to partition");
      return false;
    }

    // Markup header
    if (_headersExtractor != null) {
      NewHtmlTokenization headerTokens =
          (NewHtmlTokenization) segmentations.get("headerTokenization");
      if (headerTokens != null) {
        log.info("running crf on header");
        if (headerTokens.clearTokenFeatures()) {
          log.warn("header tokens had features set before crf extraction");
        }

        Extraction headerExtraction = _headersExtractor.extract(headerTokens);
        Sequence headerLabels = headerExtraction.getDocumentExtraction(0).getPredictedLabels();
        Element headerElement =
            new CRFOutputFormatter().toXmlElement(headerTokens, headerLabels, "headers");

        // The header ID is derived from the position of the header's first token
        PropertyHolder firstToken = headerTokens.getToken(0);
        double x = firstToken.getNumericProperty("llx");
        double y = firstToken.getNumericProperty("lly");
        int page = (int) firstToken.getNumericProperty("pageNum");
        headerElement.setAttribute("headerID", "p" + page + "x" + x + "y" + y);
        segmentations.put("headerElement", headerElement);
      }
    }

    // Markup references
    List referenceElements = new ArrayList();
    if (_referencesExtractor != null) {
      List refList = (List) segmentations.get("referenceList");
      if (refList == null) {
        getLogger(rdoc).error("no biblio to extract");
        rdoc.docErrorString("no biblio to extract");
        return false;
      }

      // refNum is the 1-based position of the reference, used only in log messages
      int refNum = 1;
      for (Iterator it = refList.iterator(); it.hasNext(); refNum++) {
        // Extract reference
        NewHtmlTokenization refTokens = (NewHtmlTokenization) it.next();
        if (refTokens.clearTokenFeatures()) {
          log.warn("reference tokens had features set before crf extraction");
        }
        log.info("running crf on reference " + refNum + " of " + refList.size());
        Extraction refExtraction = _referencesExtractor.extract(refTokens);
        Sequence refLabels = refExtraction.getDocumentExtraction(0).getPredictedLabels();

        // Check extracted reference for validity; a non-empty result is logged
        String warning = checkReference(refTokens, refLabels);
        if (warning.length() != 0) {
          log.error("Suspicous reference (" + refNum + "):" + warning);
        }
        Element refElement =
            new CRFOutputFormatter().toXmlElement(refTokens, refLabels, "reference");

        // The reference ID is derived from the position of the reference's first token
        PropertyHolder firstToken = (PropertyHolder) refTokens.getToken(0);
        double x = firstToken.getNumericProperty("llx");
        double y = firstToken.getNumericProperty("lly");
        int page = (int) firstToken.getNumericProperty("pageNum");
        refElement.setAttribute("refID", "p" + page + "x" + x + "y" + y);

        referenceElements.add(refElement);
      }
    }
    segmentations.put("referenceElements", referenceElements);

    return true;
  }