@Test
  public void testGetDocumentCas()
      throws ResourceInitializationException, IOException, SAXException, URISyntaxException,
          ParserConfigurationException {
    CAS aCAS =
        CasCreationUtils.createCas(
            XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
    corpusDAO.getDocumentCas(new URI("62007.txt"), "1", aCAS);
    assertThat(aCAS.getDocumentText(), containsString("РИА Новости"));
    assertEquals(6, CasUtil.selectAll(aCAS).size());
    assertEquals(
        1,
        CasUtil.select(aCAS, CasUtil.getAnnotationType(aCAS, "ru.kfu.itis.issst.evex.Weapon"))
            .size());

    aCAS =
        CasCreationUtils.createCas(
            XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
    corpusDAO.getDocumentCas(new URI("62007.txt"), "5", aCAS);
    assertThat(aCAS.getDocumentText(), containsString("РИА Новости"));
    assertThat(CasUtil.selectAll(aCAS).size(), equalTo(5));
    assertEquals(
        0,
        CasUtil.select(aCAS, CasUtil.getAnnotationType(aCAS, "ru.kfu.itis.issst.evex.Weapon"))
            .size());
  }
Esempio n. 2
0
 /**
  * Called when the processing of a Document is completed. <br>
  * The process status can be looked at and corresponding actions taken.
  *
  * @param aCas CAS corresponding to the completed processing
  * @param aStatus EntityProcessStatus that holds the status of all the events for aEntity
  */
 public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
   if (aStatus.isException()) {
     List<Exception> exceptions = aStatus.getExceptions();
     for (int i = 0; i < exceptions.size(); i++) {
       ((Throwable) exceptions.get(i)).printStackTrace();
     }
     return;
   }
   entityCount++;
   String docText = aCas.getDocumentText();
   if (docText != null) {
     size += docText.length();
   }
 }
Esempio n. 3
0
  @Test
  public void test() throws Exception {
    String html = "<Parent>\n";
    html += "<Child1>Some content</Child1>\n";
    html += "<Child2 attribute=“someValue” />\n";
    html += "<Child3>More content.</Child3>\n";
    html += "</Parent>\n";

    URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
    if (urlA == null) {
      urlA =
          HtmlAnnotator.class
              .getClassLoader()
              .getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
    }

    URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
    if (urlC == null) {
      urlC =
          HtmlAnnotator.class
              .getClassLoader()
              .getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
    }

    XMLInputSource inA = new XMLInputSource(urlA);
    ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
    AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
    aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
    aeA.reconfigure();

    XMLInputSource inC = new XMLInputSource(urlC);
    ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
    AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
    aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
    aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
    aeC.setConfigParameterValue(
        HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] {"child1", "child2", "child3"});
    aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
    aeC.reconfigure();

    CAS cas = aeA.newCAS();
    Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
    AnnotationIndex<AnnotationFS> ai = null;
    FSIterator<AnnotationFS> iterator = null;

    cas.setDocumentText(html);
    aeA.process(cas);
    aeC.process(cas);

    CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);

    assertEquals("$Some content$$More content.", plainTextCas.getDocumentText());

    ai = plainTextCas.getAnnotationIndex(tagType);
    iterator = ai.iterator();
    assertEquals(4, ai.size());
    assertEquals("$Some content$$More content.", iterator.next().getCoveredText());
    assertEquals("$Some content", iterator.next().getCoveredText());
    assertEquals("$", iterator.next().getCoveredText());
    assertEquals("$More content.", iterator.next().getCoveredText());

    cas.release();
  }
Esempio n. 4
0
  @Test
  public void testExpandOffsets() throws Exception {
    String html = "<Parent>\n";
    html += "<Child1>Some content</Child1>\n";
    html += "<Child2 attribute=“someValue” />\n";
    html += "<Child3>More content.</Child3>\n";
    html += "</Parent>\n";

    URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
    if (urlA == null) {
      urlA =
          HtmlAnnotator.class
              .getClassLoader()
              .getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
    }

    URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
    if (urlC == null) {
      urlC =
          HtmlAnnotator.class
              .getClassLoader()
              .getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
    }

    XMLInputSource inA = new XMLInputSource(urlA);
    ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
    AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
    aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
    aeA.reconfigure();

    XMLInputSource inC = new XMLInputSource(urlC);
    ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
    AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
    aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
    aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
    aeC.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true);
    aeC.reconfigure();

    CAS cas = aeA.newCAS();
    Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
    Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets");
    AnnotationIndex<AnnotationFS> ai = null;
    FSIterator<AnnotationFS> iterator = null;

    cas.setDocumentText(html);
    aeA.process(cas);
    aeC.process(cas);

    CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);

    assertEquals("Some contentMore content.", plainTextCas.getDocumentText());

    ai = plainTextCas.getAnnotationIndex(tagType);
    iterator = ai.iterator();
    assertEquals(4, ai.size());
    AnnotationFS next = null;
    next = iterator.next();
    assertEquals(false, next.getBooleanValue(expandedFeature));
    assertEquals("Some contentMore content.", next.getCoveredText());
    next = iterator.next();
    assertEquals(false, next.getBooleanValue(expandedFeature));
    assertEquals("Some content", next.getCoveredText());
    next = iterator.next();
    boolean b1 = next.getBooleanValue(expandedFeature);
    assertEquals("More content.", next.getCoveredText());
    next = iterator.next();
    boolean b2 = next.getBooleanValue(expandedFeature);
    assertEquals("More content.", next.getCoveredText());
    // for one of these two annotation (with same offsets) the feature must be set to true
    assertEquals(true, b1 || b2);

    cas.release();
  }
  /** Copied and modified from {@link org.apache.uima.util.CasToInlineXml} */
  private static String toXML(CAS cas, AnnotationsToElements converter) throws SAXException {
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    XMLSerializer sax2xml = new XMLSerializer(byteArrayOutputStream, false);

    // get document text
    String docText = cas.getDocumentText();
    char[] docCharArray = docText.toCharArray();

    // get iterator over annotations sorted by increasing start position and
    // decreasing end position
    FSIterator<AnnotationFS> iterator = cas.getAnnotationIndex().iterator();

    // This is basically a recursive algorithm that has had the recursion
    // removed through the use of an explicit Stack. We iterate over the
    // annotations, and if an annotation contains other annotations, we
    // push the parent annotation on the stack, process the children, and
    // then come back to the parent later.
    List<AnnotationFS> stack = new ArrayList<AnnotationFS>();
    int pos = 0;

    ContentHandler handler = sax2xml.getContentHandler();
    handler.startDocument();
    // write the start tag
    converter.startRootElement(handler);
    // now use null is a placeholder for this artificial Document annotation
    AnnotationFS curAnnot = null;

    while (iterator.isValid()) {
      AnnotationFS nextAnnot = iterator.get();

      if (curAnnot == null || nextAnnot.getBegin() < curAnnot.getEnd()) {
        // nextAnnot's start point is within the span of curAnnot
        if (curAnnot == null || nextAnnot.getEnd() <= curAnnot.getEnd()) // crossover span check
        {
          // nextAnnot is contained within curAnnot

          // write text between current pos and beginning of nextAnnot
          try {
            handler.characters(docCharArray, pos, nextAnnot.getBegin() - pos);
            pos = nextAnnot.getBegin();
            converter.startAnnotationElement(nextAnnot, handler);

            // push parent annotation on stack
            stack.add(curAnnot);
            // move on to next annotation
            curAnnot = nextAnnot;
          } catch (StringIndexOutOfBoundsException e) {
            System.err.println(
                "Invalid annotation range: "
                    + nextAnnot.getBegin()
                    + ","
                    + nextAnnot.getEnd()
                    + " in document of length "
                    + docText.length());
          }
        }
        iterator.moveToNext();
      } else {
        // nextAnnot begins after curAnnot ends
        // write text between current pos and end of curAnnot
        try {
          handler.characters(docCharArray, pos, curAnnot.getEnd() - pos);
          pos = curAnnot.getEnd();
        } catch (StringIndexOutOfBoundsException e) {
          System.err.println(
              "Invalid annotation range: "
                  + curAnnot.getBegin()
                  + ","
                  + curAnnot.getEnd()
                  + " in document of length "
                  + docText.length());
        }
        converter.endAnnotationElement(curAnnot, handler);

        // pop next containing annotation off stack
        curAnnot = stack.remove(stack.size() - 1);
      }
    }

    // finished writing all start tags, now finish up
    if (curAnnot != null) {
      try {
        handler.characters(docCharArray, pos, curAnnot.getEnd() - pos);
        pos = curAnnot.getEnd();
      } catch (StringIndexOutOfBoundsException e) {
        System.err.println(
            "Invalid annotation range: "
                + curAnnot.getBegin()
                + ","
                + curAnnot.getEnd()
                + "in document of length "
                + docText.length());
      }
      converter.endAnnotationElement(curAnnot, handler);

      while (!stack.isEmpty()) {
        curAnnot = stack.remove(stack.size() - 1); // pop
        if (curAnnot == null) {
          break;
        }
        try {
          handler.characters(docCharArray, pos, curAnnot.getEnd() - pos);
          pos = curAnnot.getEnd();
        } catch (StringIndexOutOfBoundsException e) {
          System.err.println(
              "Invalid annotation range: "
                  + curAnnot.getBegin()
                  + ","
                  + curAnnot.getEnd()
                  + "in document of length "
                  + docText.length());
        }
        converter.endAnnotationElement(curAnnot, handler);
      }
    }

    if (pos < docCharArray.length) {
      handler.characters(docCharArray, pos, docCharArray.length - pos);
    }
    converter.endRootElement(handler);
    handler.endDocument();

    // return XML string
    return new String(byteArrayOutputStream.toByteArray());
  }
Esempio n. 6
0
    /**
     * Called when the processing of a Document is completed. <br>
     * The process status can be looked at and corresponding actions taken.
     *
     * @param aCas CAS corresponding to the completed processing
     * @param aStatus EntityProcessStatus that holds the status of all the events for aEntity
     */
    public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
      if (aStatus != null) {
        if (aStatus.isException()) {
          System.err.println("Error on process CAS call to remote service:");
          List exceptions = aStatus.getExceptions();
          for (int i = 0; i < exceptions.size(); i++) {
            ((Throwable) exceptions.get(i)).printStackTrace();
          }
          if (!ignoreErrors) {
            System.err.println("Terminating Client...");
            stop();
          }
        }
        if (logCas) {
          String ip = "no IP";
          List eList = aStatus.getProcessTrace().getEventsByComponentName("UimaEE", false);
          for (int e = 0; e < eList.size(); e++) {
            ProcessTraceEvent event = (ProcessTraceEvent) eList.get(e);
            if (event.getDescription().equals("Service IP")) {
              ip = event.getResultMessage();
            }
          }
          String casId = ((UimaASProcessStatus) aStatus).getCasReferenceId();
          if (casId != null) {
            long current = System.nanoTime() / 1000000 - mStartTime;
            if (casMap.containsKey(casId)) {
              Object value = casMap.get(casId);
              if (value != null && value instanceof Long) {
                long start = ((Long) value).longValue();
                System.out.println(ip + "\t" + start + "\t" + (current - start));
              }
            }
          }

        } else {
          System.out.print(".");
          if (0 == (entityCount + 1) % 50) {
            System.out.print((entityCount + 1) + " processed\n");
          }
        }
      }

      // if output dir specified, dump CAS to XMI
      if (outputDir != null) {
        // try to retrieve the filename of the input file from the CAS
        File outFile = null;
        Type srcDocInfoType =
            aCas.getTypeSystem().getType("org.apache.uima.examples.SourceDocumentInformation");
        if (srcDocInfoType != null) {
          FSIterator it = aCas.getIndexRepository().getAllIndexedFS(srcDocInfoType);
          if (it.hasNext()) {
            FeatureStructure srcDocInfoFs = it.get();
            Feature uriFeat = srcDocInfoType.getFeatureByBaseName("uri");
            Feature offsetInSourceFeat = srcDocInfoType.getFeatureByBaseName("offsetInSource");
            String uri = srcDocInfoFs.getStringValue(uriFeat);
            int offsetInSource = srcDocInfoFs.getIntValue(offsetInSourceFeat);
            File inFile;
            try {
              inFile = new File(new URL(uri).getPath());
              String outFileName = inFile.getName();
              if (offsetInSource > 0) {
                outFileName += ("_" + offsetInSource);
              }
              outFileName += ".xmi";
              outFile = new File((String) outputDir, outFileName);
            } catch (MalformedURLException e1) {
              // invalid URI, use default processing below
            }
          }
        }
        if (outFile == null) {
          outFile = new File((String) outputDir, "doc" + entityCount);
        }
        try {
          FileOutputStream outStream = new FileOutputStream(outFile);
          try {
            XmiCasSerializer.serialize(aCas, outStream);
          } finally {
            outStream.close();
          }
        } catch (Exception e) {
          System.err.println("Could not save CAS to XMI file");
          e.printStackTrace();
        }
      }

      // update stats
      entityCount++;
      String docText = aCas.getDocumentText();
      if (docText != null) {
        size += docText.length();
      }

      // Called just before sendCas with next CAS from collection reader
    }