/**
   * Finishes a document by publishing the accumulated text buffer as the CAS document text.
   *
   * @param aPdf the document that has just been processed (not read by this method)
   * @throws IOException declared by the overridden method; nothing here raises it directly
   */
  @Override
  protected void endDocument(final PDDocument aPdf) throws IOException {
    final String collectedText = text.toString();
    cas.setDocumentText(collectedText);

    // Emit the closing marker only when trace logging is active.
    final boolean tracing = log.isTraceEnabled();
    if (tracing) {
      log.trace("</document>");
    }
  }
  /**
   * Fills the given CAS with the text and language extracted from {@code file}.
   *
   * <p>The document is rejected (via {@code ExceptionHandler.logAndThrow}) when its text is
   * null or empty, or when the detected language is missing or does not contain {@code "ru"}.
   *
   * @param aCAS the CAS that receives the document text and language
   * @throws IOException declared by the reader contract
   * @throws CollectionException declared by the reader contract
   */
  @Override
  public void getNext(final CAS aCAS) throws IOException, CollectionException {
    // A default instance is kept as the initial value: the compiler cannot tell whether
    // logAndRethrow always throws, so the variable must be definitely assigned afterwards.
    TikaProcessor processor = new TikaProcessor();
    try {
      processor = TikaProcessor.newInstance(file);
    } catch (Exception e) {
      // NOTE(review): presumably logs and rethrows - confirm against ExceptionHandler.
      ExceptionHandler.logAndRethrow(logger, "TikaProcessor: ", e);
    }

    final String documentText = processor.getText();
    if (documentText == null || documentText.isEmpty()) {
      ExceptionHandler.logAndThrow(logger, "Document text is null or empty");
    }
    aCAS.setDocumentText(documentText);

    final String textLanguage = processor.getLanguage();
    // A missing language is reported like a non-Russian one instead of failing with a
    // NullPointerException on the contains() call.
    if (textLanguage == null || !textLanguage.contains("ru")) {
      ExceptionHandler.logAndThrow(logger, "Document language is not russian");
    }
    aCAS.setDocumentLanguage(textLanguage);
  }
Exemple #3
0
  /**
   * Creates and initializes the UIMA AS client, loads one XML file from a fixed location and
   * sends its content to the service as the document text of a single CAS.
   *
   * @throws Exception if initialization, reading the file or sending the CAS fails
   */
  public void run() throws Exception {
    // Build the asynchronous client and let the shared routine configure it.
    uimaAsEngine = new BaseUIMAAsynchronousEngine_impl();
    // Status callback registration is currently disabled:
    // uimaAsEngine.addStatusCallbackListener(new StatusCallbackListener());
    initializeUimaAsEngine(uimaAsEngine);

    // Load the payload from the fixed input file.
    final String payload =
        readFile(
            "C:\\WebScience\\Progetti\\K-People\\OntologyController_UIMA\\apache-uima\\examples\\src\\it\\webscience\\uima\\event-2031.xml");

    // Obtain a CAS from the client, fill it with the payload and send it off for processing.
    final CAS requestCas = uimaAsEngine.getCAS();
    requestCas.setDocumentText(payload);
    uimaAsEngine.sendCAS(requestCas);
  }
  /**
   * Copies the current streamed document into the given CAS and indexes a
   * {@code SourceDocumentInformation} annotation describing it.
   *
   * <p>The running total of processed characters and the document index are both advanced.
   *
   * @param cas the CAS to populate
   * @throws IOException declared by the reader contract
   * @throws CollectionException if the JCas view of {@code cas} cannot be obtained
   */
  @Override
  public void getNext(CAS cas) throws IOException, CollectionException {
    final String docText = currentDoc.getText();
    final int docLength = docText.length();
    final String docUri = this.currentDoc.getUri();

    this.cumulatedLength += docLength;
    logger.info(
        "[Stream {}] Processing document {}: {} (total length processed: {})",
        this.streamName,
        this.mCurrentIndex,
        docUri,
        this.cumulatedLength);

    try {
      final SourceDocumentInformation info = new SourceDocumentInformation(cas.getJCas());
      info.setUri(docUri);
      cas.setDocumentLanguage(mLanguage.getCode());
      cas.setDocumentText(docText);
      info.setDocumentSize(docLength);
      info.setCumulatedDocumentSize(this.cumulatedLength);

      // The annotation spans the whole document.
      info.setBegin(0);
      info.setEnd(docLength);
      info.setOffsetInSource(0);
      info.setDocumentIndex(mCurrentIndex);

      // Corpus-wide figures cannot be known while streaming.
      info.setCorpusSize(-1);
      info.setNbDocuments(-1);

      // Nor can we tell whether this is the final document.
      info.setLastSegment(false);

      info.addToIndexes();
      this.mCurrentIndex++;
    } catch (CASException e) {
      throw new CollectionException(e);
    }
  }
  /**
   * Runs the vector space retrieval analysis engine over every line of
   * {@code /data/documents.txt} and prints the elapsed wall-clock time in seconds.
   *
   * <p>Each line is processed as one document in a single CAS that is reset between lines.
   * The document reader and the analysis engine are released even when processing fails.
   *
   * @param args command line arguments (unused)
   * @throws IllegalArgumentException if the descriptor or the document file is not on the
   *     classpath
   * @throws Exception if descriptor parsing, engine creation, reading or processing fails
   */
  public static void main(String[] args) throws Exception {
    final long startTime = System.currentTimeMillis();

    URL descUrl =
        VectorSpaceRetrieval.class.getResource(
            "/descriptors/retrievalsystem/VectorSpaceRetrieval.xml");
    if (descUrl == null) {
      throw new IllegalArgumentException("Error opening VectorSpaceRetrieval.xml");
    }
    // create AnalysisEngine
    XMLInputSource input = new XMLInputSource(descUrl);
    AnalysisEngineDescription desc =
        UIMAFramework.getXMLParser().parseAnalysisEngineDescription(input);
    AnalysisEngine anAnalysisEngine = UIMAFramework.produceAnalysisEngine(desc);
    try {
      CAS aCas = anAnalysisEngine.newCAS();

      URL docUrl = VectorSpaceRetrieval.class.getResource("/data/documents.txt");
      if (docUrl == null) {
        throw new IllegalArgumentException("Error opening data/documents.txt");
      }
      // try-with-resources closes the reader even if process() or readLine() throws.
      // NOTE(review): the reader uses the platform default charset - confirm the file encoding.
      try (BufferedReader br = new BufferedReader(new InputStreamReader(docUrl.openStream()))) {
        String sLine;
        while ((sLine = br.readLine()) != null) {
          aCas.setDocumentText(sLine);
          anAnalysisEngine.process(aCas);
          // Clear the CAS so it can be reused for the next line.
          aCas.reset();
        }
      }
      // Only signal completion when every document was processed successfully.
      anAnalysisEngine.collectionProcessComplete();
    } finally {
      // Release the engine on both the success and the failure path.
      anAnalysisEngine.destroy();
    }
    final long endTime = System.currentTimeMillis();

    double totalTime = (endTime - startTime) / 1000.0;
    System.out.println("Total time taken: " + totalTime);
  }
  /**
   * Runs the HTML annotator followed by the HTML converter (configured with gap-inducing tags
   * and {@code "$"} as gap text) over a small XML-like snippet, then checks the text of the
   * converted view and the covered text of the four TAG annotations found in it.
   */
  @Test
  public void test() throws Exception {
    final String markup =
        "<Parent>\n"
            + "<Child1>Some content</Child1>\n"
            + "<Child2 attribute=“someValue” />\n"
            + "<Child3>More content.</Child3>\n"
            + "</Parent>\n";

    // Locate both descriptors, falling back to the fully qualified resource path.
    final ClassLoader loader = HtmlAnnotator.class.getClassLoader();
    URL annotatorUrl = loader.getResource("HtmlAnnotator.xml");
    if (annotatorUrl == null) {
      annotatorUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
    }
    URL converterUrl = loader.getResource("HtmlConverter.xml");
    if (converterUrl == null) {
      converterUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
    }

    // Annotator engine.
    final ResourceSpecifier annotatorSpec =
        UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(annotatorUrl));
    final AnalysisEngine annotator = UIMAFramework.produceAnalysisEngine(annotatorSpec);
    annotator.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
    annotator.reconfigure();

    // Converter engine: every child tag induces a "$" gap in the converted text.
    final ResourceSpecifier converterSpec =
        UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(converterUrl));
    final AnalysisEngine converter = UIMAFramework.produceAnalysisEngine(converterSpec);
    converter.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
    converter.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
    converter.setConfigParameterValue(
        HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] {"child1", "child2", "child3"});
    converter.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
    converter.reconfigure();

    final CAS cas = annotator.newCAS();
    final Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");

    cas.setDocumentText(markup);
    annotator.process(cas);
    converter.process(cas);

    final CAS convertedView = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
    assertEquals("$Some content$$More content.", convertedView.getDocumentText());

    final AnnotationIndex<AnnotationFS> tagIndex = convertedView.getAnnotationIndex(tagType);
    final FSIterator<AnnotationFS> tags = tagIndex.iterator();
    assertEquals(4, tagIndex.size());
    assertEquals("$Some content$$More content.", tags.next().getCoveredText());
    assertEquals("$Some content", tags.next().getCoveredText());
    assertEquals("$", tags.next().getCoveredText());
    assertEquals("$More content.", tags.next().getCoveredText());

    cas.release();
  }
  /**
   * Runs the HTML annotator followed by the HTML converter with offset expansion enabled over
   * a small XML-like snippet, then checks the converted text, the covered text of the four TAG
   * annotations and their {@code expandedOffsets} feature values.
   */
  @Test
  public void testExpandOffsets() throws Exception {
    final String markup =
        "<Parent>\n"
            + "<Child1>Some content</Child1>\n"
            + "<Child2 attribute=“someValue” />\n"
            + "<Child3>More content.</Child3>\n"
            + "</Parent>\n";

    // Locate both descriptors, falling back to the fully qualified resource path.
    final ClassLoader loader = HtmlAnnotator.class.getClassLoader();
    URL annotatorUrl = loader.getResource("HtmlAnnotator.xml");
    if (annotatorUrl == null) {
      annotatorUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
    }
    URL converterUrl = loader.getResource("HtmlConverter.xml");
    if (converterUrl == null) {
      converterUrl = loader.getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
    }

    // Annotator engine.
    final ResourceSpecifier annotatorSpec =
        UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(annotatorUrl));
    final AnalysisEngine annotator = UIMAFramework.produceAnalysisEngine(annotatorSpec);
    annotator.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
    annotator.reconfigure();

    // Converter engine with offset expansion switched on.
    final ResourceSpecifier converterSpec =
        UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(converterUrl));
    final AnalysisEngine converter = UIMAFramework.produceAnalysisEngine(converterSpec);
    converter.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
    converter.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
    converter.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true);
    converter.reconfigure();

    final CAS cas = annotator.newCAS();
    final Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
    final Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets");

    cas.setDocumentText(markup);
    annotator.process(cas);
    converter.process(cas);

    final CAS convertedView = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
    assertEquals("Some contentMore content.", convertedView.getDocumentText());

    final AnnotationIndex<AnnotationFS> tagIndex = convertedView.getAnnotationIndex(tagType);
    final FSIterator<AnnotationFS> tags = tagIndex.iterator();
    assertEquals(4, tagIndex.size());

    final AnnotationFS first = tags.next();
    assertEquals(false, first.getBooleanValue(expandedFeature));
    assertEquals("Some contentMore content.", first.getCoveredText());

    final AnnotationFS second = tags.next();
    assertEquals(false, second.getBooleanValue(expandedFeature));
    assertEquals("Some content", second.getCoveredText());

    final AnnotationFS third = tags.next();
    final boolean thirdExpanded = third.getBooleanValue(expandedFeature);
    assertEquals("More content.", third.getCoveredText());

    final AnnotationFS fourth = tags.next();
    final boolean fourthExpanded = fourth.getBooleanValue(expandedFeature);
    assertEquals("More content.", fourth.getCoveredText());

    // The last two annotations share the same offsets; at least one of them must be flagged
    // as expanded.
    assertEquals(true, thirdExpanded || fourthExpanded);

    cas.release();
  }