@Test
  public void testGetDocumentCas()
      throws ResourceInitializationException, IOException, SAXException, URISyntaxException,
          ParserConfigurationException {
    CAS aCAS =
        CasCreationUtils.createCas(
            XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
    corpusDAO.getDocumentCas(new URI("62007.txt"), "1", aCAS);
    assertThat(aCAS.getDocumentText(), containsString("РИА Новости"));
    assertEquals(6, CasUtil.selectAll(aCAS).size());
    assertEquals(
        1,
        CasUtil.select(aCAS, CasUtil.getAnnotationType(aCAS, "ru.kfu.itis.issst.evex.Weapon"))
            .size());

    aCAS =
        CasCreationUtils.createCas(
            XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
    corpusDAO.getDocumentCas(new URI("62007.txt"), "5", aCAS);
    assertThat(aCAS.getDocumentText(), containsString("РИА Новости"));
    assertThat(CasUtil.selectAll(aCAS).size(), equalTo(5));
    assertEquals(
        0,
        CasUtil.select(aCAS, CasUtil.getAnnotationType(aCAS, "ru.kfu.itis.issst.evex.Weapon"))
            .size());
  }
  /**
   * Performs name finding on the given cas object.
   *
   * <p>When {@code isRemoveExistingAnnotations} is set, the name annotations paired with sentences
   * are removed from the indexes first. Afterwards, for every sentence, the covered text of its
   * tokens is handed to {@code find}; each returned span becomes an annotation of the name type
   * that reaches from the begin of the span's first token to the end of its last token. The spans
   * and the created annotations are passed to {@code postProcessAnnotations}, and {@code
   * documentDone} is called once all sentences are handled.
   *
   * @param cas the CAS to read sentences and tokens from and to add name annotations to
   */
  public final void process(CAS cas) {

    if (isRemoveExistingAnnotations) {
      final AnnotationComboIterator sentenceNameCombo =
          new AnnotationComboIterator(cas, mSentenceType, mNameType);

      // Gather the annotations first and remove them in a second pass, after iteration is over.
      List<AnnotationFS> removeAnnotations = new ArrayList<AnnotationFS>();
      for (AnnotationIteratorPair annotationIteratorPair : sentenceNameCombo) {
        for (AnnotationFS nameAnnotation : annotationIteratorPair.getSubIterator()) {
          removeAnnotations.add(nameAnnotation);
        }
      }

      for (AnnotationFS annotation : removeAnnotations) {
        cas.removeFsFromIndexes(annotation);
      }
    }

    final AnnotationComboIterator sentenceTokenCombo =
        new AnnotationComboIterator(cas, mSentenceType, mTokenType);

    for (AnnotationIteratorPair annotationIteratorPair : sentenceTokenCombo) {

      // Tokens are looked up by index for every name span below, so use array-backed lists
      // (index access on a LinkedList walks the list on every call).
      final List<AnnotationFS> sentenceTokenAnnotationList = new ArrayList<AnnotationFS>();
      final List<String> sentenceTokenList = new ArrayList<String>();

      for (AnnotationFS tokenAnnotation : annotationIteratorPair.getSubIterator()) {
        sentenceTokenAnnotationList.add(tokenAnnotation);
        sentenceTokenList.add(tokenAnnotation.getCoveredText());
      }

      Span[] names = find(cas, sentenceTokenList.toArray(new String[sentenceTokenList.size()]));

      AnnotationFS[] nameAnnotations = new AnnotationFS[names.length];

      for (int i = 0; i < names.length; i++) {
        // The last token of a span sits at index getEnd() - 1.
        int startIndex = sentenceTokenAnnotationList.get(names[i].getStart()).getBegin();
        int endIndex = sentenceTokenAnnotationList.get(names[i].getEnd() - 1).getEnd();

        nameAnnotations[i] = cas.createAnnotation(mNameType, startIndex, endIndex);
        cas.getIndexRepository().addFS(nameAnnotations[i]);
      }

      postProcessAnnotations(names, nameAnnotations);
    }

    documentDone(cas);
  }
Пример #3
0
  /**
   * Runs the test script associated with this class and checks the covered text of the
   * annotations reported for the indexes 1 to 3. The CAS is released even when an assertion
   * fails.
   */
  @Test
  public void test() {

    CAS cas = RutaTestUtils.processTestScript(this.getClass());

    try {
      RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "A b");
      RutaTestUtils.assertAnnotationsEquals(cas, 2, 1, "b A");
      RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "A b A");
    } finally {
      // A failing assertion must not prevent the CAS from being released.
      cas.release();
    }
  }
  /**
   * Returns the next sentence, run through the configured pre-processor when there is one.
   *
   * <p>While the current sentence iterator still has elements they are served from it. Otherwise
   * the next document is read into a CAS, processed by the analysis engine, and its sentences
   * become the new iterator; documents without sentences are skipped.
   *
   * @return the next sentence, {@code null} when the reader has no more documents, or an empty
   *     string when reading the next document fails
   * @throws RuntimeException wrapping any exception raised while reading or processing a document
   */
  @Override
  public synchronized String nextSentence() {
    // Sentences of the current document are not exhausted yet.
    if (sentences != null && sentences.hasNext()) {
      return applyPreProcessor(sentences.next());
    }

    try {
      if (!getReader().hasNext()) {
        return null;
      }

      CAS cas = resource.retrieve();

      try {
        getReader().getNext(cas);
      } catch (Exception e) {
        // Keep the cause in the log instead of dropping it silently.
        log.warn("Done iterating returning an empty string", e);
        return "";
      }

      resource.getAnalysisEngine().process(cas);

      List<String> list = new ArrayList<>();
      for (Sentence sentence : JCasUtil.select(cas.getJCas(), Sentence.class)) {
        list.add(sentence.getCoveredText());
      }
      sentences = list.iterator();

      // The document had no sentences: keep reading until one has, or the reader runs dry.
      while (!sentences.hasNext()) {
        if (!getReader().hasNext()) {
          return null;
        }
        cas.reset();
        getReader().getNext(cas);
        resource.getAnalysisEngine().process(cas);
        for (Sentence sentence : JCasUtil.select(cas.getJCas(), Sentence.class)) {
          list.add(sentence.getCoveredText());
        }
        sentences = list.iterator();
      }

      return applyPreProcessor(sentences.next());

    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /** Applies the configured pre-processor to the sentence, or returns it unchanged if none is set. */
  private String applyPreProcessor(String sentence) {
    if (this.getPreProcessor() != null) {
      return this.getPreProcessor().preProcess(sentence);
    }
    return sentence;
  }
  /**
   * Reads the next file of {@code mFiles} into the CAS: sets the document text (decoded with
   * {@code mEncoding}), sets the document language when {@code mLanguage} is configured, and adds
   * a {@code SourceDocumentInformation} describing the source file.
   *
   * @param aCAS the CAS to populate
   * @throws IOException if the file cannot be read
   * @throws CollectionException if the JCas view of the CAS cannot be obtained
   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    // read the file content
    File file = (File) mFiles.get(mCurrentIndex++);
    String text = FileUtils.file2String(file, mEncoding);
    // put document in CAS
    jcas.setDocumentText(text);

    // set language if it was explicitly specified as a configuration parameter
    if (mLanguage != null) {
      jcas.setDocumentLanguage(mLanguage);
    }

    // Also store location of source document in CAS. This information is critical
    // if CAS Consumers will need to know where the original document contents are located.
    // For example, the Semantic Search CAS Indexer writes this information into the
    // search index that it creates, which allows applications that use the search index to
    // locate the documents that satisfy their semantic queries.
    SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
    // File.toURL() is deprecated because it does not escape characters that are illegal in URLs
    // (e.g. spaces); going through toURI() produces a well-formed URL.
    srcDocInfo.setUri(file.getAbsoluteFile().toURI().toURL().toString());
    srcDocInfo.setOffsetInSource(0);
    srcDocInfo.setDocumentSize((int) file.length());
    srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
    srcDocInfo.addToIndexes();
  }
Пример #6
0
  /**
   * Looks up a type by name in the type system of the given CAS.
   *
   * @param cas the CAS whose type system is searched
   * @param typeName name of the type to look up
   * @return the type with that name
   * @throws IllegalStateException if the type system has no type with that name
   */
  private Type getInputType(CAS cas, String typeName) {
    final Type resolved = cas.getTypeSystem().getType(typeName);
    if (resolved != null) {
      return resolved;
    }
    throw new IllegalStateException("Type [" + typeName + "] not found in type system");
  }
Пример #7
0
  /**
   * Stores the text accumulated in {@code text} as the document text of the CAS and, when trace
   * logging is enabled, logs a closing document marker.
   *
   * @param aPdf the PDF document that has ended (not used here)
   */
  @Override
  protected void endDocument(final PDDocument aPdf) throws IOException {
    final String documentText = text.toString();
    cas.setDocumentText(documentText);

    if (!log.isTraceEnabled()) {
      return;
    }
    log.trace("</document>");
  }
Пример #8
0
  /**
   * Creates stem annotations for every configured path.
   *
   * <p>Each entry of {@code paths} has the form {@code typeName/featurePath}. For every
   * annotation of the named type a stem annotation is created, unless a filter feature path is
   * configured and the annotation does not match the filter condition.
   *
   * @param jcas the JCas whose annotations are read and to which stem annotations are added
   * @throws IllegalStateException if a type name is not part of the type system, or if creating
   *     a stem annotation fails
   */
  @Override
  protected void generateAnnotations(JCas jcas)
      throws AnalysisEngineProcessException, FeaturePathException {
    // The CAS is needed to resolve types and to access the annotation index.
    final CAS currCAS = jcas.getCas();

    for (String path : paths) {
      // Split "typeName/featurePath" into at most two parts.
      final String[] segments = path.split("/", 2);
      final String typeName = segments[0];

      final Type annotationType = currCAS.getTypeSystem().getType(typeName);
      if (annotationType == null) {
        throw new IllegalStateException("Type [" + typeName + "] not found in type system");
      }

      // Set up the feature path info from the split path.
      initializeFeaturePathInfoFrom(fp, segments);

      final AnnotationIndex<?> index = currCAS.getAnnotationIndex(annotationType);
      for (FSIterator<?> annotations = index.iterator(); annotations.hasNext(); ) {
        final AnnotationFS fs = (AnnotationFS) annotations.next();

        try {
          // Without a filter every annotation qualifies; otherwise only matching ones do.
          if (this.filterFeaturePath == null
              || this.filterFeaturePathInfo.match(fs, this.filterCondition)) {
            createStemAnnotation(jcas, fs);
          }
        } catch (AnalysisEngineProcessException e) {
          throw new IllegalStateException("error occured while creating a stem annotation", e);
        }
      }
    }
  }
Пример #9
0
  /**
   * Removes from the indexes all annotations of type {@code removeAnnotationType} that pass a
   * {@code ContainingConstraint} built from {@code containerAnnotation}.
   *
   * @param cas the CAS to remove the annotations from
   * @param containerAnnotation the annotation the containing constraint is built from
   * @param removeAnnotationType the type of the annotations to remove
   */
  public static void removeAnnotations(
      CAS cas, AnnotationFS containerAnnotation, Type removeAnnotationType) {

    final FSIndex<AnnotationFS> candidates = cas.getAnnotationIndex(removeAnnotationType);
    final ContainingConstraint insideContainer = new ContainingConstraint(containerAnnotation);
    final Iterator<AnnotationFS> contained =
        cas.createFilteredIterator(candidates.iterator(), insideContainer);

    // First pass: collect the matches. Second pass: remove them once iteration is finished.
    final Collection<AnnotationFS> toRemove = new LinkedList<AnnotationFS>();
    while (contained.hasNext()) {
      toRemove.add(contained.next());
    }

    for (AnnotationFS annotation : toRemove) {
      cas.removeFsFromIndexes(annotation);
    }
  }
  /**
   * Extracts text and language of {@code file} with a {@code TikaProcessor} and stores both in
   * the CAS.
   *
   * <p>A missing or empty text, and a missing language or one that does not contain {@code "ru"},
   * are reported through {@code ExceptionHandler.logAndThrow}.
   *
   * @param aCAS the CAS to populate
   */
  @Override
  public void getNext(final CAS aCAS) throws IOException, CollectionException {
    TikaProcessor processor = new TikaProcessor();
    try {
      processor = TikaProcessor.newInstance(file);
    } catch (Exception e) {
      ExceptionHandler.logAndRethrow(logger, "TikaProcessor: ", e);
    }

    String documentText = processor.getText();
    if (documentText == null || documentText.isEmpty()) {
      ExceptionHandler.logAndThrow(logger, "Document text is null or empty");
    }
    aCAS.setDocumentText(documentText);

    String textLanguage = processor.getLanguage();
    // Guard against a missing language as well, so it is reported like any other
    // non-Russian document instead of failing with a NullPointerException.
    if (textLanguage == null || !textLanguage.contains("ru")) {
      ExceptionHandler.logAndThrow(logger, "Document language is not russian");
    }
    aCAS.setDocumentLanguage(textLanguage);
  }
Пример #11
0
  /**
   * Polls the next XML file from {@code GlobalFileStorage}, parses it with a SAX parser and fills
   * the CAS from the collected text elements.
   *
   * <p>Every collected element contributes its text (index 1) to the document text, followed by
   * an empty line, and a {@code Section} annotation over that text whose value is the element's
   * entry at index 0. Document language and {@code DocumentMetaData} are set afterwards.
   *
   * @param aCAS the CAS to populate
   * @throws CollectionException if the JCas cannot be obtained or anything fails while reading
   *     or parsing the file
   */
  @Override
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    try {
      // parse the xml file
      File xmlFile = GlobalFileStorage.getInstance().poll();

      System.out.println("Process file: " + xmlFile.getName());

      // NOTE(review): the parser is used with default settings; if input files can come from
      // untrusted sources, consider disabling DTDs/external entities.
      SAXParserFactory spf = SAXParserFactory.newInstance();
      SAXParser sp = spf.newSAXParser();
      XMLReader xr = sp.getXMLReader();

      LinkedList<String[]> textElements = new LinkedList<>();
      FragmentContentHandler fch = new FragmentContentHandler(xr, textElements);
      xr.setContentHandler(fch);
      // Close the file stream once parsing is done, also when parsing fails.
      try (FileInputStream xmlStream = new FileInputStream(xmlFile)) {
        xr.parse(new InputSource(xmlStream));
      }

      StringBuilder docText = new StringBuilder();

      for (String[] element : textElements) {

        // The section covers exactly the element text, not the separating blank line.
        int start = docText.length();
        int end = start + element[1].length();

        docText.append(element[1]).append("\n\n");

        Section section = new Section(jcas, start, end);
        section.setValue(element[0]);
        section.addToIndexes();
      }

      // NOTE(review): trim() also strips leading whitespace, which would shift the text
      // relative to the section offsets computed above — confirm inputs never start with it.
      jcas.setDocumentText(docText.toString().trim());
      jcas.setDocumentLanguage(language);

      DocumentMetaData docMetaData = DocumentMetaData.create(aCAS);
      docMetaData.setDocumentTitle(xmlFile.getName());
      docMetaData.setDocumentId(xmlFile.getAbsolutePath());
      docMetaData.setDocumentBaseUri("file:" + xmlFile.getParentFile().getAbsolutePath());
      docMetaData.setDocumentUri("file:" + xmlFile.getAbsolutePath());

    } catch (Exception e) {
      throw new CollectionException(e);
    }
  }
Пример #12
0
  /**
   * Reads all serialized CASes ({@code *.bin}) found under the given folder, printing the
   * document text of each one to standard output.
   *
   * @param path the folder to read from
   * @return the JCas of every CAS that was read, in reading order
   * @throws Exception if the reader cannot be created or a CAS cannot be created or read
   */
  public List<JCas> read(String path) throws Exception {
    final List<JCas> jCases = new ArrayList<JCas>();
    System.out.println("--- READING ---");

    final String[] includePatterns = {ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.bin"};
    @SuppressWarnings("deprecation")
    CollectionReader reader =
        CollectionReaderFactory.createReader(
            BinaryCasReader.class,
            ResourceCollectionReaderBase.PARAM_PATH,
            path,
            ResourceCollectionReaderBase.PARAM_PATTERNS,
            includePatterns);

    while (reader.hasNext()) {
      // Every document is read into a CAS of its own.
      final CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null);
      reader.getNext(cas);

      final JCas jcas = cas.getJCas();
      System.out.println(jcas.getDocumentText());
      jCases.add(jcas);
    }

    return jCases;
  }
Пример #13
0
  /**
   * Runs the client against the default input file.
   *
   * @throws Exception if the engine cannot be initialized or the file cannot be read
   * @see #run(String)
   */
  public void run() throws Exception {
    run(
        "C:\\WebScience\\Progetti\\K-People\\OntologyController_UIMA\\apache-uima\\examples\\src\\it\\webscience\\uima\\event-2031.xml");
  }

  /**
   * Creates and initializes the asynchronous client engine, reads the given file and sends its
   * content as the document text of a CAS to the service.
   *
   * @param filePath path of the file whose content is sent for processing
   * @throws Exception if the engine cannot be initialized or the file cannot be read
   */
  public void run(String filePath) throws Exception {
    // create Asynchronous Client API and initialize it
    uimaAsEngine = new BaseUIMAAsynchronousEngine_impl();

    // callback
    // uimaAsEngine.addStatusCallbackListener(new StatusCallbackListener());

    initializeUimaAsEngine(uimaAsEngine);

    String xml = readFile(filePath);

    // get an empty CAS from the Cas pool
    CAS cas = uimaAsEngine.getCAS();

    // Initialize it with input data
    cas.setDocumentText(xml);

    // Send Cas to service for processing
    uimaAsEngine.sendCAS(cas);
  }
  /**
   * Fills the CAS from {@code currentDoc}: language, document text and a {@code
   * SourceDocumentInformation} carrying URI, sizes and the document index.
   *
   * <p>The cumulated length is increased by the length of the current text before anything else
   * happens; the document index is increased only after the information has been added to the
   * indexes.
   *
   * @param cas the CAS to populate
   * @throws CollectionException if the JCas view of the CAS cannot be obtained
   */
  @Override
  public void getNext(CAS cas) throws IOException, CollectionException {
    final int textLength = currentDoc.getText().length();
    this.cumulatedLength += textLength;
    logger.info(
        "[Stream {}] Processing document {}: {} (total length processed: {})",
        this.streamName,
        this.mCurrentIndex,
        this.currentDoc.getUri(),
        this.cumulatedLength);

    try {
      final SourceDocumentInformation sdi = new SourceDocumentInformation(cas.getJCas());
      sdi.setUri(currentDoc.getUri());
      cas.setDocumentLanguage(mLanguage.getCode());
      cas.setDocumentText(currentDoc.getText());

      // The information spans the whole document text.
      sdi.setDocumentSize(textLength);
      sdi.setCumulatedDocumentSize(this.cumulatedLength);
      sdi.setBegin(0);
      sdi.setEnd(textLength);
      sdi.setOffsetInSource(0);
      sdi.setDocumentIndex(mCurrentIndex);

      // Corpus size and number of documents cannot be known in case of streaming.
      sdi.setCorpusSize(-1);
      sdi.setNbDocuments(-1);

      // Nor can it be known whether this is the last document.
      sdi.setLastSegment(false);

      sdi.addToIndexes();
      this.mCurrentIndex++;
    } catch (CASException e) {
      throw new CollectionException(e);
    }
  }
 /**
  * Builds the type system from the configured description paths and/or names; when both are
  * given the two descriptions are merged, and when neither yields a description one is created
  * through {@code TypeSystemDescriptionFactory}.
  *
  * @return the type system of a CAS created from the resulting description
  */
 private TypeSystem createTypeSystem() throws IOException, UIMAException {
   final boolean hasPaths = typeSystemDescPaths != null && typeSystemDescPaths.length > 0;
   final boolean hasNames = typeSystemDescNames != null && typeSystemDescNames.length > 0;

   TypeSystemDescription description = null;
   if (hasPaths) {
     description = createTypeSystemDescriptionFromPath(typeSystemDescPaths);
   }
   if (hasNames) {
     final TypeSystemDescription byName = createTypeSystemDescription(typeSystemDescNames);
     if (description == null) {
       description = byName;
     } else {
       description = mergeTypeSystems(asList(description, byName));
     }
   }
   if (description == null) {
     log.info("TypeSystemDescription will be created using the UIMAFit discovery");
     description = TypeSystemDescriptionFactory.createTypeSystemDescription();
   }

   // A throw-away CAS is created only to obtain the TypeSystem object.
   return CasCreationUtils.createCas(description, null, null).getTypeSystem();
 }
Пример #16
0
  /**
   * Records the first {@code Document} annotation of the CAS in the question or the answer
   * collections.
   *
   * <p>A map from token text to token frequency is built from the document's token list. A
   * document with relevance value 99 is treated as a question: its query id, relevance value and
   * map go to {@code QuesqIdList}, {@code QuesrelList} and {@code queryDictionary}. Any other
   * document is treated as an answer and goes to {@code AnsqIdList}, {@code AnsrelList} and
   * {@code answerDictionary}; an answer with relevance value 1 additionally has its text stored
   * in {@code GoldAnswerStringList} under its query id.
   *
   * @param aCas the CAS to read the document from
   * @throws ResourceProcessException if the JCas view of the CAS cannot be obtained
   */
  @Override
  public void processCas(CAS aCas) throws ResourceProcessException {

    final JCas jcas;
    try {
      jcas = aCas.getJCas();
    } catch (CASException e) {
      throw new ResourceProcessException(e);
    }

    FSIterator documents = jcas.getAnnotationIndex(Document.type).iterator();
    if (!documents.hasNext()) {
      return;
    }
    Document doc = (Document) documents.next();

    // The token list has to be populated by the preceding annotators.
    FSList fsTokenList = doc.getTokenList();
    ArrayList<Token> tokenList = Utils.fromFSListToCollection(fsTokenList, Token.class);

    HashMap<String, Integer> wordCounts = new HashMap<String, Integer>();

    if (doc.getRelevanceValue() == 99) {
      // question
      QuesqIdList.add(doc.getQueryID());
      QuesrelList.add(doc.getRelevanceValue());
      for (Token token : tokenList) {
        wordCounts.put(token.getText(), token.getFrequency());
      }
      queryDictionary.add(wordCounts);
    } else {
      // answer
      AnsqIdList.add(doc.getQueryID());
      AnsrelList.add(doc.getRelevanceValue());
      for (Token token : tokenList) {
        wordCounts.put(token.getText(), token.getFrequency());
      }
      answerDictionary.add(wordCounts);
      if (1 == doc.getRelevanceValue()) {
        GoldAnswerStringList.put(doc.getQueryID(), doc.getText());
      }
    }
  }
Пример #17
0
 /**
  * Called when the processing of a document is completed. <br>
  * When the status reports an exception, the stack trace of every reported exception is printed
  * and nothing is counted. Otherwise the entity counter is increased and the length of the
  * document text, if there is one, is added to the accumulated size.
  *
  * @param aCas CAS corresponding to the completed processing
  * @param aStatus EntityProcessStatus that holds the status of all the events for aEntity
  */
 public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
   if (aStatus.isException()) {
     List<Exception> failures = aStatus.getExceptions();
     for (Object failure : failures) {
       ((Throwable) failure).printStackTrace();
     }
     return;
   }

   entityCount++;
   final String documentText = aCas.getDocumentText();
   if (documentText == null) {
     return;
   }
   size += documentText.length();
 }
  /**
   * Obtains the JCas view of the given CAS, fills it through {@code fillJCas} and prints a dot
   * to standard error as a progress indicator.
   *
   * @param aCAS the CAS to populate
   * @throws CollectionException if the JCas view of the CAS cannot be obtained
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas view;
    try {
      view = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    fillJCas(view);

    // one dot per processed document
    System.err.print(".");
  }
Пример #19
0
  /**
   * Serialize a CAS to a file in XCAS format
   *
   * @param aCas CAS to serialize
   * @param name output file
   * @throws IOException if an I/O failure occurs
   * @throws SAXException if an error occurs generating the XML text
   */
  private void writeXCas(CAS aCas, File name) throws IOException, SAXException {
    // try-with-resources closes the stream in every case and, unlike a manual close in a
    // finally block, does not let a failing close() hide the original exception.
    try (FileOutputStream out = new FileOutputStream(name)) {
      XCASSerializer ser = new XCASSerializer(aCas.getTypeSystem());
      XMLSerializer xmlSer = new XMLSerializer(out, true);
      ser.serialize(aCas, xmlSer.getContentHandler());
    }
  }
  /**
   * Creates the analysis engine described by {@code VectorSpaceRetrieval.xml}, processes every
   * line of {@code /data/documents.txt} as a document of its own and prints the total time
   * taken in seconds.
   *
   * @param args command line arguments (not used)
   * @throws Exception if a resource is missing, or the engine cannot be created or fails
   */
  public static void main(String[] args) throws Exception {

    long startTime = System.currentTimeMillis();

    URL descUrl =
        VectorSpaceRetrieval.class.getResource(
            "/descriptors/retrievalsystem/VectorSpaceRetrieval.xml");
    if (descUrl == null) {
      throw new IllegalArgumentException("Error opening VectorSpaceRetrieval.xml");
    }
    // create AnalysisEngine
    XMLInputSource input = new XMLInputSource(descUrl);
    AnalysisEngineDescription desc =
        UIMAFramework.getXMLParser().parseAnalysisEngineDescription(input);
    AnalysisEngine anAnalysisEngine = UIMAFramework.produceAnalysisEngine(desc);
    try {
      CAS aCas = anAnalysisEngine.newCAS();

      URL docUrl = VectorSpaceRetrieval.class.getResource("/data/documents.txt");
      if (docUrl == null) {
        throw new IllegalArgumentException("Error opening data/documents.txt");
      }
      // NOTE(review): the reader uses the platform default charset — confirm the encoding of
      // documents.txt and consider passing it explicitly.
      try (BufferedReader br = new BufferedReader(new InputStreamReader(docUrl.openStream()))) {
        String sLine;
        while ((sLine = br.readLine()) != null) {
          aCas.setDocumentText(sLine);
          anAnalysisEngine.process(aCas);
          // make the CAS reusable for the next line
          aCas.reset();
        }
      }
      anAnalysisEngine.collectionProcessComplete();
    } finally {
      // Release the engine even when reading or processing failed.
      anAnalysisEngine.destroy();
    }
    long endTime = System.currentTimeMillis();

    double totalTime = (endTime - startTime) / 1000.0;
    System.out.println("Total time taken: " + totalTime);
  }
Пример #21
0
  /**
   * Serialize a CAS to a file in XMI format
   *
   * @param aCas CAS to serialize
   * @param name output file
   * @param modelFileName not used by this method
   * @throws IOException if an I/O failure occurs
   * @throws SAXException if an error occurs generating the XML text
   */
  private void writeXmi(CAS aCas, File name, String modelFileName)
      throws IOException, SAXException {
    // try-with-resources closes the stream in every case and, unlike a manual close in a
    // finally block, does not let a failing close() hide the original exception.
    try (FileOutputStream out = new FileOutputStream(name)) {
      // write XMI
      XmiCasSerializer ser = new XmiCasSerializer(aCas.getTypeSystem());
      XMLSerializer xmlSer = new XMLSerializer(out, false);
      ser.serialize(aCas, xmlSer.getContentHandler());
    }
  }
  /**
   * Sets the next entry of {@code mSentences} as the document text of the CAS and advances the
   * current index.
   *
   * @param aCAS the CAS to populate
   * @throws CollectionException if the JCas view of the CAS cannot be obtained
   */
  @Override
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas view;
    try {
      view = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    // take the sentence at the current position, then move on to the next one
    final String nextSentence = mSentences.get(mCurrentIndex++);
    view.setDocumentText(nextSentence);
  }
Пример #23
0
  /**
   * Gets the next sentence from the input file.
   *
   * <p>The trimmed line is split at its first space: the part before it is the sentence id, the
   * remainder (including that space) becomes the document text. A {@code Sentence} carrying the
   * id is added to the indexes.
   *
   * @param aCas the CAS to populate
   * @throws IOException if reading fails, the input has no more lines, or the line contains no
   *     space separating id and text
   * @throws CollectionException if the JCas view of the CAS cannot be obtained
   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
   */
  @Override
  public void getNext(CAS aCas) throws IOException, CollectionException {
    JCas jcas = null;
    try {
      jcas = aCas.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    String rawLine = mBufferdReader.readLine();
    if (rawLine == null) {
      // readLine() signals end of input with null; fail with a clear error instead of an NPE.
      throw new IOException("No more lines to read from the input");
    }

    String lineString = rawLine.trim();
    int separatorIndex = lineString.indexOf(" ");
    if (separatorIndex < 0) {
      // Without a separator substring() would fail with an index error.
      throw new IOException("Malformed input line, expected \"<id> <text>\": " + lineString);
    }
    String sentenceId = lineString.substring(0, separatorIndex);
    String sentenceText = lineString.substring(separatorIndex);

    jcas.setDocumentText(sentenceText);
    Sentence sentence = new Sentence(jcas);
    sentence.setSentenceId(sentenceId);
    sentence.addToIndexes();
  }
  /**
   * Lets the superclass fill the CAS, then adds a {@code Sentence} spanning the whole document
   * text and a {@code TextClassificationOutcome} with the outcome determined for the document.
   *
   * @param aCAS the CAS to populate
   * @throws CollectionException if the JCas view of the CAS cannot be obtained
   */
  @Override
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    super.getNext(aCAS);

    JCas jcas;
    try {
      jcas = aCAS.getJCas();
      // consider a tweet to be a sentence
      Sentence sentenceAnno = new Sentence(jcas);
      sentenceAnno.setBegin(0);
      sentenceAnno.setEnd(jcas.getDocumentText().length());
      sentenceAnno.addToIndexes();
    } catch (CASException e) {
      // keep the cause so the failure can be diagnosed
      throw new CollectionException(e);
    }

    TextClassificationOutcome outcome = new TextClassificationOutcome(jcas);
    outcome.setOutcome(getTextClassificationOutcome(jcas));
    outcome.addToIndexes();
  }
Пример #25
0
  /**
   * Processes the CAS which was populated by the TextAnalysisEngines. <br>
   * The CAS is serialized as XMI into a file in the output directory. The file name is derived
   * from the URI of the first {@code SourceDocumentInformation} in the CAS (file name, plus
   * {@code _offset} when the offset in the source is greater than zero, plus {@code .xmi}); when
   * there is no such information, or its URI is malformed, a numbered {@code docN.xmi} name is
   * used instead.
   *
   * @param aCAS a CAS which has been populated by the TAEs
   * @throws ResourceProcessException if there is an error in processing the Resource
   * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
   */
  public void processCas(CAS aCAS) throws ResourceProcessException {
    final JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new ResourceProcessException(e);
    }

    String modelFileName = null;
    File outFile = null;

    // derive the output file name from the location of the input file, if it is known
    FSIterator sourceInfos = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (sourceInfos.hasNext()) {
      SourceDocumentInformation fileLoc = (SourceDocumentInformation) sourceInfos.next();
      try {
        File inFile = new File(new URL(fileLoc.getUri()).getPath());
        StringBuilder outFileName = new StringBuilder(inFile.getName());
        if (fileLoc.getOffsetInSource() > 0) {
          outFileName.append("_").append(fileLoc.getOffsetInSource());
        }
        outFileName.append(".xmi");
        outFile = new File(mOutputDir, outFileName.toString());
        modelFileName = mOutputDir.getAbsolutePath() + "/" + inFile.getName() + ".ecore";
      } catch (MalformedURLException e1) {
        // invalid URL, fall back to the numbered file name below
      }
    }
    if (outFile == null) {
      outFile = new File(mOutputDir, "doc" + mDocNum++ + ".xmi");
    }

    // serialize as XMI and write to the output file
    try {
      writeXmi(jcas.getCas(), outFile, modelFileName);
    } catch (IOException | SAXException e) {
      throw new ResourceProcessException(e);
    }
  }
Пример #26
0
  /**
   * Reads the next file of {@code fileIterator} as UTF-8 into the CAS and stores a secondary
   * document id built from the id type of {@code filenameToIDTranslator} and the cleaned-up file
   * path. If cleaning the path fails, the stack trace is printed and the absolute path is used.
   *
   * @param aCAS the CAS to populate
   * @throws CollectionException if the JCas view of the CAS cannot be obtained
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    final String absolutePath = fileIterator.next().getAbsolutePath();
    jcas.setDocumentText(ReadWriteTextFileWithEncoding.read(absolutePath, "UTF-8"));
    numberOfFilesProcessed++;

    // fall back to the raw path when it cannot be cleaned up
    String documentId = absolutePath;
    try {
      documentId = filenameToIDTranslator.cleanItUp(absolutePath);
    } catch (StringCleanerException e) {
      e.printStackTrace();
    }

    final StringArray secondaryIds = new StringArray(jcas, 1);
    secondaryIds.set(0, filenameToIDTranslator.getIdType() + documentId);
    ISI_UIMA_Util.setDocumentSecondaryIDs(jcas, secondaryIds);
  }
Пример #27
0
	/**
	 * Callback invoked when processing of a CAS has completed. Nothing happens when no status
	 * is given. Otherwise the stack traces of all reported exceptions are printed, and then the
	 * covered text and part-of-speech value of every token in the CAS are printed to standard
	 * output.
	 *
	 * @param aCas the CAS that was processed
	 * @param aStatus the processing status; may be {@code null}
	 */
	public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
		if (aStatus == null) {
			return;
		}

		if (aStatus.isException()) {
			System.err.println("Error on process CAS call to remote service:");
			List<Exception> failures = aStatus.getExceptions();
			for (Object failure : failures) {
				((Throwable) failure).printStackTrace();
			}
		}

		try {
			final JCas view = aCas.getJCas();
			for (Token token : JCasUtil.select(view, Token.class)) {
				final String surface = token.getCoveredText();
				System.out.println(surface + " " + token.getPos().getPosValue());
			}
		} catch (CASException e) {
			e.printStackTrace();
		}
	}
Пример #28
0
  /**
   * Closes the current region: appends its sanitized text plus a line break to the document
   * text and, for headings and paragraphs with a configured annotation type, adds an annotation
   * covering the appended text (without the line break).
   *
   * @param aStyle the style of the region being closed
   * @throws IllegalStateException if no region is open, if the open region has a different
   *     style, or if the style is neither heading nor paragraph
   */
  @Override
  protected void endRegion(final Style aStyle) throws IOException {
    if (log.isTraceEnabled()) {
      log.trace("</" + aStyle + ">");
    }

    if (regionText == null) {
      throw new IllegalStateException("No region started");
    }

    if (regionStyle != aStyle) {
      throw new IllegalStateException(
          "Current region has style " + regionStyle + ", but closing region has style " + aStyle);
    }

    // Append the region text; the annotation span excludes the trailing line break.
    final int begin = text.length();
    sanitize(regionText);
    text.append(regionText.toString());
    final int end = text.length();
    text.append('\n');

    // Pick the annotation type configured for this style.
    final String annotationTypeName;
    switch (aStyle) {
      case HEADING:
        annotationTypeName = headingType;
        break;
      case PARAGRAPH:
        annotationTypeName = paragraphType;
        break;
      default:
        throw new IllegalStateException("Unknown region style: " + aStyle);
    }

    if (annotationTypeName != null) {
      final Type annotationType = cas.getTypeSystem().getType(annotationTypeName);
      final AnnotationFS annotation = cas.createAnnotation(annotationType, begin, end);
      cas.addFsToIndexes(annotation);
    }

    regionStyle = null;
    regionText = null;
  }
Пример #29
0
 /**
  * Writes one line per semantic annotation of the CAS to the output stream: namespace,
  * identifier, offset and covered text, separated by {@code fieldSeparator}. When {@code
  * replaceNewlines} is set, the covered text is split at line breaks and re-joined with spaces.
  *
  * <p>The stream set up for the CAS is unset again even when writing fails.
  *
  * @param cas the CAS whose annotations are written
  * @throws AnalysisEngineProcessException if the JCas cannot be obtained or an I/O error occurs
  */
 @Override
 public void process(CAS cas) throws AnalysisEngineProcessException {
   JCas textJCas;
   try {
     textJCas = cas.getJCas();
     setStream(textJCas);
   } catch (CASException e1) {
     throw new AnalysisEngineProcessException(e1);
   } catch (final IOException e2) {
     throw new AnalysisEngineProcessException(e2);
   }

   boolean allWritten = false;
   try {
     final FSIterator<Annotation> annotationIt = SemanticAnnotation.getIterator(textJCas);
     while (annotationIt.hasNext()) {
       final SemanticAnnotation ann = (SemanticAnnotation) annotationIt.next();
       final String text =
           replaceNewlines
               ? StringUtils.join(' ', ann.getCoveredText().split(LINEBREAK))
               : ann.getCoveredText();
       write(ann.getNamespace());
       write(fieldSeparator);
       write(ann.getIdentifier());
       write(fieldSeparator);
       write(ann.getOffset().toString());
       write(fieldSeparator);
       write(text);
       write(LINEBREAK);
     }
     allWritten = true;
   } catch (final IOException e) {
     throw new AnalysisEngineProcessException(e);
   } finally {
     // Always unset the stream, otherwise a failed write would leave it set.
     try {
       unsetStream();
     } catch (final IOException e) {
       // Report this failure only if nothing failed before, so that it never replaces the
       // exception that caused the write to abort.
       if (allWritten) {
         throw new AnalysisEngineProcessException(e);
       }
     }
   }
 }
Пример #30
0
  /**
   * Tokenizes the document text of the configured views, or of the given CAS itself when no sofa
   * names are configured.
   *
   * <p>The text is scanned character by character. Runs of letters and digits form a token;
   * whitespace, new line, special and punctuation characters end the current token; special and
   * punctuation characters additionally become single-character tokens. A sentence ends at a
   * punctuation character that is contained in {@code punctuations} and is followed by a
   * whitespace or new line character; whatever remains at the end of the text forms the last
   * sentence.
   *
   * @see org.apache.uima.analysis_component.CasAnnotator_ImplBase#process(org.apache.uima.cas.CAS)
   */
  public void process(CAS aCas) throws AnalysisEngineProcessException {

    this.logger.logrb(
        Level.INFO,
        "WhitespaceTokenizer",
        "process",
        MESSAGE_BUNDLE,
        "whitespace_tokenizer_info_start_processing");

    // Collect the views to process.
    ArrayList<CAS> views = new ArrayList<CAS>();
    if (this.sofaNames == null || this.sofaNames.length == 0) {
      // no sofa names configured: work on the CAS that was passed in
      views.add(aCas);
    } else {
      for (String sofaName : this.sofaNames) {
        Iterator<?> matchingViews = aCas.getViewIterator(sofaName);
        while (matchingViews.hasNext()) {
          views.add((CAS) matchingViews.next());
        }
      }
    }

    for (CAS view : views) {

      // createAnnotation works on the view stored in this field
      this.cas = view;

      final char[] chars = this.cas.getDocumentText().toCharArray();
      final int textEnd = chars.length;

      int tokenStart = UNDEFINED;
      int sentenceStart = 0;

      for (int pos = 0; pos < textEnd; pos++) {
        final char current = chars[pos];
        final int currentType = getCharacterType(current);

        // character class of the following character, if there is one
        final int nextType = (pos + 1 < textEnd) ? getCharacterType(chars[pos + 1]) : UNDEFINED;

        if (currentType == CH_LETTER || currentType == CH_NUMBER) {
          // a letter or digit opens a token unless one is already open
          if (tokenStart == UNDEFINED) {
            tokenStart = pos;
          }
        } else if (currentType == CH_WHITESPACE
            || currentType == CH_SPECIAL
            || currentType == CH_NEWLINE
            || currentType == CH_PUNCTUATION) {

          // every one of these character classes ends the token that is currently open
          if (tokenStart != UNDEFINED) {
            createAnnotation(this.tokenType, tokenStart, pos);
            tokenStart = UNDEFINED;
          }

          if (currentType == CH_SPECIAL) {
            // a special character is a token of its own
            createAnnotation(this.tokenType, pos, pos + 1);
          } else if (currentType == CH_PUNCTUATION) {
            // sentence end: listed punctuation followed by whitespace or a new line
            final boolean followedByBreak =
                (nextType == CH_WHITESPACE) || (nextType == CH_NEWLINE);
            if (followedByBreak && punctuations.contains(String.valueOf(current))) {
              createAnnotation(this.sentenceType, sentenceStart, pos + 1);
              sentenceStart = pos + 1;
            }
            // the punctuation character is a token of its own as well
            createAnnotation(this.tokenType, pos, pos + 1);
          }
        }
      }

      // end of text: close a token that is still open
      if (tokenStart != UNDEFINED) {
        createAnnotation(this.tokenType, tokenStart, textEnd);
      }

      // end of text: close the sentence that is still open
      if (sentenceStart != UNDEFINED) {
        createAnnotation(this.sentenceType, sentenceStart, textEnd);
      }
    }

    this.logger.logrb(
        Level.INFO,
        "WhitespaceTokenizer",
        "process",
        MESSAGE_BUNDLE,
        "whitespace_tokenizer_info_stop_processing");
  }