/**
 * Counts the n-grams found inside every annotation of {@code sentenceType} and hands the
 * resulting distribution to {@code add(cfd)}.
 *
 * <p>Each input path is split at its first {@code '/'}; the leading segment is resolved to a type
 * through {@code getInputType}, and the annotations of that type covered by the current
 * {@code sentenceType} annotation are converted to strings by {@code createStringList}. N-grams
 * of every length from {@code minNgramLength} to {@code maxNgramLength} (inclusive) are then
 * counted under their length.
 *
 * @param jcas the JCas whose CAS is searched
 * @param inputPaths paths whose first {@code '/'}-separated segment is passed to
 *     {@code getInputType} as the type name
 * @param sentenceType the type of the covering annotations that are iterated
 * @throws IOException wrapping any {@link AnalysisEngineProcessException} raised by
 *     {@code createStringList}
 */
public void add(JCas jcas, Set<String> inputPaths, Type sentenceType) throws IOException {
    ConditionalFrequencyDistribution<Integer, String> ngramCounts =
        new ConditionalFrequencyDistribution<>();
    CAS cas = jcas.getCas();

    for (AnnotationFS sentence : CasUtil.select(cas, sentenceType)) {
      for (String inputPath : inputPaths) {
        // At most two parts: the type name and the remainder of the path.
        String[] pathParts = inputPath.split("/", 2);
        Type coveredType = getInputType(cas, pathParts[0]);
        List<AnnotationFS> covered = CasUtil.selectCovered(cas, coveredType, sentence);

        try {
          List<String> strings = createStringList(covered, pathParts);
          for (int n = minNgramLength; n <= maxNgramLength; n++) {
            // min == max == n, so only n-grams of exactly this length are produced.
            ngramCounts.incAll(n, new NGramStringIterable(strings, n, n));
          }
        } catch (AnalysisEngineProcessException e) {
          throw new IOException(e);
        }
      }
    }

    add(ngramCounts);
  }
 /**
  * Creates a new link annotation and adds it to the CAS indexes.
  *
  * <p>The link type name is the part of {@code getAnnotationTypeName()} before the last
  * occurrence of {@code CHAIN}, with {@code LINK} appended.
  *
  * @param aJCas the JCas whose CAS receives the new annotation
  * @param aBegin begin offset of the new annotation
  * @param aEnd end offset of the new annotation
  * @param aFeature the feature to set on the new annotation
  * @param aLabelValue the value assigned to {@code aFeature}
  * @return the newly created link annotation, already added to the indexes
  */
 private AnnotationFS newLink(
     JCas aJCas, int aBegin, int aEnd, AnnotationFeature aFeature, String aLabelValue) {
   String linkTypeName = StringUtils.substringBeforeLast(getAnnotationTypeName(), CHAIN) + LINK;
   AnnotationFS link =
       aJCas
           .getCas()
           .createAnnotation(CasUtil.getType(aJCas.getCas(), linkTypeName), aBegin, aEnd);

   BratAjaxCasUtil.setFeature(link, aFeature, aLabelValue);
   aJCas.getCas().addFsToIndexes(link);

   return link;
 }
  /**
   * Builds a {@link TokenSequence} from the text covered by the given annotation.
   *
   * <p>Every {@code tokenType} annotation covered by {@code annotation} is passed to {@code
   * getTokensFromAnnotation} together with this instance's {@code useLemma} and {@code
   * minTokenLength} settings; all strings it returns are appended to the sequence in order.
   *
   * @param annotation an annotation representing a document segment, e.g. {@link Sentence}.
   * @param tokenType the type to use for representing tokens, usually {@link Token}, but could also
   *     be any other type.
   * @return the token sequence collected from the covered annotations
   */
  private TokenSequence generateTokenSequence(AnnotationFS annotation, Type tokenType) {
    TokenSequence sequence = new TokenSequence();
    for (AnnotationFS covered : CasUtil.selectCovered(tokenType, annotation)) {
      for (String text : getTokensFromAnnotation(covered, useLemma, minTokenLength)) {
        sequence.add(text);
      }
    }
    return sequence;
  }
 /**
  * Builds a single {@link TokenSequence} covering the whole document.
  *
  * <p>All annotations of {@code tokenType} in the CAS are visited; each one is converted by
  * {@code getTokensFromAnnotation} and the resulting strings are appended to the sequence.
  *
  * @param aJCas a CAS holding the document
  * @param tokenType this type will be used as token, e.g. Token, N-gram etc.
  * @param useLemma if this is true, use lemmas
  * @param minTokenLength the minimum token length to use
  * @return a {@link TokenSequence}
  * @throws FeaturePathException if the annotation type specified in {@code PARAM_TYPE_NAME} cannot
  *     be extracted.
  */
 protected static TokenSequence generateTokenSequence(
     JCas aJCas, Type tokenType, boolean useLemma, int minTokenLength)
     throws FeaturePathException {
   TokenSequence sequence = new TokenSequence();

   for (AnnotationFS tokenAnnotation : CasUtil.select(aJCas.getCas(), tokenType)) {
     for (String text : getTokensFromAnnotation(tokenAnnotation, useLemma, minTokenLength)) {
       sequence.add(text);
     }
   }

   return sequence;
 }
  /**
   * Generates one or several {@link TokenSequence}s from the given document.
   *
   * <p>When {@code modelEntityType} is {@code null}, a single sequence is built from the whole
   * document. Otherwise one sequence is built for every annotation of the type named by {@code
   * modelEntityType}. In both cases the token type is the one named by {@code typeName}.
   *
   * @param aJCas the JCas holding the document
   * @return the generated token sequences
   * @throws FeaturePathException propagated from the whole-document {@code generateTokenSequence}
   */
  protected Collection<TokenSequence> generateTokenSequences(JCas aJCas)
      throws FeaturePathException {
    CAS cas = aJCas.getCas();
    Type tokenType = CasUtil.getType(cas, typeName);

    // No segment type configured: the whole document yields exactly one sequence.
    if (modelEntityType == null) {
      Collection<TokenSequence> wholeDocument = new ArrayList<>(1);
      wholeDocument.add(generateTokenSequence(aJCas, tokenType, useLemma, minTokenLength));
      return wholeDocument;
    }

    // One sequence per segment annotation (e.g. per sentence).
    Collection<TokenSequence> perSegment = new ArrayList<>();
    Type segmentType = CasUtil.getType(cas, modelEntityType);
    for (AnnotationFS segment : CasUtil.select(cas, segmentType)) {
      perSegment.add(generateTokenSequence(segment, tokenType));
    }
    return perSegment;
  }
  /**
   * Loads the document {@code 62007.txt} into a fresh CAS twice, once with {@code "1"} and once
   * with {@code "5"} as the second argument of {@code getDocumentCas}, and checks the document
   * text, the total {@code selectAll} count and the number of {@code Weapon} annotations.
   */
  @Test
  public void testGetDocumentCas()
      throws ResourceInitializationException, IOException, SAXException, URISyntaxException,
          ParserConfigurationException {
    URI documentUri = new URI("62007.txt");
    String weaponTypeName = "ru.kfu.itis.issst.evex.Weapon";

    // First load ("1"): six entries overall, one of them a Weapon.
    CAS firstCas =
        CasCreationUtils.createCas(
            XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
    corpusDAO.getDocumentCas(documentUri, "1", firstCas);
    assertThat(firstCas.getDocumentText(), containsString("РИА Новости"));
    assertEquals(6, CasUtil.selectAll(firstCas).size());
    assertEquals(
        1, CasUtil.select(firstCas, CasUtil.getAnnotationType(firstCas, weaponTypeName)).size());

    // Second load ("5") into a brand-new CAS: five entries overall, no Weapon.
    CAS secondCas =
        CasCreationUtils.createCas(
            XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
    corpusDAO.getDocumentCas(documentUri, "5", secondCas);
    assertThat(secondCas.getDocumentText(), containsString("РИА Новости"));
    assertThat(CasUtil.selectAll(secondCas).size(), equalTo(5));
    assertEquals(
        0, CasUtil.select(secondCas, CasUtil.getAnnotationType(secondCas, weaponTypeName)).size());
  }
Beispiel #7
0
  /**
   * Returns the single annotation of the given type that is covered by another annotation.
   *
   * @param type the type of the annotation to look up
   * @param annotation the covering annotation; its CAS is the one searched
   * @return the one covered annotation of {@code type}, or {@code null} (after logging a warning)
   *     when the number of covered annotations of that type is not exactly one
   */
  private AnnotationFS getAnnotation(Type type, AnnotationFS annotation) {
    List<AnnotationFS> covered = CasUtil.selectCovered(annotation.getCAS(), type, annotation);

    if (covered.size() == 1) {
      return covered.get(0);
    }

    // Zero or several candidates: there is no unambiguous match to return.
    getLogger()
        .warn(
            "Could not find matching annotation of type "
                + type
                + " for annotation: "
                + annotation.getCoveredText());
    return null;
  }
  /**
   * Adds a {@code SemanticField} annotation for each token found under the annotations of the
   * type named by {@code annotationType}.
   *
   * <p>If {@code constraint} is set, it is evaluated with JXPath against each such annotation, and
   * annotations for which it yields no result are skipped. When the annotation itself is a
   * {@code Token} it is used directly; otherwise the tokens it covers are used. For every token
   * the tag returned by {@code semanticFieldResource.getSemanticTag} is stored as the value of a
   * new {@code SemanticField} spanning that token, which is then added to the indexes.
   *
   * @param aJCas the JCas to process
   * @throws AnalysisEngineProcessException wrapping a {@code ResourceAccessException} raised
   *     while processing the tokens
   */
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    CAS cas = aJCas.getCas();

    for (AnnotationFS cover : CasUtil.select(cas, CasUtil.getAnnotationType(cas, annotationType))) {
      // Skip annotations that do not satisfy the (optional) constraint.
      if (constraint != null && !JXPathContext.newContext(cover).iterate(constraint).hasNext()) {
        continue;
      }

      // A token is its own target; any other annotation contributes the tokens it covers.
      Collection<Token> targets =
          (cover instanceof Token)
              ? Collections.singleton((Token) cover)
              : JCasUtil.selectCovered(aJCas, Token.class, cover);

      try {
        for (Token target : targets) {
          String tag = semanticFieldResource.getSemanticTag(target);
          SemanticField field = new SemanticField(aJCas, target.getBegin(), target.getEnd());
          field.setValue(tag);
          field.addToIndexes();
        }
      } catch (ResourceAccessException e) {
        throw new AnalysisEngineProcessException(e);
      }
    }
  }
 /**
  * Looks up the type named by {@code annotationTypeName} in the given CAS.
  *
  * @param cas the CAS in which the type is looked up
  * @return the result of {@code CasUtil.getType} for {@code annotationTypeName}
  */
 @Override
 public Type getAnnotationType(CAS cas) {
   Type resolved = CasUtil.getType(cas, annotationTypeName);
   return resolved;
 }