/**
  * Returns the average number of tokens per chunk in a view.
  *
  * @param view the view of the JCas
  * @return the average token count over all chunks, or 0 if the view contains no chunks
  */
 private double getAverageNounPhraseTokenLength(JCas view) {
   Collection<Chunk> chunks = JCasUtil.select(view, Chunk.class);
   if (chunks.isEmpty()) {
     return 0.0;
   }
   int totalNumber = 0;
   for (Chunk chunk : chunks) {
     totalNumber += JCasUtil.selectCovered(view, Token.class, chunk).size();
   }
   return totalNumber / (double) chunks.size();
 }
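 // A minimal, hedged sanity check for the helper above. Assumptions (not taken from the
 // snippet): DKPro's Token and Chunk types as used in the method, uimaFIT's JCasFactory,
 // and illustrative offsets.
 @Test
 public void averageChunkTokenLengthSketch() throws Exception {
   JCas jcas = JCasFactory.createJCas();
   jcas.setDocumentText("the fox jumps");
   new Token(jcas, 0, 3).addToIndexes();   // "the"
   new Token(jcas, 4, 7).addToIndexes();   // "fox"
   new Token(jcas, 8, 13).addToIndexes();  // "jumps"
   new Chunk(jcas, 0, 7).addToIndexes();   // chunk covering two tokens
   new Chunk(jcas, 8, 13).addToIndexes();  // chunk covering one token
   // two chunks covering 2 and 1 tokens -> average is (2 + 1) / 2.0 = 1.5
   Assert.assertEquals(1.5, getAverageNounPhraseTokenLength(jcas), 0.001);
 }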
  @Override
  public Set<Feature> extract(JCas view, TextClassificationUnit classificationUnit)
      throws TextClassificationException {

    boolean isCompound = false;

    // Assumes the classification unit is covered by at least one token and one lemma
    POS pos = JCasUtil.selectCovered(Token.class, classificationUnit).get(0).getPos();

    String word =
        JCasUtil.selectCovered(Lemma.class, classificationUnit).get(0).getValue().toLowerCase();

    // only check for noun compounds
    if (pos instanceof N) {
      try {
        isCompound = isCompound(word);
      } catch (ResourceInitializationException e) {
        throw new TextClassificationException(e);
      }
    }

    return new Feature(IS_COMPOUND, isCompound).asSet();
  }
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {

    // Keep track of the ranges of the different relation candidates so we won't create
    // duplicate text snippets for different candidates over the same text
    Map<IndexRange, IndexRange> rangeMappings = new HashMap<IndexRange, IndexRange>();

    if (aggregateJCas == null) aggregateJCas = getEmptyJCas();

    CasCopier copier = new CasCopier(aJCas.getCas(), aggregateJCas.getCas());
    Iterator<RelationCandidate> iter = JCasUtil.iterator(aJCas, RelationCandidate.class);

    while (iter.hasNext()) {

      RelationCandidate candidate = iter.next();
      RelationCandidate candidateCopy = (RelationCandidate) copier.copyFs(candidate);

      // See if we already have this candidate in the aggregate jcas
      IndexRange candidateRange = new IndexRange(candidate);
      // The offset between the old jcas and the new of this relation candidate
      int offset = 0;

      if (rangeMappings.containsKey(candidateRange)) {
        offset = rangeMappings.get(candidateRange).getStart() - candidateRange.getStart();
        updateAnnotation(candidateCopy, offset);
        // No need to copy the covered feature annotations; that has already been done
        // for this range

      } else {
        offset = content.length() - candidateRange.getStart();
        updateAnnotation(candidateCopy, offset);
        rangeMappings.put(candidateRange, new IndexRange(candidateCopy));

        // For every feature we want to copy
        for (Class<? extends Annotation> feature : features) {

          // Iterating over the annotations of this feature type covered by this relation candidate
          for (Annotation annotation : JCasUtil.selectCovered(aJCas, feature, candidate)) {
            Annotation cAnnotation = (Annotation) copier.copyFs(annotation);
            // Updating the indices of the annotation
            updateAnnotation(cAnnotation, offset);
            aggregateJCas.addFsToIndexes(cAnnotation);
          }
        }
        // Adding the text content of the relation candidate to the new cas
        content.append(candidate.getCoveredText());
      }
      aggregateJCas.addFsToIndexes(candidateCopy);
    }
  }
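  // The updateAnnotation helper is not shown in this snippet; a plausible sketch,
  // assuming it merely shifts the copied annotation's span by the given offset:
  private static void updateAnnotation(Annotation annotation, int offset) {
    annotation.setBegin(annotation.getBegin() + offset);
    annotation.setEnd(annotation.getEnd() + offset);
  }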
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    String documentId = DocumentMetaData.get(aJCas).getDocumentId();

    List<Class<? extends ArgumentComponent>> types =
        Arrays.asList(Claim.class, Premise.class, Backing.class, Rebuttal.class, Refutation.class);
    for (Class<? extends ArgumentComponent> type : types) {
      for (ArgumentComponent argumentComponent : JCasUtil.select(aJCas, type)) {

        // skip implicit components: only components with a non-empty span are written
        int end = argumentComponent.getEnd();
        int begin = argumentComponent.getBegin();
        if (end > begin) {
          List<Sentence> sentences =
              JCasUtil2.selectOverlapping(Sentence.class, argumentComponent, aJCas);

          String filename =
              documentId
                  + "_s"
                  + sentences.size()
                  + "_"
                  + argumentComponent.getClass().getSimpleName()
                  + "_"
                  + begin
                  + "_"
                  + end
                  + ".txt";

          StringBuilder sb = new StringBuilder();

          for (Sentence sentence : sentences) {
            List<String> tokens = new ArrayList<>();
            for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
              tokens.add(token.getCoveredText());
            }

            sb.append(StringUtils.join(tokens, " "));
            sb.append("\n");
          }

          try {
            FileUtils.write(new File(outputFolder, filename), sb.toString().trim(), "UTF-8");
          } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
          }
        }
      }
    }
  }
  @Test
  public void test() throws AnalysisEngineProcessException, ResourceInitializationException {

    final String text = "The fox jumps over the dog.";
    jCas.setDocumentText(text);

    processJCas();

    final Collection<Sentence> select = JCasUtil.select(jCas, Sentence.class);
    final Sentence s1 = select.iterator().next();

    final List<PhraseChunk> phrases = JCasUtil.selectCovered(jCas, PhraseChunk.class, s1);
    Assert.assertEquals(4, phrases.size());
    Assert.assertEquals("The fox", phrases.get(0).getCoveredText());
    Assert.assertEquals("jumps over the dog", phrases.get(1).getCoveredText());
    Assert.assertEquals("over the dog", phrases.get(2).getCoveredText());
    Assert.assertEquals("the dog", phrases.get(3).getCoveredText());
  }
  @Test
  public void testProcess() throws AnalysisEngineProcessException, ResourceInitializationException {
    final String text = "The fox jumps over the dog.";
    jCas.setDocumentText(text);

    processJCas();

    final Collection<Sentence> select = JCasUtil.select(jCas, Sentence.class);
    final Sentence s1 = select.iterator().next();

    final List<Dependency> dependencies = JCasUtil.selectCovered(jCas, Dependency.class, s1);

    // We could test the output here, but it is so model-dependent that it is not
    // worth it; it suffices that annotations have been created.

    // 7 = 6 words + 1 punctuation, each should have a dependency
    assertEquals(7, dependencies.size());
  }
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    CAS cas = aJCas.getCas();

    for (AnnotationFS cover : CasUtil.select(cas, CasUtil.getAnnotationType(cas, annotationType))) {

      // If there is a constraint, check if it matches
      if (constraint != null) {
        JXPathContext ctx = JXPathContext.newContext(cover);
        boolean match = ctx.iterate(constraint).hasNext();
        if (!match) {
          continue;
        }
      }

      // If the target type is a token, use it directly, otherwise select the covered tokens
      Collection<Token> tokens;
      if (cover instanceof Token) {
        tokens = Collections.singleton((Token) cover);
      } else {
        tokens = JCasUtil.selectCovered(aJCas, Token.class, cover);
      }

      for (Token token : tokens) {
        try {
          String semanticField = semanticFieldResource.getSemanticTag(token);
          SemanticField semanticFieldAnnotation =
              new SemanticField(aJCas, token.getBegin(), token.getEnd());
          semanticFieldAnnotation.setValue(semanticField);
          semanticFieldAnnotation.addToIndexes();
        } catch (ResourceAccessException e) {
          throw new AnalysisEngineProcessException(e);
        }
      }
    }
  }
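  // Hedged illustration of the JXPath constraint check above: the constraint is an
  // XPath-like expression evaluated against the annotation as a JavaBean, and a
  // non-empty iterator counts as a match. The expression below is hypothetical and
  // assumes DKPro's Token.getPos()/POS.getPosValue() bean properties.
  private static boolean matchesConstraintSketch(Token token) {
    JXPathContext ctx = JXPathContext.newContext(token);
    return ctx.iterate("pos[posValue = 'NN']").hasNext();
  }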
  public static Tree createStanfordTree(Annotation root, TreeFactory tFact) {
    JCas aJCas;
    try {
      aJCas = root.getCAS().getJCas();
    } catch (CASException e) {
      throw new IllegalStateException("Unable to get JCas from CAS", e);
    }

    // define the new (root) node
    Tree rootNode;

    // before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
      Constituent node = (Constituent) root;
      List<Tree> childNodes = new ArrayList<Tree>();

      // get childNodes from child annotations
      FSArray children = node.getChildren();
      for (int i = 0; i < children.size(); i++) {
        childNodes.add(createStanfordTree(node.getChildren(i), tFact));
      }

      // now create the node with its children
      rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);

    } else {
      // Handle leaf annotations.
      // Leaves are always Token annotations. We also have to insert a preterminal node
      // carrying the value of the POS annotation on the token, because the POS is not
      // directly stored within the tree.
      Token wordAnnotation = (Token) root;

      // create leaf-node for the tree
      Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

      // create information about preceding and trailing whitespaces in the leaf node
      StringBuilder preWhitespaces = new StringBuilder();
      StringBuilder trailWhitespaces = new StringBuilder();

      List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
      List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

      if (precedingTokenList.size() > 0) {
        Token precedingToken = precedingTokenList.get(0);
        int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
        for (int i = 0; i < precedingWhitespaces; i++) {
          preWhitespaces.append(" ");
        }
      }
      if (followingTokenList.size() > 0) {
        Token followingToken = followingTokenList.get(0);
        int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
        for (int i = 0; i < trailingWhitespaces; i++) {
          trailWhitespaces.append(" ");
        }
      }

      // write the whitespace information as CoreAnnotations.BeforeAnnotation and
      // CoreAnnotations.AfterAnnotation into the label of the leaf node
      ((CoreLabel) wordNode.label())
          .set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
      ((CoreLabel) wordNode.label())
          .set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

      // get the POS annotation covering the token
      List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
      // there should be exactly one POS annotation per token
      assert coveredPos.size() == 1;
      POS pos = coveredPos.get(0);

      // create POS-Node in the tree and attach word-node to it
      rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList(wordNode));
    }

    return rootNode;
  }
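  // A hedged usage sketch for createStanfordTree: build a tree for each ROOT constituent
  // in a parsed CAS. Using LabeledScoredTreeFactory with CoreLabel labels is an
  // assumption, but it matches the CoreLabel casts inside the method above.
  public static void printTreesSketch(JCas jcas) {
    TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory());
    for (ROOT root : JCasUtil.select(jcas, ROOT.class)) {
      Tree tree = createStanfordTree(root, tFact);
      System.out.println(tree.pennString());
    }
  }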
  /**
   * Recreates a Stanford Tree from the StanfordParser annotations and saves all
   * non-StanfordParser annotations within the scope of the sentence in the label of the best
   * fitting node.
   *
   * <p><strong>CAUTION:</strong> <i>This method is intended for use by CAS multipliers, which
   * create new CASes from this tree. The annotation spans in the source CAS will be changed!
   * You do NOT want to use the source CAS after this method has been called. The
   * createStanfordTree() method does not change the CAS, so use that instead if the annotations
   * do not have to be recovered or accessed in the tree.</i>
   *
   * <p>TODO: This behavior could be changed by making COPIES of the annotations and changing the
   * copies instead of the originals. However, in order to be able to make copies, a dummy CAS
   * must be introduced to which the annotations can be copied. When they are recovered, they will
   * be copied to the new destination CAS anyway.
   *
   * @param root the ROOT annotation
   * @return a {@link Tree} object representing the syntax structure of the sentence
   * @throws CASException if the JCas cannot be accessed
   */
  public static Tree createStanfordTreeWithAnnotations(Annotation root) throws CASException {
    JCas aJCas = root.getCAS().getJCas();

    // Create tree
    Tree tree = createStanfordTree(root);

    // Get all non-parser related annotations
    // and all tokens (needed for span-calculations later on)
    List<Annotation> nonParserAnnotations = new ArrayList<Annotation>();
    List<Token> tokens = new ArrayList<Token>();

    // Using selectCovered instead of an iterator, because subiterators did not work in
    // all cases
    List<Annotation> annosWithinRoot = JCasUtil.selectCovered(aJCas, Annotation.class, root);

    for (Annotation curAnno : annosWithinRoot) {
      if (!(curAnno instanceof POS)
          && !(curAnno instanceof Constituent)
          && !(curAnno instanceof Dependency)
          && !(curAnno instanceof PennTree)
          && !(curAnno instanceof Lemma)
          && !(curAnno instanceof Token)
          && !(curAnno instanceof DocumentMetaData)) {
        nonParserAnnotations.add(curAnno);
      } else if (curAnno instanceof Token) {
        tokens.add((Token) curAnno);
      }
    }

    // create wrapper for tree and its tokens
    TreeWithTokens annoTree = new TreeWithTokens(tree, tokens);

    /*
     * Add annotations to the best-fitting nodes. The best-fitting node for an annotation is the
     * deepest node in the tree that still completely contains the annotation.
     */
    for (Annotation curAnno : nonParserAnnotations) {
      // get best fitting node
      Tree bestFittingNode = annoTree.getBestFit(curAnno);

      // Add annotation to node
      if (bestFittingNode != null) {

        // translate annotation span to a value relative to the
        // node-span
        IntPair span = annoTree.getSpan(bestFittingNode);
        curAnno.setBegin(curAnno.getBegin() - span.getSource());
        curAnno.setEnd(curAnno.getEnd() - span.getSource());

        // get the collection from the label of the best-fitting node in which we store UIMA
        // annotations or create it, if it does not exist
        Collection<Annotation> annotations =
            ((CoreLabel) bestFittingNode.label()).get(UIMAAnnotations.class);
        if (annotations == null) {
          annotations = new ArrayList<Annotation>();
        }

        // add the annotation to the list and write the list back to the node label
        annotations.add(curAnno);

        ((CoreLabel) bestFittingNode.label()).set(UIMAAnnotations.class, annotations);
      }
    }

    return tree;
  }
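  // Hedged read-back sketch: recovering the UIMA annotations stored in a node label by
  // createStanfordTreeWithAnnotations (UIMAAnnotations is the CoreLabel key used above).
  public static void printStoredAnnotationsSketch(Tree node) {
    Collection<Annotation> stored = ((CoreLabel) node.label()).get(UIMAAnnotations.class);
    if (stored != null) {
      for (Annotation anno : stored) {
        System.out.println(
            anno.getType().getShortName() + " [" + anno.getBegin() + ", " + anno.getEnd() + "]");
      }
    }
  }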
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {

    // Build n-gram annotations over the tokens covered by each answer and each question.
    // The logic is identical for both, so it lives in a single helper.
    for (Answer answer : JCasUtil.select(aJCas, Answer.class)) {
      createNgrams(aJCas, answer.getBegin(), answer.getEnd());
    }
    for (Question question : JCasUtil.select(aJCas, Question.class)) {
      createNgrams(aJCas, question.getBegin(), question.getEnd());
    }
  }

  private void createNgrams(JCas aJCas, int begin, int end) {

    // Note: begin - 1 preserves the original behavior of also catching a token that
    // starts one character before the annotation
    List<Token> tokenList = JCasUtil.selectCovered(aJCas, Token.class, begin - 1, end);

    int listLen = tokenList.size();

    // Slide a window of size n over the token list
    for (int i = 0; i <= listLen - n; i++) {

      FSArray tokensArray = new FSArray(aJCas, n);
      StringBuilder ngramText = new StringBuilder();

      for (int j = 0; j < n; j++) {
        Token token = tokenList.get(i + j);
        tokensArray.set(j, token);
        if (j > 0) {
          ngramText.append(' ');
        }
        ngramText.append(token.getToStringValue());
      }

      Ngram ngram = new Ngram(aJCas);
      ngram.setBegin(tokenList.get(i).getBegin());
      ngram.setEnd(tokenList.get(i + n - 1).getEnd());

      // Set the n-gram's string value, its n, and its tokens as an FSArray
      ngram.setToStringValue(ngramText.toString());
      ngram.setN(n);
      ngram.setTokens(tokensArray);

      ngram.addToIndexes();
    }
  }
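  // Self-contained, hedged illustration of the same sliding-window indexing, using plain
  // strings instead of Token annotations:
  private static void ngramWindowSketch() {
    String[] tokens = {"the", "fox", "jumps"};
    int n = 2;
    for (int i = 0; i <= tokens.length - n; i++) {
      // joins tokens[i..i+n-1]; prints "the fox" and then "fox jumps"
      System.out.println(String.join(" ", Arrays.copyOfRange(tokens, i, i + n)));
    }
  }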
  public Set<Feature> extract(JCas jcas) {

    double nrOfNPs = 0.0;
    double nrOfVPs = 0.0;
    double nrOfPPs = 0.0;
    int nrOfSbars = 0;
    int nrOfVerbphrases = 0;
    int nrOfComplexNominals = 0;
    double nrOfClauses = 0.0;
    int nrOfDependentClauses = 0;
    double nrOfTunits = 0.0;
    int nrOfComplexTunits = 0;
    int nrOfCoords = 0;

    int lengthSumNPs = 0;
    int lengthSumVPs = 0;
    int lengthSumPPs = 0;
    int lengthSumClauses = 0;
    int lengthSumTunits = 0;
    int parseTreeDepthSum = 0;
    Set<Feature> featSet = new HashSet<Feature>();
    Collection<Sentence> sentences = JCasUtil.select(jcas, Sentence.class);
    double nrOfSentences = sentences.size();
    for (Sentence s : sentences) {
      parseTreeDepthSum += ParsePatternUtils.getParseDepth(s);
      for (Constituent c : JCasUtil.selectCovered(Constituent.class, s)) {
        if (c instanceof NP) {
          nrOfNPs++;
          lengthSumNPs += c.getCoveredText().length();
        } else if (c instanceof VP) {
          nrOfVPs++;
          lengthSumVPs += c.getCoveredText().length();
        } else if (c instanceof PP) {
          nrOfPPs++;
          lengthSumPPs += c.getCoveredText().length();
        } else if (c instanceof SBAR) {
          nrOfSbars++;
          if (ParsePatternUtils.isDependentClause(c)) {
            nrOfDependentClauses++;
          }

        } else if (ParsePatternUtils.isClause(c)) {
          nrOfClauses++;
          lengthSumClauses += c.getCoveredText().length();
        }

        if (ParsePatternUtils.isTunit(c)) {
          nrOfTunits++;
          lengthSumTunits += c.getCoveredText().length();
          if (ParsePatternUtils.isComplexTunit(c)) {
            nrOfComplexTunits++;
          }
        }
        if (ParsePatternUtils.isCoordinate(c)) {
          nrOfCoords++;
        }

        if (ParsePatternUtils.isComplexNominal(c)) {
          nrOfComplexNominals++;
        }
        if (ParsePatternUtils.isVerbPhrase(c)) {
          nrOfVerbphrases++;
        }
      }
    }

    // avoid division by zero, there should be at least one sentence in the cas
    nrOfSentences = Math.max(1, nrOfSentences);

    featSet.add(new Feature(NPS_PER_SENTENCE, nrOfNPs / nrOfSentences));
    featSet.add(new Feature(VPS_PER_SENTENCE, nrOfVPs / nrOfSentences));
    featSet.add(new Feature(PPS_PER_SENTENCE, nrOfPPs / nrOfSentences));
    featSet.add(new Feature(SBARS_PER_SENTENCE, nrOfSbars / nrOfSentences));

    featSet.add(new Feature(CLAUSES_PER_SENTENCE, nrOfClauses / nrOfSentences));
    featSet.add(new Feature(DEP_CLAUSES_PER_SENTENCE, nrOfDependentClauses / nrOfSentences));
    featSet.add(new Feature(TUNITS_PER_SENTENCE, nrOfTunits / nrOfSentences));
    featSet.add(new Feature(COMPLEX_TUNITS_PER_SENTENCE, nrOfComplexTunits / nrOfSentences));
    featSet.add(new Feature(COORDS_PER_SENTENCE, nrOfCoords / nrOfSentences));

    // avoid division by zero:
    // if we don't have any NPs, the lengthSum is 0, and division by 1 yields 0 as the
    // average length
    nrOfNPs = Math.max(1, nrOfNPs);
    nrOfVPs = Math.max(1, nrOfVPs);
    nrOfPPs = Math.max(1, nrOfPPs);
    nrOfTunits = Math.max(1, nrOfTunits);

    featSet.add(new Feature(AVG_NP_LENGTH, lengthSumNPs / nrOfNPs));
    featSet.add(new Feature(AVG_VP_LENGTH, lengthSumVPs / nrOfVPs));
    featSet.add(new Feature(AVG_PP_LENGTH, lengthSumPPs / nrOfPPs));
    featSet.add(new Feature(AVG_TUNIT_LENGTH, lengthSumTunits / nrOfTunits));

    featSet.add(new Feature(AVG_TREE_DEPTH, parseTreeDepthSum / nrOfSentences));

    featSet.add(new Feature(CLAUSES_PER_TUNIT, nrOfClauses / nrOfTunits));

    nrOfClauses = Math.max(1, nrOfClauses);
    featSet.add(new Feature(AVG_CLAUSE_LENGTH, lengthSumClauses / nrOfClauses));
    featSet.add(new Feature(COMPLEX_TUNITS_PER_TUNIT, nrOfComplexTunits / nrOfTunits));
    featSet.add(new Feature(COORDS_PER_TUNIT, nrOfCoords / nrOfTunits));
    featSet.add(new Feature(COMPLEXNOMINALS_PER_TUNIT, nrOfComplexNominals / nrOfTunits));
    featSet.add(new Feature(VERBPHRASES_PER_TUNIT, nrOfVerbphrases / nrOfTunits));
    featSet.add(new Feature(DEPCLAUSE_TUNIT_RATIO, nrOfDependentClauses / nrOfTunits));

    featSet.add(new Feature(DEPCLAUSE_CLAUSE_RATIO, nrOfDependentClauses / nrOfClauses));
    featSet.add(new Feature(COORDS_PER_CLAUSE, nrOfCoords / nrOfClauses));
    featSet.add(new Feature(COMPLEXNOMINALS_PER_CLAUSE, nrOfComplexNominals / nrOfClauses));
    return featSet;
  }