/**
 * Extracts the grammatical number (sg/pl) from the Token. To be applied only on nouns / pronouns.
 *
 * @param token the token to inspect (must carry POS and lemma annotations)
 * @return "sg", "pl", or "unknown" when the number cannot be determined
 */
 public String getNumber(Token token) {
   String pos = token.getPos().getPosValue();
   if (!isNounOrPronoun(token)) {
     System.err.println(
         "Use method only for nouns / pronouns. " + pos + " " + token.getCoveredText());
     return "unknown"; // occurs e.g. for 'there' (existential)
   }
   // Plural common/proper nouns: NNS, NNPS.
   if (pos.matches("NNP?S")) {
     return "pl";
   }
   // Singular common/proper nouns: NN, NNP.
   if (pos.matches("NNP?")) {
     return "sg";
   }
   // Personal/possessive pronouns and cardinal numbers: decide by lemma.
   if (pos.matches("PRP\\$?|CD")) {
     String lemma = token.getLemma().getValue().toLowerCase();
     // Fixed: the lemma is lowercased above, so the upper-case "I" in the original
     // alternation could never match; also corrected the "onself" -> "oneself" typo.
     if (lemma.matches(
         "i|me|myself|he|him|himself|she|her|herself|it|itself|one|oneself|mine|thine|his|hers")) {
       return "sg";
     }
     if (lemma.matches(
         "we|us|ourselves|ourself|yourselves|they|them|themselves|theirselves|theirs|ours")) {
       return "pl";
     }
   }
   return "unknown";
 }
  /**
   * Add an alignment link from T to H, based on the rule t->h in which t is a phrase in T from
   * index textStart to textEnd of the tokens, and h is a phrase in H from index hypoStart to
   * hypoEnd of the tokens,
   *
   * @param textToken Token in TextView to annotate
   * @param hypoToken Token in HypoView to annotate
   * @param confidence The confidence of the rule
   * @param linkDirection The direction of the link (t to h, h to t or bidirectional).
   * @param linkInfo The relation of the rule (Wordnet synonym, Wikipedia redirect etc).
   * @param linkGroupLabel
   * @throws CASException
   */
  private void addAlignmentAnnotations(
      Token textToken,
      Token hypoToken,
      double confidence,
      Direction linkDirection,
      String linkInfo,
      StringList linkGroupLabel)
      throws CASException {

    // Prepare the Target instances
    Target textTarget = new Target(textView);
    Target hypoTarget = new Target(hypoView);

    // Prepare an FSArray instance and put the target annotations in it
    FSArray textAnnots = new FSArray(textView, 1);
    FSArray hypoAnnots = new FSArray(hypoView, 1);

    textAnnots.set(0, textToken);
    hypoAnnots.set(0, hypoToken);

    textTarget.setTargetAnnotations(textAnnots);
    hypoTarget.setTargetAnnotations(hypoAnnots);

    // Set begin and end value of the Target annotations
    textTarget.setBegin(textToken.getBegin());
    textTarget.setEnd(textToken.getEnd());
    hypoTarget.setBegin(hypoToken.getBegin());
    hypoTarget.setEnd(hypoToken.getEnd());

    // Add the targets to the indices
    textTarget.addToIndexes();
    hypoTarget.addToIndexes();

    // Mark an alignment.Link and add it to the hypothesis view
    Link link = new Link(hypoView);
    link.setTSideTarget(textTarget);
    link.setHSideTarget(hypoTarget);

    // Set the link direction
    link.setDirection(linkDirection);

    // Set strength
    link.setStrength(confidence);

    // Set Group label
    link.setGroupLabel(linkGroupLabel);

    // Add the link information
    link.setAlignerID(ALIGNER_ID);
    link.setAlignerVersion(ALIGNER_VERSION);
    link.setLinkInfo(linkInfo);

    // Mark begin and end according to the hypothesis target
    link.setBegin(hypoTarget.getBegin());
    link.setEnd(hypoTarget.getEnd());

    // Add to index
    link.addToIndexes();
  }
 /**
  * Returns true if the token's POS tag marks it as a noun, pronoun, or a noun-like tag
  * (adjectives, determiners, cardinal numbers, and selected wh-words).
  *
  * @param token the token to inspect (must carry POS and lemma annotations)
  * @return true for noun-like tokens, false otherwise
  */
 public static boolean isNounOrPronoun(Token token) {
   String pos = token.getPos().getPosValue();
   // JJ: allows things like "British" / "Australian" which are marked as NEs in ACE.
   // NOTE(review): in the second alternation, "PRP$?" contains an unescaped "$" (an optional
   // end anchor), so it effectively only matches "PRP" -- the possessive "PRP$" is already
   // covered by "PRP\\$?" above. Likewise "WP" there makes the lemma-restricted check below
   // reachable only for "WDT". Regexes kept byte-identical to preserve behavior; only the
   // double negation (!(...) -> return false; return true) was simplified to a direct return.
   return pos.startsWith("N")
       || pos.matches("PRP\\$?|CD|JJS?")
       || pos.matches("DT|WHNP|WP|PRP$?")
       || (pos.matches("WDT|WP") && token.getLemma().getValue().matches("who|which|that"));
 }
// ---- Ejemplo n.º 4 (snippet separator from the original collection; score: 0) ----
 /**
  * Creates and indexes a Token covering the trimmed span [aBegin, aEnd), or returns null when the
  * trimmed span is empty or token writing is disabled.
  */
 protected Token createToken(
     final JCas aJCas, final int aBegin, final int aEnd, final int aIndex) {
   int[] trimmedSpan = new int[] {aBegin, aEnd};
   trim(aJCas.getDocumentText(), trimmedSpan);
   // Guard clause: nothing to emit for empty spans or when token output is switched off.
   if (isEmpty(trimmedSpan[0], trimmedSpan[1]) || !isWriteToken()) {
     return null;
   }
   Token created = new Token(aJCas, trimmedSpan[0], trimmedSpan[1]);
   created.addToIndexes(aJCas);
   return created;
 }
  /**
   * Writes each non-implicit argument component (Claim, Premise, Backing, Rebuttal, Refutation)
   * to a plain-text file containing the tokenized text of all sentences overlapping the
   * component. The file name encodes document id, sentence count, component type, and offsets.
   *
   * @param aJCas the CAS to process
   * @throws AnalysisEngineProcessException if writing an output file fails
   */
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    String documentId = DocumentMetaData.get(aJCas).getDocumentId();

    // Typed list instead of the original raw Class[]; this also removes the unchecked
    // (ArgumentComponent) cast in the loop below.
    List<Class<? extends ArgumentComponent>> types =
        Arrays.asList(Claim.class, Premise.class, Backing.class, Rebuttal.class, Refutation.class);
    for (Class<? extends ArgumentComponent> type : types) {
      for (ArgumentComponent argumentComponent : JCasUtil.select(aJCas, type)) {

        // Non-implicit components only (implicit ones have a zero-length span).
        int end = argumentComponent.getEnd();
        int begin = argumentComponent.getBegin();
        if (end > begin) {
          List<Sentence> sentences =
              JCasUtil2.selectOverlapping(Sentence.class, argumentComponent, aJCas);

          String filename =
              documentId
                  + "_s"
                  + sentences.size()
                  + "_"
                  + argumentComponent.getClass().getSimpleName()
                  + "_"
                  + begin
                  + "_"
                  + end
                  + ".txt";

          // One line per sentence, tokens joined by single spaces.
          StringBuilder sb = new StringBuilder();
          for (Sentence sentence : sentences) {
            List<String> tokens = new ArrayList<>();
            for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
              tokens.add(token.getCoveredText());
            }

            sb.append(StringUtils.join(tokens, " "));
            sb.append("\n");
          }

          try {
            // NOTE(review): this FileUtils.write overload uses the platform default charset
            // (deprecated in commons-io); consider passing StandardCharsets.UTF_8 -- kept
            // as-is to avoid changing the bytes written on existing deployments.
            FileUtils.write(new File(outputFolder, filename), sb.toString().trim());
          } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
          }
        }
      }
    }
  }
  /**
   * Returns countability information according to the Celex database of English nouns.
   *
   * @param token the token to look up (its lemma is lowercased for the lookup)
   * @return the Celex countability class, "NO-NOUN" for non-nouns, or "none" when the lemma is
   *     not in the database
   * @throws IllegalStateException if Celex support was not configured (USE_CELEX is false)
   */
  public String getCountability(Token token) {
    if (!USE_CELEX) {
      System.err.println(
          "This should never happen, don't call this function if you did not configure to use Celex!");
      throw new IllegalStateException();
    }

    if (!token.getPos().getPosValue().startsWith("N")) {
      return "NO-NOUN";
    }
    // Compute the lowercased lemma once (the original computed it twice) and collapse the
    // containsKey/get pair into a single equivalent getOrDefault lookup.
    String lemma = token.getLemma().getValue().toLowerCase();
    return countability.getOrDefault(lemma, "none");
  }
// ---- Ejemplo n.º 7 (snippet separator from the original collection; score: 0) ----
  /**
   * Extracts token-length statistics over the whole CAS: token count and the minimum, maximum,
   * and mean token length in characters.
   *
   * @param jcas the CAS to extract features from
   * @return features nb_/max_/min_/mean_ for the TOKEN feature family
   * @throws TextClassificationException never thrown here; declared by the interface
   */
  @Override
  public List<Feature> extract(JCas jcas) throws TextClassificationException {

    double nbToken = 0;
    double minToken = -1; // stays -1 when the document contains no tokens (original behavior)
    double maxToken = 0;
    double sumToken = 0;
    for (Token token : JCasUtil.select(jcas, Token.class)) {
      // Hoisted: the original called getCoveredText().length() up to four times per token.
      int len = token.getCoveredText().length();
      nbToken++;
      if (minToken < 0 || len < minToken) {
        minToken = len;
      }
      if (len > maxToken) {
        maxToken = len;
      }
      sumToken += len;
    }
    // Fixed: the original wrapped the division in try/catch, but floating-point division by
    // zero yields NaN and never throws -- so the catch was dead and an empty document produced
    // a NaN mean. Guard explicitly instead.
    double meanToken = nbToken > 0 ? sumToken / nbToken : 0;

    List<Feature> featList = new ArrayList<Feature>();
    featList.add(new Feature("nb_" + TOKEN, nbToken));
    featList.add(new Feature("max_" + TOKEN + "_size", maxToken));
    featList.add(new Feature("min_" + TOKEN + "_size", minToken));
    featList.add(new Feature("mean_" + TOKEN + "_size", meanToken));
    return featList;
  }
 /**
  * Extracts grammatical person from a Token. To be applied only on nouns / pronouns.
  *
  * @param token the token to inspect (must carry POS and lemma annotations)
  * @return "1" for first person, "2" for second person, "3" otherwise (including existential
  *     'there')
  * @throws IllegalArgumentException if the token is neither a noun/pronoun nor existential
  *     'there'
  */
 public String getPerson(Token token) {
   if (!isNounOrPronoun(token)) {
     if (token.getPos().getPosValue().equals("EX")) {
       return "3"; // existential 'there'
     } else {
       System.err.println("Use getPerson method only for nouns / pronouns.");
       throw new IllegalArgumentException();
     }
   }
   String lemma = token.getLemma().getValue().toLowerCase();
   String person = "3";
   if (lemma.matches("i|we|me|us|myself|ourselves|ourself")) {
     person = "1";
   } else if (lemma.matches("you|ye|thou|thee|yourself|thyself|yourselves")) {
     // "yourself" appeared twice in the original alternation; duplicate removed (no behavior
     // change).
     person = "2";
   }
   return person;
 }
 /**
  * Extracts the noun type from the POS tag. Returns proper/common/pronoun.
  *
  * @param token the token to inspect (must carry POS and lemma annotations)
  * @return "proper", "common", "pronoun", or "unknown"
  * @throws IllegalArgumentException if the token is neither a noun/pronoun nor existential
  *     'there'
  */
 public static String getNounType(Token token) {
   if (!isNounOrPronoun(token)) {
     if (token.getPos().getPosValue().equals("EX")) {
       return "unknown"; // existential 'there'
     } else {
       // Fixed copy-paste error: the message previously referred to getPerson.
       System.err.println("Use getNounType method only for nouns / pronouns.");
       throw new IllegalArgumentException();
     }
   }
   // Fetch the POS value once instead of re-reading it for every check.
   String pos = token.getPos().getPosValue();
   if (pos.matches("NNPS?")) {
     return "proper";
   }
   if (pos.matches("NNS?")) {
     return "common";
   }
   if (isPronoun(pos, token.getLemma().getValue())) {
     return "pronoun";
   }
   return "unknown";
 }
// ---- Ejemplo n.º 10 (snippet separator from the original collection; score: 0) ----
	/**
	 * Callback invoked when the remote service has finished processing a CAS. Reports any
	 * processing exceptions to stderr, then dumps every token with its POS value to stdout.
	 *
	 * @param aCas the processed CAS
	 * @param aStatus processing status; may be null, in which case nothing is done
	 */
	public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
		if (aStatus != null) {
			if (aStatus.isException()) {
				System.err.println("Error on process CAS call to remote service:");
				// The list elements are already Exceptions; the original (Throwable) cast and
				// index-based loop were unnecessary.
				for (Exception exception : aStatus.getExceptions()) {
					exception.printStackTrace();
				}
			}

			try {
				JCas cas = aCas.getJCas();

				for (Token token : JCasUtil.select(cas, Token.class)) {
					System.out.println(token.getCoveredText() + " " + token.getPos().getPosValue());
				}

			} catch (CASException e) {
				e.printStackTrace();
			}
		}
	}
// ---- Ejemplo n.º 11 (snippet separator from the original collection; score: 0) ----
  /**
   * Annotates tokens with semantic fields. For every covering annotation of the configured type
   * (optionally filtered by a JXPath constraint), a SemanticField annotation is created over each
   * covered token, carrying the tag provided by the semantic field resource.
   */
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    CAS cas = aJCas.getCas();

    for (AnnotationFS cover : CasUtil.select(cas, CasUtil.getAnnotationType(cas, annotationType))) {

      // Skip covers that fail the optional JXPath constraint.
      if (constraint != null && !JXPathContext.newContext(cover).iterate(constraint).hasNext()) {
        continue;
      }

      // A Token cover is annotated directly; any other type contributes its covered tokens.
      Collection<Token> coveredTokens =
          cover instanceof Token
              ? Collections.singleton((Token) cover)
              : JCasUtil.selectCovered(aJCas, Token.class, cover);

      for (Token coveredToken : coveredTokens) {
        try {
          String tag = semanticFieldResource.getSemanticTag(coveredToken);
          SemanticField fieldAnnotation =
              new SemanticField(aJCas, coveredToken.getBegin(), coveredToken.getEnd());
          fieldAnnotation.setValue(tag);
          fieldAnnotation.addToIndexes();
        } catch (ResourceAccessException e) {
          throw new AnalysisEngineProcessException(e);
        }
      }
    }
  }
  /**
   * Re-tokenizes existing Token annotations by splitting each token at boundaries where the
   * character class (as decided by {@code getPattern}) changes. Segments whose pattern is
   * excluded from output are dropped; depending on configuration the original covering token is
   * removed. Index mutations are deferred until after iteration.
   */
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // NOTE(review): buf is a field reset here but not read in this method -- presumably used by
    // addToken/getPattern; confirm before removing.
    buf = new StringBuilder();
    List<Token> toAdd = new ArrayList<Token>();
    List<Token> toRemove = new ArrayList<Token>();

    for (Token t : select(aJCas, Token.class)) {
      String text = t.getCoveredText();
      int offset = t.getBegin();
      int start = 0;
      // NOTE(review): charAt(0) throws StringIndexOutOfBoundsException for a zero-length token;
      // assumes the upstream tokenizer never produces empty tokens -- confirm.
      SplitPattern lastPattern = getPattern(text.charAt(0), null);
      // NOTE(review): firstToken is assigned but never read in this method.
      Token firstToken = null;
      for (int i = 1; i < text.length(); i++) {
        SplitPattern pattern = getPattern(text.charAt(i), lastPattern);
        if (pattern != lastPattern) {
          // Character class changed: close the current segment [start, i) and emit it unless
          // its pattern is excluded from output.
          if (lastPattern == null || lastPattern.includeInOutput) {
            Token nt = addToken(aJCas, offset, text, start, i, toAdd);
            firstToken = (firstToken == null) ? nt : firstToken;
          }
          start = i;
        }
        lastPattern = pattern;
      }

      // If we would just create the same token again, better do nothing
      if (start == 0) {
        // That is - if the whole token matches something to exclude, we remove it
        if (lastPattern != null && !lastPattern.includeInOutput) {
          toRemove.add(t);
        }
        continue;
      }

      if (deleteCover) {
        toRemove.add(t);
      }

      // The rest goes into the final token
      if (lastPattern == null || lastPattern.includeInOutput) {
        addToken(aJCas, offset, text, start, text.length(), toAdd);
      }
    }

    // Apply deferred mutations: adding/removing while iterating the token index would be unsafe.
    for (Token t : toAdd) {
      t.addToIndexes();
    }

    for (Token t : toRemove) {
      t.removeFromIndexes();
    }
  }
  /**
   * Returns true if the Token is a bare plural (definition by Reiter: excludes the quantified
   * cases -- different from Suh!!).
   *
   * @param jCas the CAS the token belongs to
   * @param token the token to test
   * @param childNodeMap map from tokens to their child dependencies
   * @return true if the token is a plural noun without a determiner/possessive child
   */
  public static Boolean isBarePlural(
      JCas jCas, Token token, HashMap<Token, Set<Dependency>> childNodeMap) {
    // Must be a plural common or proper noun (NNS / NNPS).
    String posValue = token.getPos().getPosValue();
    if (!posValue.matches("NNP?S")) {
      return false;
    }

    // No recorded child dependencies means it cannot be quantified, hence bare.
    if (!childNodeMap.containsKey(token)) {
      return true;
    }

    // A determiner or possessive governed by this token disqualifies it.
    for (Dependency dependency : childNodeMap.get(token)) {
      boolean governedByToken = dependency.getGovernor() == token;
      if (governedByToken && dependency.getDependencyType().matches("det|poss")) {
        return false;
      }
    }
    return true;
  }
// ---- Ejemplo n.º 14 (snippet separator from the original collection; score: 0) ----
  /**
   * Recursively converts a UIMA constituent/token annotation into a Stanford {@code Tree}.
   * Constituents become inner nodes labeled with their constituent type; tokens become a
   * preterminal POS node over a word leaf, with surrounding whitespace recorded as
   * CoreAnnotations Before/After annotations on the leaf label.
   */
  public static Tree createStanfordTree(Annotation root, TreeFactory tFact) {
    JCas aJCas;
    try {
      aJCas = root.getCAS().getJCas();
    } catch (CASException e) {
      // NOTE(review): the original cause is dropped here; consider
      // new IllegalStateException("...", e) to preserve the stack trace.
      throw new IllegalStateException("Unable to get JCas from JCas wrapper");
    }

    // define the new (root) node
    Tree rootNode;

    // before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
      Constituent node = (Constituent) root;
      List<Tree> childNodes = new ArrayList<Tree>();

      // get childNodes from child annotations (depth-first recursion)
      FSArray children = node.getChildren();
      for (int i = 0; i < children.size(); i++) {
        childNodes.add(createStanfordTree(node.getChildren(i), tFact));
      }

      // now create the node with its children
      rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);

    } else {
      // Handle leaf annotations
      // Leafs are always Token-annotations
      // We also have to insert a Preterminal node with the value of the
      // POS-Annotation on the token
      // because the POS is not directly stored within the tree
      Token wordAnnotation = (Token) root;

      // create leaf-node for the tree
      Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

      // create information about preceding and trailing whitespaces in the leaf node
      StringBuilder preWhitespaces = new StringBuilder();
      StringBuilder trailWhitespaces = new StringBuilder();

      List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
      List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

      // Whitespace count = gap between this token's span and its neighbor's span.
      if (precedingTokenList.size() > 0) {
        Token precedingToken = precedingTokenList.get(0);
        int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
        for (int i = 0; i < precedingWhitespaces; i++) {
          preWhitespaces.append(" ");
        }
      }
      if (followingTokenList.size() > 0) {
        Token followingToken = followingTokenList.get(0);
        int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
        for (int i = 0; i < trailingWhitespaces; i++) {
          trailWhitespaces.append(" ");
        }
      }

      // write whitespace information as CoreAnnotation.BeforeAnnotation and
      // CoreAnnotation.AfterAnnotation to the node add annotation to list and write back to
      // node label
      ((CoreLabel) wordNode.label())
          .set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
      ((CoreLabel) wordNode.label())
          .set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

      // get POS-annotation
      // get the token that is covered by the POS
      List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
      // the POS should only cover one token
      // NOTE(review): this assert is only active with -ea; with zero covered POS annotations
      // the get(0) below throws IndexOutOfBoundsException in production.
      assert coveredPos.size() == 1;
      POS pos = coveredPos.get(0);

      // create POS-Node in the tree and attach word-node to it
      rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList((new Tree[] {wordNode})));
    }

    return rootNode;
  }
// ---- Ejemplo n.º 15 (snippet separator from the original collection; score: 0) ----
  /**
   * Reads a CoNLL-2012-style column format sentence by sentence and materializes the annotations
   * into the CAS: tokens, lemmas, POS tags, constituent parses, word senses, coreference chains,
   * named entities, and semantic predicates with their arguments. One sentence is emitted per
   * output line.
   */
  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    try {
      if (readPos) {
        posMappingProvider.configure(aJCas.getCas());
      }

      if (readConstituent) {
        constituentMappingProvider.configure(aJCas.getCas());
      }
    } catch (AnalysisEngineProcessException e) {
      throw new IOException(e);
    }

    // Open coreference chains keyed by chain id; the map survives across sentences so chains
    // can span sentence boundaries.
    Map<String, CoreferenceLink> chains = new HashMap<>();

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      StringBuilder parse = new StringBuilder();

      // Tokens, Lemma, POS
      Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
      List<SemPred> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokenById.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag (mapped through the configured tagset mapping)
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        // Semantic predicate for this token, if any
        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        // Accumulate the bracketed parse; "*" is the placeholder for this token's leaf.
        if (!UNUSED.equals(word[PARSE]) && readConstituent) {
          String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")");
          parse.append(fixed);
        }

        if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
          WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
          wordSense.setValue(word[WORD_SENSE]);
          wordSense.addToIndexes();
        }

        // Coreference: the last column holds "|"-separated chain fragments like "(12", "12)"
        // or "(12)".
        if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
          String[] chainFragments = word[word.length - 1].split("\\|");
          for (String chainFragment : chainFragments) {
            boolean beginning = chainFragment.startsWith("(");
            boolean ending = chainFragment.endsWith(")");

            String chainId =
                chainFragment.substring(
                    beginning ? 1 : 0,
                    ending ? chainFragment.length() - 1 : chainFragment.length());

            CoreferenceLink link = chains.get(chainId);
            if (beginning) {
              if (link == null) {
                // First mention of this chain: start a new chain headed by this link.
                link = new CoreferenceLink(aJCas);
                CoreferenceChain chain = new CoreferenceChain(aJCas);
                chain.setFirst(link);
                chain.addToIndexes();
              } else {
                // Subsequent mention: append a new link to the existing chain.
                CoreferenceLink newLink = new CoreferenceLink(aJCas);
                link.setNext(newLink);
                link = newLink;
              }
              link.setReferenceType(chainId);
              link.setBegin(token.getBegin());
            }

            if (ending) {
              // NOTE(review): if a ")" fragment appears without a preceding "(" for the same
              // chain id (malformed input), link is null here and this throws NPE -- confirm
              // input is always well-formed.
              link.setEnd(token.getEnd());
              link.addToIndexes();
            }

            chains.put(chainId, link);
          }
        }

        sentenceEnd = token.getEnd();
      }

      // Named entities
      if (readNamedEntity) {
        int currentNeBegin = -1;
        String currentNeType = null;
        for (int i = 0; i < words.size(); i++) {
          String ne = words.get(i)[NAMED_ENTITIES];
          boolean beginning = ne.startsWith("(");
          boolean ending = ne.endsWith(")");

          // When a NE is beginning, we remember what the NE is and where it began
          if (beginning) {
            // The NE is beginning with "(" and either ending with "(" or "*", so we trim
            // the first and last character
            currentNeType = ne.substring(1, ne.length() - 1);
            currentNeBegin = i;
          }

          // We need to create an annotation if the current token is the end of an annotation
          if (ending) {
            // Determine begin and end of named entity
            // NOTE(review): tokenById is keyed by the file's ID column, but currentNeBegin/i
            // are 0-based list positions -- this only works if the ID column is 0-based and
            // gapless; confirm against the input format.
            int begin = tokenById.get(currentNeBegin).getBegin();
            int end = tokenById.get(i).getEnd();

            // Add named entity
            NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
            namedEntity.setValue(currentNeType);
            namedEntity.addToIndexes();

            // Forget remembered named entity
            currentNeBegin = -1;
            currentNeType = null;
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time; predicate p reads column APRED + p.
        for (int p = 0; p < preds.size(); p++) {
          SemPred pred = preds.get(p);
          List<SemArgLink> args = new ArrayList<>();

          int currentArgBegin = -1;
          String currentArgType = null;
          for (int i = 0; i < words.size(); i++) {
            String ne = words.get(i)[APRED + p];
            boolean beginning = ne.startsWith("(");
            boolean ending = ne.endsWith(")");

            // When a arg is beginning, we remember what the NE is and where it began
            if (beginning) {
              // The arg is beginning with "(" and either ending with "(" or "*", so
              // we trim the first and last character
              currentArgType = ne.substring(1, ne.length() - 1);
              currentArgBegin = i;
            }

            // We need to create an annotation if the current token is the end of an
            // annotation
            if (ending) {
              // Determine begin and end of argument
              // NOTE(review): same list-position vs. ID-column keying caveat as in the
              // named-entity section above.
              int begin = tokenById.get(currentArgBegin).getBegin();
              int end = tokenById.get(i).getEnd();

              // Add named entity unless it is a (V*) which has the same offsets as
              // the predicate
              if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                SemArg arg = new SemArg(aJCas, begin, end);
                arg.addToIndexes();

                SemArgLink link = new SemArgLink(aJCas);
                link.setRole(currentArgType);
                link.setTarget(arg);
                args.add(link);
              }

              // Forget remembered arg
              currentArgBegin = -1;
              currentArgType = null;
            }
          }

          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      // NOTE(review): called unconditionally -- when readConstituent is false, parse is empty;
      // confirm convertPennTree/parsePennTree tolerate an empty string.
      converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));

      // Once sentence per line.
      doc.add("\n");
    }

    doc.close();
  }
// ---- Ejemplo n.º 16 (snippet separator from the original collection; score: 0) ----
  /**
   * Reads a CoNLL-2009-style column format sentence by sentence and materializes the annotations
   * into the CAS: tokens, lemmas, POS tags, morphological features, dependency relations, and
   * semantic predicates with their arguments. One sentence is emitted per output line.
   */
  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
      try {
        posMappingProvider.configure(aJCas.getCas());
      } catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
      }
    }

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      // Tokens, Lemma, POS; tokens are keyed by the file's ID column for the dependency and
      // semantic-argument passes below.
      Map<Integer, Token> tokens = new HashMap<Integer, Token>();
      List<SemanticPredicate> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokens.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag (mapped through the configured tagset mapping)
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        // Read morphological features
        if (!UNUSED.equals(word[FEAT]) && readMorph) {
          MorphologicalFeatures morphtag =
              new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
          morphtag.setValue(word[FEAT]);
          morphtag.addToIndexes();
        }

        // Semantic predicate for this token, if any
        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemanticPredicate pred = new SemanticPredicate(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        sentenceEnd = token.getEnd();
      }

      // Dependencies
      if (readDependency) {
        for (String[] word : words) {
          if (!UNUSED.equals(word[DEPREL])) {
            int depId = Integer.valueOf(word[ID]);
            int govId = Integer.valueOf(word[HEAD]);

            // Model the root as a loop onto itself
            if (govId == 0) {
              Dependency rel = new ROOT(aJCas);
              rel.setGovernor(tokens.get(depId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            } else {
              Dependency rel = new Dependency(aJCas);
              rel.setGovernor(tokens.get(govId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            }
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time; predicate p reads column APRED + p.
        for (int p = 0; p < preds.size(); p++) {
          List<SemanticArgument> args = new ArrayList<SemanticArgument>();
          for (String[] word : words) {
            if (!UNUSED.equals(word[APRED + p])) {
              Token token = tokens.get(Integer.valueOf(word[ID]));
              SemanticArgument arg = new SemanticArgument(aJCas, token.getBegin(), token.getEnd());
              arg.setRole(word[APRED + p]);
              arg.addToIndexes();
              args.add(arg);
            }
          }
          SemanticPredicate pred = preds.get(p);
          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      // Once sentence per line.
      doc.add("\n");
    }

    doc.close();
  }
// ---- Ejemplo n.º 17 (snippet separator from the original collection; score: 0) ----
  /**
   * Converts the CAS into a LAPPS Interchange Format (LIF) Container -- paragraphs, sentences,
   * tokens (with POS/lemma features), named entities, dependency structures, and constituent
   * trees -- and serializes it as pretty-printed JSON to the configured output stream.
   */
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Convert UIMA to LIF Container
    Container container = new Container();
    container.setLanguage(aJCas.getDocumentLanguage());
    container.setText(aJCas.getDocumentText());

    View view = container.newView();

    // Paragraph
    for (Paragraph p : select(aJCas, Paragraph.class)) {
      view.newAnnotation(id(PARAGRAPH, p), Discriminators.Uri.PARAGRAPH, p.getBegin(), p.getEnd());
    }

    // Sentence
    for (Sentence s : select(aJCas, Sentence.class)) {
      view.newAnnotation(id(SENTENCE, s), Discriminators.Uri.SENTENCE, s.getBegin(), s.getEnd());
    }

    // Token, POS, Lemma -- POS and lemma are optional features on the token annotation
    for (Token t : select(aJCas, Token.class)) {
      Annotation a =
          view.newAnnotation(id(TOKEN, t), Discriminators.Uri.TOKEN, t.getBegin(), t.getEnd());
      if (t.getPos() != null) {
        a.addFeature(Features.Token.POS, t.getPos().getPosValue());
      }

      if (t.getLemma() != null) {
        a.addFeature(Features.Token.LEMMA, t.getLemma().getValue());
      }
    }

    // NamedEntity
    for (NamedEntity neAnno : select(aJCas, NamedEntity.class)) {
      Annotation ne =
          view.newAnnotation(
              id(NAMED_ENTITY, neAnno), Discriminators.Uri.NE, neAnno.getBegin(), neAnno.getEnd());
      ne.setLabel(neAnno.getValue());
    }

    // Dependency: one relation per dependency, grouped per sentence into a DependencyStructure
    for (Sentence s : select(aJCas, Sentence.class)) {
      Set<String> depRelIds = new TreeSet<>();

      for (Dependency dep : selectCovered(Dependency.class, s)) {
        String depRelId = id(DEPENDENCY, dep);
        // LAPPS dependencies inherit from Relation which has no offsets
        Annotation depRel = view.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY);
        depRel.setLabel(dep.getDependencyType());
        depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor()));
        depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent()));
        depRelIds.add(depRelId);
      }

      if (!depRelIds.isEmpty()) {
        Annotation depStruct =
            view.newAnnotation(
                id(DEPENDENCY_STRUCTURE, s),
                Discriminators.Uri.DEPENDENCY_STRUCTURE,
                s.getBegin(),
                s.getEnd());
        depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds);
      }
    }

    // Constituents: each parse ROOT is flattened into a PhraseStructure over its constituents
    for (ROOT r : select(aJCas, ROOT.class)) {
      Set<String> constituents = new LinkedHashSet<>();
      convertConstituent(view, r, constituents);

      Annotation phraseStruct =
          view.newAnnotation(
              id(PHRASE_STRUCTURE, r),
              Discriminators.Uri.PHRASE_STRUCTURE,
              r.getBegin(),
              r.getEnd());
      phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents);
    }

    // Serialize the container as pretty JSON; the stream is closed by try-with-resources.
    try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) {
      String json = Serializer.toPrettyJson(container);
      IOUtils.write(json, docOS, encoding);
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
  /**
   * Checks a single token against its expected reference values — covered text, begin/end
   * offsets, lemma, POS type/value, NER type and outgoing dependencies — failing fast on the
   * first mismatch.
   *
   * @param token the token taken from the CAS to verify
   * @param info the expected reference values for this token
   * @throws LAPVerificationException if any observed value differs from the expected one
   */
  private void verifyToken(Token token, TestTokenInfo info) throws LAPVerificationException {
    // Shared "id:text" prefix used by every failure message.
    String label = info.id + ":" + info.text;

    String coveredText = token.getCoveredText();
    if (!info.text.equals(coveredText)) {
      throw new LAPVerificationException(
          "Bad token text for " + label
              + ", expected \"" + info.text + "\", got \"" + coveredText + "\"");
    }
    if (token.getBegin() != info.begin) {
      throw new LAPVerificationException(
          "Bad token begin index for " + label
              + ", expected " + info.begin + ", got " + token.getBegin());
    }
    if (token.getEnd() != info.end) {
      throw new LAPVerificationException(
          "Bad token end index for " + label
              + ", expected " + info.end + ", got " + token.getEnd());
    }
    String lemma = token.getLemma().getValue();
    if (!info.lemma.equals(lemma)) {
      throw new LAPVerificationException(
          "Bad token lemma for " + label
              + ", expected \"" + info.lemma + "\", got \"" + lemma + "\"");
    }
    String posType = token.getPos().getType().getShortName();
    if (!info.posType.equals(posType)) {
      throw new LAPVerificationException(
          "Bad token POS type for " + label
              + ", expected " + info.posType + ", got " + posType);
    }
    String posValue = token.getPos().getPosValue();
    if (!info.posValue.equals(posValue)) {
      throw new LAPVerificationException(
          "Bad token POS value for " + label
              + ", expected \"" + info.posValue + "\", got \"" + posValue + "\"");
    }

    // A token may carry at most one NER annotation; none at all leaves nerType null,
    // which is compared null-safely against the expectation below.
    List<NamedEntity> ners = JCasUtil.selectCovered(NamedEntity.class, token);
    if (ners.size() > 1) {
      throw new LAPVerificationException(
          "Got more than one NER annotation for " + label + " - " + ners);
    }
    String nerType = ners.isEmpty() ? null : ners.get(0).getType().getShortName();
    if (!Objects.equals(info.nerType, nerType)) {
      throw new LAPVerificationException(
          "Bad token NER value for " + label
              + ", expected \"" + info.nerType + "\", got \"" + nerType + "\"");
    }

    // Compare the full expected dependency set against the observed governors of this token.
    Set<TestDependencyInfo> expectedDeps = new HashSet<>(Arrays.asList(info.dependencies));
    if (!expectedDeps.equals(governors.get(token))) {
      throw new LAPVerificationException(
          "Bad token dependencies for " + label
              + ", expected " + expectedDeps + ", got " + governors.get(token));
    }

    System.out.println("Verified token: " + info);
  }
// Ejemplo n.º 19 (0)
  /**
   * Removes annotations of the configured {@code typeToRemove} whose (associated) POS category was
   * not selected for keeping. When whole tokens are removed, their attached stem, lemma and POS
   * annotations are removed as well; when only a feature annotation (stem/lemma/POS) is removed,
   * the corresponding reference held by the owning token is cleared.
   *
   * @param jcas the CAS to process
   * @throws AnalysisEngineProcessException if the type for {@code typeToRemove} cannot be resolved
   *     or the feature path cannot be evaluated
   */
  @Override
  public void process(JCas jcas) throws AnalysisEngineProcessException {
    getContext().getLogger().log(Level.CONFIG, "Entering " + this.getClass().getSimpleName());

    Type tokenType = jcas.getCas().getTypeSystem().getType(Token.class.getCanonicalName());
    Type stemType = jcas.getCas().getTypeSystem().getType(Stem.class.getCanonicalName());
    Type lemmaType = jcas.getCas().getTypeSystem().getType(Lemma.class.getCanonicalName());
    Type posType = jcas.getCas().getTypeSystem().getType(POS.class.getCanonicalName());
    Type typeToRemoveType = jcas.getCas().getTypeSystem().getType(typeToRemove);

    if (typeToRemoveType == null) {
      throw new AnalysisEngineProcessException(
          new Throwable("Could not get type for feature path: " + typeToRemove));
    }

    // Collect candidates first: removing feature structures while iterating the index is unsafe.
    List<AnnotationFS> toRemove = new ArrayList<AnnotationFS>();
    try {
      for (Entry<AnnotationFS, String> entry :
          FeaturePathFactory.select(jcas.getCas(), typeToRemove)) {
        AnnotationFS annotation = entry.getKey();

        // Resolve the POS annotation that determines the category of this annotation. If the
        // target type is POS itself, the annotation is its own POS.
        AnnotationFS pos;
        if (typeToRemoveType.equals(posType)) {
          pos = annotation;
        } else {
          pos = getAnnotation(posType, annotation);
          if (pos == null) {
            continue; // no POS available -> keep the annotation
          }
        }

        if (shouldRemove(pos.getType().getShortName())) {
          toRemove.add(annotation);
        }
      }
    } catch (FeaturePathException e) {
      throw new AnalysisEngineProcessException(e);
    }

    for (AnnotationFS fs : toRemove) {
      // If we want to remove tokens, we also remove accompanying lemma, stem, POS tag.
      if (fs.getType().equals(tokenType)) {
        removeCoveredIfPresent(jcas, stemType, fs);
        removeCoveredIfPresent(jcas, lemmaType, fs);
        removeCoveredIfPresent(jcas, posType, fs);
      }
      // We don't want to keep the feature in the token, remove it here.
      else {
        if (fs.getType().equals(stemType) || fs.getType().equals(lemmaType)) {
          Token token = (Token) getAnnotation(tokenType, fs);
          if (token != null) {
            // The token feature is named after the annotation type, e.g. "stem", "lemma".
            String fbn = fs.getType().getShortName().toLowerCase();
            Feature f = tokenType.getFeatureByBaseName(fbn);
            token.setFeatureValue(f, null);
          }
        } else if (fs instanceof POS) {
          Token token = (Token) getAnnotation(tokenType, fs);
          if (token != null) {
            token.setPos(null);
          }
        }
      }

      jcas.getCas().removeFsFromIndexes(fs);
    }
  }

  /**
   * Decides whether an annotation with the given POS short name is configured for removal: a
   * category is removed when its corresponding keep-flag is false. Unknown categories are kept.
   */
  private boolean shouldRemove(String posString) {
    switch (posString) {
      case "ADJ":
        return !adj;
      case "ADV":
        return !adv;
      case "ART":
        return !art;
      case "CARD":
        return !card;
      case "CONJ":
        return !conj;
      case "N":
      case "NN":
      case "NP":
        return !n;
      case "O":
        return !o;
      case "PP":
        return !pp;
      case "PR":
        return !pr;
      case "PUNC":
        return !punc;
      case "V":
        return !v;
      default:
        return false;
    }
  }

  /** Removes the annotation of {@code type} covering {@code anchor} from the indexes, if any. */
  private void removeCoveredIfPresent(JCas jcas, Type type, AnnotationFS anchor) {
    AnnotationFS covered = getAnnotation(type, anchor);
    if (covered != null) {
      jcas.getCas().removeFsFromIndexes(covered);
    }
  }
// Ejemplo n.º 20 (0)
 /**
  * Writes every token of the CAS to stdout, one per line, as the covered text followed by a tab
  * and the token's POS value.
  *
  * @param aJCas the CAS whose tokens are printed
  * @throws AnalysisEngineProcessException declared by the interface; not thrown here
  */
 @Override
 public void process(JCas aJCas) throws AnalysisEngineProcessException {
   for (Token tok : select(aJCas, Token.class)) {
     String surface = tok.getCoveredText();
     String tag = tok.getPos().getPosValue();
     System.out.printf("%s\t%s%n", surface, tag);
   }
 }