/**
  * Extracts the grammatical number (sg/pl) from the Token. To be applied only on nouns /
  * pronouns.
  *
  * <p>Noun tags are decided from the POS tag alone: NNS / NNPS are plural, NN / NNP are singular.
  * For PRP, PRP$ and CD tokens the lower-cased lemma is compared against fixed lists of singular
  * and plural forms; lemmas in neither list (e.g. "you") yield "unknown".
  *
  * @param token the token to inspect; its POS must be set, and for PRP / PRP$ / CD tokens its
  *     lemma as well
  * @return "sg", "pl", or "unknown" if the number cannot be determined or the token is not a noun
  *     / pronoun (in which case a warning is also printed to stderr)
  */
 public String getNumber(Token token) {
   String pos = token.getPos().getPosValue();
   if (!isNounOrPronoun(token)) {
     System.err.println(
         "Use method only for nouns / pronouns. " + pos + " " + token.getCoveredText());
     return "unknown"; // occurs e.g. for 'there' (existential)
   }
   if (pos.matches("NNP?S")) {
     return "pl";
   }
   if (pos.matches("NNP?")) {
     return "sg";
   }
   if (pos.matches("PRP\\$?|CD")) {
     // The lemma is lower-cased here, so every alternative in the lists below must be lower
     // case too (an upper-case "I" could never match).
     String lemma = token.getLemma().getValue().toLowerCase();
     if (lemma.matches(
         "i|me|myself|he|him|himself|she|her|herself|it|itself|one|oneself|mine|thine|his|hers")) {
       return "sg";
     }
     if (lemma.matches(
         "we|us|ourselves|ourself|yourselves|they|them|themselves|theirselves|theirs|ours")) {
       return "pl";
     }
   }
   return "unknown";
 }
  /**
   * Looks up countability information for a noun according to the Celex database of English
   * nouns.
   *
   * @param token the token to look up; only tokens whose POS tag starts with "N" are looked up
   * @return "NO-NOUN" if the POS tag does not start with "N", otherwise the value stored in the
   *     countability map for the lower-cased lemma, or "none" if the map has no such key
   * @throws IllegalStateException if USE_CELEX is false
   */
  public String getCountability(Token token) {
    if (!USE_CELEX) {
      System.err.println(
          "This should never happen, don't call this function if you did not configure to use Celex!");
      throw new IllegalStateException();
    }

    boolean isNoun = token.getPos().getPosValue().startsWith("N");
    if (!isNoun) {
      return "NO-NOUN";
    }

    // The map is keyed by lower-cased lemma
    String key = token.getLemma().getValue().toLowerCase();
    return countability.containsKey(key) ? countability.get(key) : "none";
  }
 /**
  * Checks whether the token's POS tag marks it as a noun, a pronoun, or one of the other
  * categories that are treated like nouns / pronouns here.
  *
  * <p>Accepted are: every tag starting with "N"; the tags PRP, PRP$, CD, JJ, JJS, DT, WHNP and
  * WP; and WDT when the lemma is exactly "who", "which" or "that".
  *
  * @param token the token to test; its POS must be set (the lemma is only read for WDT tokens)
  * @return true if the token is accepted as a noun or pronoun, false otherwise
  */
 public static boolean isNounOrPronoun(Token token) {
   String pos = token.getPos().getPosValue();
   if (pos.startsWith("N")) {
     return true;
   }
   // JJ: allows things like "British" / "Australian" which are marked as
   // NEs in ACE.
   // Note the escaped '$': unescaped it would be an end-of-input anchor, not the literal
   // character of the PRP$ tag.
   if (pos.matches("PRP\\$?|CD|JJS?|DT|WHNP|WP")) {
     return true;
   }
   // WDT is accepted only for the lemmas who / which / that
   return pos.equals("WDT") && token.getLemma().getValue().matches("who|which|that");
 }
 /**
  * Determines the grammatical person of a token. To be applied only on nouns / pronouns.
  *
  * @param token the token to inspect
  * @return "1" or "2" if the lower-cased lemma is one of the listed first or second person
  *     pronoun forms, "3" otherwise (also for existential 'there', POS tag EX)
  * @throws IllegalArgumentException if the token is neither a noun / pronoun nor tagged EX
  */
 public String getPerson(Token token) {
   if (!isNounOrPronoun(token)) {
     boolean existentialThere = token.getPos().getPosValue().equals("EX");
     if (!existentialThere) {
       System.err.println("Use getPerson method only for nouns / pronouns.");
       throw new IllegalArgumentException();
     }
     return "3"; // existential 'there'
   }

   String lemma = token.getLemma().getValue().toLowerCase();
   if (lemma.matches("i|we|me|us|myself|ourselves|ourself")) {
     return "1";
   }
   if (lemma.matches("you|ye|thou|thee|yourself|thyself|yourselves|yourself")) {
     return "2";
   }
   // Everything else, including all full nouns, is third person
   return "3";
 }
 /**
  * Extracts the noun type from the POS tag.
  *
  * @param token the token to inspect; its POS must be set
  * @return "proper" for NNP / NNPS, "common" for NN / NNS, "pronoun" if isPronoun accepts the POS
  *     tag and lemma, and "unknown" otherwise (also for existential 'there', POS tag EX)
  * @throws IllegalArgumentException if the token is neither a noun / pronoun nor tagged EX
  */
 public static String getNounType(Token token) {
   String pos = token.getPos().getPosValue();
   if (!isNounOrPronoun(token)) {
     if (pos.equals("EX")) {
       return "unknown"; // existential 'there'
     }
     // Name this method (not getPerson) and the offending token so the failure can be traced
     String message =
         "Use getNounType method only for nouns / pronouns. " + pos + " " + token.getCoveredText();
     System.err.println(message);
     throw new IllegalArgumentException(message);
   }
   if (pos.matches("NNPS?")) {
     return "proper";
   }
   if (pos.matches("NNS?")) {
     return "common";
   }
   if (isPronoun(pos, token.getLemma().getValue())) {
     return "pronoun";
   }
   return "unknown";
 }
// Example no. 6
// 0
  /**
   * Converts the annotations of the given CAS into a LIF container and writes the container as
   * pretty-printed JSON to the output stream obtained for this document.
   *
   * <p>Annotations are added to a single view in this order: paragraphs, sentences, tokens (with
   * POS and lemma features when present), named entities, dependencies (plus one dependency
   * structure per sentence that has dependencies), and phrase structures for each ROOT.
   *
   * @param aJCas the CAS to convert
   * @throws AnalysisEngineProcessException if serializing or writing the container fails
   */
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Convert UIMA to LIF Container
    Container container = new Container();
    container.setLanguage(aJCas.getDocumentLanguage());
    container.setText(aJCas.getDocumentText());

    View view = container.newView();

    // Paragraph
    for (Paragraph paragraph : select(aJCas, Paragraph.class)) {
      view.newAnnotation(
          id(PARAGRAPH, paragraph),
          Discriminators.Uri.PARAGRAPH,
          paragraph.getBegin(),
          paragraph.getEnd());
    }

    // Sentence
    for (Sentence sentence : select(aJCas, Sentence.class)) {
      view.newAnnotation(
          id(SENTENCE, sentence),
          Discriminators.Uri.SENTENCE,
          sentence.getBegin(),
          sentence.getEnd());
    }

    // Token, POS, Lemma
    for (Token token : select(aJCas, Token.class)) {
      Annotation tokenAnno =
          view.newAnnotation(
              id(TOKEN, token), Discriminators.Uri.TOKEN, token.getBegin(), token.getEnd());

      // POS and lemma are optional on the token; only export what is there
      if (token.getPos() != null) {
        tokenAnno.addFeature(Features.Token.POS, token.getPos().getPosValue());
      }
      if (token.getLemma() != null) {
        tokenAnno.addFeature(Features.Token.LEMMA, token.getLemma().getValue());
      }
    }

    // NamedEntity
    for (NamedEntity entity : select(aJCas, NamedEntity.class)) {
      Annotation entityAnno =
          view.newAnnotation(
              id(NAMED_ENTITY, entity), Discriminators.Uri.NE, entity.getBegin(), entity.getEnd());
      entityAnno.setLabel(entity.getValue());
    }

    // Dependency
    for (Sentence sentence : select(aJCas, Sentence.class)) {
      Set<String> dependencyIds = new TreeSet<>();

      for (Dependency dependency : selectCovered(Dependency.class, sentence)) {
        String dependencyId = id(DEPENDENCY, dependency);
        // LAPPS dependencies inherit from Relation which has no offsets
        Annotation dependencyAnno = view.newAnnotation(dependencyId, Discriminators.Uri.DEPENDENCY);
        dependencyAnno.setLabel(dependency.getDependencyType());
        dependencyAnno.addFeature(
            Features.Dependency.GOVERNOR, id(TOKEN, dependency.getGovernor()));
        dependencyAnno.addFeature(
            Features.Dependency.DEPENDENT, id(TOKEN, dependency.getDependent()));
        dependencyIds.add(dependencyId);
      }

      // No dependency structure is emitted for sentences without dependencies
      if (dependencyIds.isEmpty()) {
        continue;
      }

      Annotation structureAnno =
          view.newAnnotation(
              id(DEPENDENCY_STRUCTURE, sentence),
              Discriminators.Uri.DEPENDENCY_STRUCTURE,
              sentence.getBegin(),
              sentence.getEnd());
      structureAnno.addFeature(Features.DependencyStructure.DEPENDENCIES, dependencyIds);
    }

    // Constituents
    for (ROOT root : select(aJCas, ROOT.class)) {
      // Filled by convertConstituent; insertion order is kept
      Set<String> constituentIds = new LinkedHashSet<>();
      convertConstituent(view, root, constituentIds);

      Annotation phraseStructureAnno =
          view.newAnnotation(
              id(PHRASE_STRUCTURE, root),
              Discriminators.Uri.PHRASE_STRUCTURE,
              root.getBegin(),
              root.getEnd());
      phraseStructureAnno.addFeature(Features.PhraseStructure.CONSTITUENTS, constituentIds);
    }

    // Serialize and write; the stream is closed by try-with-resources
    try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) {
      IOUtils.write(Serializer.toPrettyJson(container), docOS, encoding);
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
  /**
   * Verifies a token against the expected values in {@code info}.
   *
   * <p>Checked in this order, stopping at the first mismatch: covered text, begin offset, end
   * offset, lemma value, POS type short name, POS value, the type of the single covered named
   * entity (or none), and the set of dependencies stored for the token in {@code governors}. On
   * success a confirmation line is printed to stdout.
   *
   * @param token the token to verify
   * @param info the expected values for the token
   * @throws LAPVerificationException if any checked value differs from the expectation, or if
   *     more than one named entity annotation is found for the token
   */
  private void verifyToken(Token token, TestTokenInfo info) throws LAPVerificationException {
    // "<id>:<text>" label that identifies the token in every failure message
    String label = info.id + ":" + info.text;

    String actualText = token.getCoveredText();
    if (!info.text.equals(actualText)) {
      throw new LAPVerificationException(
          "Bad token text for "
              + label
              + ", expected \""
              + info.text
              + "\", got \""
              + actualText
              + "\"");
    }

    if (info.begin != token.getBegin()) {
      throw new LAPVerificationException(
          "Bad token begin index for "
              + label
              + ", expected "
              + info.begin
              + ", got "
              + token.getBegin());
    }

    if (info.end != token.getEnd()) {
      throw new LAPVerificationException(
          "Bad token end index for "
              + label
              + ", expected "
              + info.end
              + ", got "
              + token.getEnd());
    }

    String actualLemma = token.getLemma().getValue();
    if (!info.lemma.equals(actualLemma)) {
      throw new LAPVerificationException(
          "Bad token lemma for "
              + label
              + ", expected \""
              + info.lemma
              + "\", got \""
              + actualLemma
              + "\"");
    }

    String actualPosType = token.getPos().getType().getShortName();
    if (!info.posType.equals(actualPosType)) {
      throw new LAPVerificationException(
          "Bad token POS type for "
              + label
              + ", expected "
              + info.posType
              + ", got "
              + actualPosType);
    }

    String actualPosValue = token.getPos().getPosValue();
    if (!info.posValue.equals(actualPosValue)) {
      throw new LAPVerificationException(
          "Bad token POS value for "
              + label
              + ", expected \""
              + info.posValue
              + "\", got \""
              + actualPosValue
              + "\"");
    }

    // At most one named entity may be found for a token; none means a null NER type
    List<NamedEntity> ners = JCasUtil.selectCovered(NamedEntity.class, token);
    if (ners.size() > 1) {
      throw new LAPVerificationException(
          "Got more than one NER annotation for " + label + " - " + ners);
    }
    String nerType = ners.size() == 1 ? ners.get(0).getType().getShortName() : null;
    if (!Objects.equals(info.nerType, nerType)) {
      throw new LAPVerificationException(
          "Bad token NER value for "
              + label
              + ", expected \""
              + info.nerType
              + "\", got \""
              + nerType
              + "\"");
    }

    // Dependencies are compared as sets, so their order in info.dependencies is irrelevant
    Set<TestDependencyInfo> infoDependencies =
        new HashSet<TestDependencyInfo>(Arrays.asList(info.dependencies));
    if (!infoDependencies.equals(governors.get(token))) {
      throw new LAPVerificationException(
          "Bad token dependencies for "
              + label
              + ", expected "
              + infoDependencies
              + ", got "
              + governors.get(token));
    }

    System.out.println("Verified token: " + info);
  }