Java Lemma Examples, de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma Java Examples

Example #1

0

Show file

File: SegmenterCompatibilityTest.java Project: caffeinator13/dkpro-core

 private void checkLemma(String[] expected, Collection<Lemma> actual) {
   int i = 0;
   for (Lemma lemmaAnnotation : actual) {
     assertEquals("In position " + i, expected[i], lemmaAnnotation.getValue());
     i++;
   }
 }

Example #2

0

Show file

File: MalletTopicModelEstimator.java Project: mjunsilo/dkpro-core

  private static Collection<String> getTokensFromAnnotation(
      AnnotationFS annotation, boolean useLemma, int minTokenLength) {
    Collection<String> tokens;
    if (useLemma) {
      tokens = new ArrayList<>();

      /* concatenate multiple lemmas: */
      // selectCovered(Lemma.class, annotation).stream()
      // .map(lemma -> lemma.getValue())
      // .filter(lemma -> lemma.length() >= minTokenLength)
      // .reduce((l1, l2) -> l1 + "_" + l2)
      // .ifPresent(token -> tokens.add(token));

      for (Lemma lemma : selectCovered(Lemma.class, annotation)) {
        String text = lemma.getValue();
        if (text.length() >= minTokenLength) {
          tokens.add(text);
        }
      }
    } else {
      tokens = new ArrayList<>(1);
      String text = annotation.getCoveredText();
      if (text.length() >= minTokenLength) {
        tokens.add(text);
      }
    }
    return tokens;
  }

Example #3

0

Show file

File: Conll2009Reader.java Project: alainloisel/dkpro-core

  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
      try {
        posMappingProvider.configure(aJCas.getCas());
      } catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
      }
    }

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      // Tokens, Lemma, POS
      Map<Integer, Token> tokens = new HashMap<Integer, Token>();
      List<SemanticPredicate> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokens.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        // Read morphological features
        if (!UNUSED.equals(word[FEAT]) && readMorph) {
          MorphologicalFeatures morphtag =
              new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
          morphtag.setValue(word[FEAT]);
          morphtag.addToIndexes();
        }

        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemanticPredicate pred = new SemanticPredicate(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        sentenceEnd = token.getEnd();
      }

      // Dependencies
      if (readDependency) {
        for (String[] word : words) {
          if (!UNUSED.equals(word[DEPREL])) {
            int depId = Integer.valueOf(word[ID]);
            int govId = Integer.valueOf(word[HEAD]);

            // Model the root as a loop onto itself
            if (govId == 0) {
              Dependency rel = new ROOT(aJCas);
              rel.setGovernor(tokens.get(depId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            } else {
              Dependency rel = new Dependency(aJCas);
              rel.setGovernor(tokens.get(govId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            }
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time
        for (int p = 0; p < preds.size(); p++) {
          List<SemanticArgument> args = new ArrayList<SemanticArgument>();
          for (String[] word : words) {
            if (!UNUSED.equals(word[APRED + p])) {
              Token token = tokens.get(Integer.valueOf(word[ID]));
              SemanticArgument arg = new SemanticArgument(aJCas, token.getBegin(), token.getEnd());
              arg.setRole(word[APRED + p]);
              arg.addToIndexes();
              args.add(arg);
            }
          }
          SemanticPredicate pred = preds.get(p);
          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      // Once sentence per line.
      doc.add("\n");
    }

    doc.close();
  }

Example #4

0

Show file

File: Conll2012Reader.java Project: laugehoyer/dkpro-core

  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    try {
      if (readPos) {
        posMappingProvider.configure(aJCas.getCas());
      }

      if (readConstituent) {
        constituentMappingProvider.configure(aJCas.getCas());
      }
    } catch (AnalysisEngineProcessException e) {
      throw new IOException(e);
    }

    Map<String, CoreferenceLink> chains = new HashMap<>();

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      StringBuilder parse = new StringBuilder();

      // Tokens, Lemma, POS
      Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
      List<SemPred> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokenById.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        if (!UNUSED.equals(word[PARSE]) && readConstituent) {
          String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")");
          parse.append(fixed);
        }

        if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
          WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
          wordSense.setValue(word[WORD_SENSE]);
          wordSense.addToIndexes();
        }

        if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
          String[] chainFragments = word[word.length - 1].split("\\|");
          for (String chainFragment : chainFragments) {
            boolean beginning = chainFragment.startsWith("(");
            boolean ending = chainFragment.endsWith(")");

            String chainId =
                chainFragment.substring(
                    beginning ? 1 : 0,
                    ending ? chainFragment.length() - 1 : chainFragment.length());

            CoreferenceLink link = chains.get(chainId);
            if (beginning) {
              if (link == null) {
                link = new CoreferenceLink(aJCas);
                CoreferenceChain chain = new CoreferenceChain(aJCas);
                chain.setFirst(link);
                chain.addToIndexes();
              } else {
                CoreferenceLink newLink = new CoreferenceLink(aJCas);
                link.setNext(newLink);
                link = newLink;
              }
              link.setReferenceType(chainId);
              link.setBegin(token.getBegin());
            }

            if (ending) {
              link.setEnd(token.getEnd());
              link.addToIndexes();
            }

            chains.put(chainId, link);
          }
        }

        sentenceEnd = token.getEnd();
      }

      // Named entities
      if (readNamedEntity) {
        int currentNeBegin = -1;
        String currentNeType = null;
        for (int i = 0; i < words.size(); i++) {
          String ne = words.get(i)[NAMED_ENTITIES];
          boolean beginning = ne.startsWith("(");
          boolean ending = ne.endsWith(")");

          // When a NE is beginning, we remember what the NE is and where it began
          if (beginning) {
            // The NE is beginning with "(" and either ending with "(" or "*", so we trim
            // the first and last character
            currentNeType = ne.substring(1, ne.length() - 1);
            currentNeBegin = i;
          }

          // We need to create an annotation if the current token is the end of an annotation
          if (ending) {
            // Determine begin and end of named entity
            int begin = tokenById.get(currentNeBegin).getBegin();
            int end = tokenById.get(i).getEnd();

            // Add named entity
            NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
            namedEntity.setValue(currentNeType);
            namedEntity.addToIndexes();

            // Forget remembered named entity
            currentNeBegin = -1;
            currentNeType = null;
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time
        for (int p = 0; p < preds.size(); p++) {
          SemPred pred = preds.get(p);
          List<SemArgLink> args = new ArrayList<>();

          int currentArgBegin = -1;
          String currentArgType = null;
          for (int i = 0; i < words.size(); i++) {
            String ne = words.get(i)[APRED + p];
            boolean beginning = ne.startsWith("(");
            boolean ending = ne.endsWith(")");

            // When a arg is beginning, we remember what the NE is and where it began
            if (beginning) {
              // The arg is beginning with "(" and either ending with "(" or "*", so
              // we trim the first and last character
              currentArgType = ne.substring(1, ne.length() - 1);
              currentArgBegin = i;
            }

            // We need to create an annotation if the current token is the end of an
            // annotation
            if (ending) {
              // Determine begin and end of argument
              int begin = tokenById.get(currentArgBegin).getBegin();
              int end = tokenById.get(i).getEnd();

              // Add named entity unless it is a (V*) which has the same offsets as
              // the predicate
              if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                SemArg arg = new SemArg(aJCas, begin, end);
                arg.addToIndexes();

                SemArgLink link = new SemArgLink(aJCas);
                link.setRole(currentArgType);
                link.setTarget(arg);
                args.add(link);
              }

              // Forget remembered arg
              currentArgBegin = -1;
              currentArgType = null;
            }
          }

          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));

      // Once sentence per line.
      doc.add("\n");
    }

    doc.close();
  }