Example #1
0
  public static void main(String[] args) throws Exception {
    JCas jCas = JCasFactory.createJCas();
    jCas.setDocumentLanguage("de");
    jCas.setDocumentText(
        "Die Fossillagerstätte Geiseltal befindet sich im ehemaligen Braunkohlerevier des Geiseltales südlich der Stadt Halle in Sachsen-Anhalt. Sie ist eine bedeutende Fundstelle heute ausgestorbener Pflanzen und Tiere aus der Zeit des Mittleren Eozäns vor 48 bis 41 Millionen Jahren. Im Geiseltal wurde nachweislich seit 1698 erstmals Kohle gefördert, die ersten Fossilien kamen aber erst Anfang des 20. Jahrhunderts eher zufällig zu Tage. Planmäßige wissenschaftliche Ausgrabungen begannen 1925 seitens der Martin-Luther-Universität Halle-Wittenberg. Unterbrochen durch den Zweiten Weltkrieg, können die Untersuchungen in zwei Forschungsphasen untergliedert werden. Aufgrund der zunehmenden Auskohlung der Rohstofflager kamen die Ausgrabungen Mitte der 1980er allmählich zum Erliegen und endeten endgültig zu Beginn des dritten Jahrtausends.");

    SimplePipeline.runPipeline(
        jCas,
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
        AnalysisEngineFactory.createEngineDescription(StanfordNamedEntityRecognizer.class),
        AnalysisEngineFactory.createEngineDescription(CasDumpWriter.class));

    for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) {
      System.out.println("Found NE: " + ne.getValue() + ", " + ne.getCoveredText());
    }
  }
Example #2
0
  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Convert UIMA to LIF Container
    Container container = new Container();
    container.setLanguage(aJCas.getDocumentLanguage());
    container.setText(aJCas.getDocumentText());

    View view = container.newView();

    // Paragraph
    for (Paragraph p : select(aJCas, Paragraph.class)) {
      view.newAnnotation(id(PARAGRAPH, p), Discriminators.Uri.PARAGRAPH, p.getBegin(), p.getEnd());
    }

    // Sentence
    for (Sentence s : select(aJCas, Sentence.class)) {
      view.newAnnotation(id(SENTENCE, s), Discriminators.Uri.SENTENCE, s.getBegin(), s.getEnd());
    }

    // Token, POS, Lemma
    for (Token t : select(aJCas, Token.class)) {
      Annotation a =
          view.newAnnotation(id(TOKEN, t), Discriminators.Uri.TOKEN, t.getBegin(), t.getEnd());
      if (t.getPos() != null) {
        a.addFeature(Features.Token.POS, t.getPos().getPosValue());
      }

      if (t.getLemma() != null) {
        a.addFeature(Features.Token.LEMMA, t.getLemma().getValue());
      }
    }

    // NamedEntity
    for (NamedEntity neAnno : select(aJCas, NamedEntity.class)) {
      Annotation ne =
          view.newAnnotation(
              id(NAMED_ENTITY, neAnno), Discriminators.Uri.NE, neAnno.getBegin(), neAnno.getEnd());
      ne.setLabel(neAnno.getValue());
    }

    // Dependency
    for (Sentence s : select(aJCas, Sentence.class)) {
      Set<String> depRelIds = new TreeSet<>();

      for (Dependency dep : selectCovered(Dependency.class, s)) {
        String depRelId = id(DEPENDENCY, dep);
        // LAPPS dependencies inherit from Relation which has no offsets
        Annotation depRel = view.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY);
        depRel.setLabel(dep.getDependencyType());
        depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor()));
        depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent()));
        depRelIds.add(depRelId);
      }

      if (!depRelIds.isEmpty()) {
        Annotation depStruct =
            view.newAnnotation(
                id(DEPENDENCY_STRUCTURE, s),
                Discriminators.Uri.DEPENDENCY_STRUCTURE,
                s.getBegin(),
                s.getEnd());
        depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds);
      }
    }

    // Constituents
    for (ROOT r : select(aJCas, ROOT.class)) {
      Set<String> constituents = new LinkedHashSet<>();
      convertConstituent(view, r, constituents);

      Annotation phraseStruct =
          view.newAnnotation(
              id(PHRASE_STRUCTURE, r),
              Discriminators.Uri.PHRASE_STRUCTURE,
              r.getBegin(),
              r.getEnd());
      phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents);
    }

    try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) {
      String json = Serializer.toPrettyJson(container);
      IOUtils.write(json, docOS, encoding);
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    try {
      if (readPos) {
        posMappingProvider.configure(aJCas.getCas());
      }

      if (readConstituent) {
        constituentMappingProvider.configure(aJCas.getCas());
      }
    } catch (AnalysisEngineProcessException e) {
      throw new IOException(e);
    }

    Map<String, CoreferenceLink> chains = new HashMap<>();

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      StringBuilder parse = new StringBuilder();

      // Tokens, Lemma, POS
      Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
      List<SemPred> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokenById.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        if (!UNUSED.equals(word[PARSE]) && readConstituent) {
          String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")");
          parse.append(fixed);
        }

        if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
          WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
          wordSense.setValue(word[WORD_SENSE]);
          wordSense.addToIndexes();
        }

        if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
          String[] chainFragments = word[word.length - 1].split("\\|");
          for (String chainFragment : chainFragments) {
            boolean beginning = chainFragment.startsWith("(");
            boolean ending = chainFragment.endsWith(")");

            String chainId =
                chainFragment.substring(
                    beginning ? 1 : 0,
                    ending ? chainFragment.length() - 1 : chainFragment.length());

            CoreferenceLink link = chains.get(chainId);
            if (beginning) {
              if (link == null) {
                link = new CoreferenceLink(aJCas);
                CoreferenceChain chain = new CoreferenceChain(aJCas);
                chain.setFirst(link);
                chain.addToIndexes();
              } else {
                CoreferenceLink newLink = new CoreferenceLink(aJCas);
                link.setNext(newLink);
                link = newLink;
              }
              link.setReferenceType(chainId);
              link.setBegin(token.getBegin());
            }

            if (ending) {
              link.setEnd(token.getEnd());
              link.addToIndexes();
            }

            chains.put(chainId, link);
          }
        }

        sentenceEnd = token.getEnd();
      }

      // Named entities
      if (readNamedEntity) {
        int currentNeBegin = -1;
        String currentNeType = null;
        for (int i = 0; i < words.size(); i++) {
          String ne = words.get(i)[NAMED_ENTITIES];
          boolean beginning = ne.startsWith("(");
          boolean ending = ne.endsWith(")");

          // When a NE is beginning, we remember what the NE is and where it began
          if (beginning) {
            // The NE is beginning with "(" and either ending with "(" or "*", so we trim
            // the first and last character
            currentNeType = ne.substring(1, ne.length() - 1);
            currentNeBegin = i;
          }

          // We need to create an annotation if the current token is the end of an annotation
          if (ending) {
            // Determine begin and end of named entity
            int begin = tokenById.get(currentNeBegin).getBegin();
            int end = tokenById.get(i).getEnd();

            // Add named entity
            NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
            namedEntity.setValue(currentNeType);
            namedEntity.addToIndexes();

            // Forget remembered named entity
            currentNeBegin = -1;
            currentNeType = null;
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time
        for (int p = 0; p < preds.size(); p++) {
          SemPred pred = preds.get(p);
          List<SemArgLink> args = new ArrayList<>();

          int currentArgBegin = -1;
          String currentArgType = null;
          for (int i = 0; i < words.size(); i++) {
            String ne = words.get(i)[APRED + p];
            boolean beginning = ne.startsWith("(");
            boolean ending = ne.endsWith(")");

            // When a arg is beginning, we remember what the NE is and where it began
            if (beginning) {
              // The arg is beginning with "(" and either ending with "(" or "*", so
              // we trim the first and last character
              currentArgType = ne.substring(1, ne.length() - 1);
              currentArgBegin = i;
            }

            // We need to create an annotation if the current token is the end of an
            // annotation
            if (ending) {
              // Determine begin and end of argument
              int begin = tokenById.get(currentArgBegin).getBegin();
              int end = tokenById.get(i).getEnd();

              // Add named entity unless it is a (V*) which has the same offsets as
              // the predicate
              if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                SemArg arg = new SemArg(aJCas, begin, end);
                arg.addToIndexes();

                SemArgLink link = new SemArgLink(aJCas);
                link.setRole(currentArgType);
                link.setTarget(arg);
                args.add(link);
              }

              // Forget remembered arg
              currentArgBegin = -1;
              currentArgType = null;
            }
          }

          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));

      // Once sentence per line.
      doc.add("\n");
    }

    doc.close();
  }