Example #1
  private void convert(JCas aJCas, PrintWriter aOut) {
    Type chunkType = JCasUtil.getType(aJCas, Chunk.class);
    Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue");

    for (Sentence sentence : select(aJCas, Sentence.class)) {
      Map<Token, Row> ctokens = new LinkedHashMap<Token, Row>();

      // Tokens
      List<Token> tokens = selectCovered(Token.class, sentence);

      // Chunks
      IobEncoder encoder = new IobEncoder(aJCas.getCas(), chunkType, chunkValue);

      for (int i = 0; i < tokens.size(); i++) {
        Row row = new Row();
        row.id = i + 1;
        row.token = tokens.get(i);
        row.chunk = encoder.encode(tokens.get(i));
        ctokens.put(row.token, row);
      }

      // Write sentence in CoNLL 2000 format: token, POS tag, chunk tag
      for (Row row : ctokens.values()) {
        String pos = UNUSED;
        if (writePos && (row.token.getPos() != null)) {
          POS posAnno = row.token.getPos();
          pos = posAnno.getPosValue();
        }

        String chunk = UNUSED;
        if (writeChunk && (row.chunk != null)) {
          chunk = row.chunk; // already encoded above, no need to encode twice
        }

        aOut.printf("%s %s %s\n", row.token.getCoveredText(), pos, chunk);
      }

      aOut.println();
    }
  }
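
This writer relies on a small per-token Row record and a few configuration fields (UNUSED, writePos, writeChunk) that are not shown above. A minimal sketch of the assumed context follows; the names come from the code, but the exact types and defaults are assumptions:

  // Assumed surrounding context for Example #1 (a sketch, not the original class):
  private static final String UNUSED = "_"; // placeholder for absent values (assumed)
  private boolean writePos = true;          // emit the POS column? (assumed default)
  private boolean writeChunk = true;        // emit the chunk column? (assumed default)

  // One output row per token.
  private static final class Row {
    int id;       // 1-based position of the token within the sentence
    Token token;  // the Token annotation itself
    String chunk; // IOB-encoded chunk label, or null if outside any chunk
  }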
Example #2
  private void convert(JCas aJCas, PrintWriter aOut) {
    Map<Token, Collection<SemanticPredicate>> predIdx =
        indexCovered(aJCas, Token.class, SemanticPredicate.class);
    Map<SemanticArgument, Collection<Token>> argIdx =
        indexCovered(aJCas, SemanticArgument.class, Token.class);
    for (Sentence sentence : select(aJCas, Sentence.class)) {
      Map<Token, Row> ctokens = new LinkedHashMap<Token, Row>();

      // Tokens
      List<Token> tokens = selectCovered(Token.class, sentence);

      // Check if we should try to include the FEATS in output
      List<MorphologicalFeatures> morphology = selectCovered(MorphologicalFeatures.class, sentence);
      boolean useFeats = tokens.size() == morphology.size();

      List<SemanticPredicate> preds = selectCovered(SemanticPredicate.class, sentence);

      for (int i = 0; i < tokens.size(); i++) {
        Row row = new Row();
        row.id = i + 1;
        row.token = tokens.get(i);
        row.args = new SemanticArgument[preds.size()];
        if (useFeats) {
          row.feats = morphology.get(i);
        }

        // If there are multiple semantic predicates for the current token, then
        // we keep only the first
        Collection<SemanticPredicate> predsForToken = predIdx.get(row.token);
        if (predsForToken != null && !predsForToken.isEmpty()) {
          row.pred = predsForToken.iterator().next();
        }
        ctokens.put(row.token, row);
      }

      // Dependencies
      for (Dependency rel : selectCovered(Dependency.class, sentence)) {
        ctokens.get(rel.getDependent()).deprel = rel;
      }

      // Semantic arguments
      for (int p = 0; p < preds.size(); p++) {
        FSArray args = preds.get(p).getArguments();
        for (SemanticArgument arg : select(args, SemanticArgument.class)) {
          for (Token t : argIdx.get(arg)) {
            Row row = ctokens.get(t);
            row.args[p] = arg;
          }
        }
      }

      // Write sentence in CoNLL 2009 format
      for (Row row : ctokens.values()) {
        int id = row.id;

        String form = row.token.getCoveredText();

        String lemma = UNUSED;
        if (writeLemma && (row.token.getLemma() != null)) {
          lemma = row.token.getLemma().getValue();
        }
        String plemma = lemma;

        String pos = UNUSED;
        if (writePos && (row.token.getPos() != null)) {
          POS posAnno = row.token.getPos();
          pos = posAnno.getPosValue();
        }
        String ppos = pos;

        String feat = UNUSED;
        if (writeMorph && (row.feats != null)) {
          feat = row.feats.getValue();
        }
        String pfeat = feat;

        int headId = UNUSED_INT;
        String deprel = UNUSED;
        if (writeDependency && (row.deprel != null)) {
          deprel = row.deprel.getDependencyType();
          headId = ctokens.get(row.deprel.getGovernor()).id;
          if (headId == row.id) {
            // ROOT dependencies may be modeled as a loop, ignore these.
            headId = 0;
          }
        }

        String head = UNUSED;
        if (headId != UNUSED_INT) {
          head = Integer.toString(headId);
        }

        String phead = head;
        String pdeprel = deprel;

        String fillpred = UNUSED;
        String pred = UNUSED;
        StringBuilder apreds = new StringBuilder();
        if (writeSemanticPredicate) {
          if (row.pred != null) {
            fillpred = "Y";
            pred = row.pred.getCategory();
          }

          for (SemanticArgument arg : row.args) {
            if (apreds.length() > 0) {
              apreds.append('\t');
            }
            apreds.append(arg != null ? arg.getRole() : UNUSED);
          }
        }

        aOut.printf(
            "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
            id, form, lemma, plemma, pos, ppos, feat, pfeat, head, phead, deprel, pdeprel, fillpred,
            pred, apreds);
      }

      aOut.println();
    }
  }
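
Example #2 uses a richer Row record plus two sentinel constants. A sketch of the shape implied by the usage above; the field types are inferred from the getters that are called and should be treated as assumptions, not the original source:

  // Assumed context for Example #2 (sketch; types inferred from usage):
  private static final String UNUSED = "_"; // placeholder column value (assumed)
  private static final int UNUSED_INT = Integer.MIN_VALUE; // "no head" sentinel (assumed)

  private static final class Row {
    int id;                      // 1-based token index (ID column)
    Token token;                 // token annotation (FORM, LEMMA, POS come from here)
    MorphologicalFeatures feats; // FEAT column source, may be null
    Dependency deprel;           // dependency whose dependent is this token, may be null
    SemanticPredicate pred;      // predicate anchored on this token, may be null
    SemanticArgument[] args;     // one slot per predicate in the sentence (APRED columns)
  }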
Example #3
  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
      try {
        posMappingProvider.configure(aJCas.getCas());
      } catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
      }
    }

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      // Tokens, Lemma, POS
      Map<Integer, Token> tokens = new HashMap<Integer, Token>();
      List<SemanticPredicate> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokens.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        // Read morphological features
        if (!UNUSED.equals(word[FEAT]) && readMorph) {
          MorphologicalFeatures morphtag =
              new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
          morphtag.setValue(word[FEAT]);
          morphtag.addToIndexes();
        }

        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemanticPredicate pred = new SemanticPredicate(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        sentenceEnd = token.getEnd();
      }

      // Dependencies
      if (readDependency) {
        for (String[] word : words) {
          if (!UNUSED.equals(word[DEPREL])) {
            int depId = Integer.parseInt(word[ID]);
            int govId = Integer.parseInt(word[HEAD]);

            // Model the root as a loop onto itself
            if (govId == 0) {
              Dependency rel = new ROOT(aJCas);
              rel.setGovernor(tokens.get(depId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            } else {
              Dependency rel = new Dependency(aJCas);
              rel.setGovernor(tokens.get(govId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            }
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time
        for (int p = 0; p < preds.size(); p++) {
          List<SemanticArgument> args = new ArrayList<SemanticArgument>();
          for (String[] word : words) {
            if (!UNUSED.equals(word[APRED + p])) {
              Token token = tokens.get(Integer.valueOf(word[ID]));
              SemanticArgument arg = new SemanticArgument(aJCas, token.getBegin(), token.getEnd());
              arg.setRole(word[APRED + p]);
              arg.addToIndexes();
              args.add(arg);
            }
          }
          SemanticPredicate pred = preds.get(p);
          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      // One sentence per line.
      doc.add("\n");
    }

    doc.close();
  }
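
The reader indexes into each line through column constants. The zero-based mapping below follows the standard CoNLL-2009 column order and is consistent with the 15 columns written in Example #2; it is a sketch of what the constants presumably look like, not the original definitions:

  // Assumed CoNLL-2009 column indices (zero-based), matching the writer in Example #2:
  private static final int ID = 0, FORM = 1, LEMMA = 2, PLEMMA = 3, POS = 4, PPOS = 5,
      FEAT = 6, PFEAT = 7, HEAD = 8, PHEAD = 9, DEPREL = 10, PDEPREL = 11,
      FILLPRED = 12, PRED = 13, APRED = 14; // APRED + p addresses the p-th argument column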
Example #4
  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    try {
      if (readPos) {
        posMappingProvider.configure(aJCas.getCas());
      }

      if (readConstituent) {
        constituentMappingProvider.configure(aJCas.getCas());
      }
    } catch (AnalysisEngineProcessException e) {
      throw new IOException(e);
    }

    Map<String, CoreferenceLink> chains = new HashMap<>();

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      StringBuilder parse = new StringBuilder();

      // Tokens, Lemma, POS
      Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
      List<SemPred> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokenById.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        if (!UNUSED.equals(word[PARSE]) && readConstituent) {
          String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")");
          parse.append(fixed);
        }

        if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
          WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
          wordSense.setValue(word[WORD_SENSE]);
          wordSense.addToIndexes();
        }

        if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
          String[] chainFragments = word[word.length - 1].split("\\|");
          for (String chainFragment : chainFragments) {
            boolean beginning = chainFragment.startsWith("(");
            boolean ending = chainFragment.endsWith(")");

            String chainId =
                chainFragment.substring(
                    beginning ? 1 : 0,
                    ending ? chainFragment.length() - 1 : chainFragment.length());

            CoreferenceLink link = chains.get(chainId);
            if (beginning) {
              if (link == null) {
                link = new CoreferenceLink(aJCas);
                CoreferenceChain chain = new CoreferenceChain(aJCas);
                chain.setFirst(link);
                chain.addToIndexes();
              } else {
                CoreferenceLink newLink = new CoreferenceLink(aJCas);
                link.setNext(newLink);
                link = newLink;
              }
              link.setReferenceType(chainId);
              link.setBegin(token.getBegin());
            }

            if (ending) {
              link.setEnd(token.getEnd());
              link.addToIndexes();
            }

            chains.put(chainId, link);
          }
        }

        sentenceEnd = token.getEnd();
      }

      // Named entities
      if (readNamedEntity) {
        int currentNeBegin = -1;
        String currentNeType = null;
        for (int i = 0; i < words.size(); i++) {
          String ne = words.get(i)[NAMED_ENTITIES];
          boolean beginning = ne.startsWith("(");
          boolean ending = ne.endsWith(")");

          // When an NE begins, we remember its type and where it began
          if (beginning) {
            // The NE starts with "(" and ends with either ")" or "*", so we trim
            // the first and last character
            currentNeType = ne.substring(1, ne.length() - 1);
            currentNeBegin = i;
          }

          // We need to create an annotation if the current token is the end of an annotation
          if (ending) {
            // Determine begin and end of named entity
            int begin = tokenById.get(currentNeBegin).getBegin();
            int end = tokenById.get(i).getEnd();

            // Add named entity
            NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
            namedEntity.setValue(currentNeType);
            namedEntity.addToIndexes();

            // Forget remembered named entity
            currentNeBegin = -1;
            currentNeType = null;
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time
        for (int p = 0; p < preds.size(); p++) {
          SemPred pred = preds.get(p);
          List<SemArgLink> args = new ArrayList<>();

          int currentArgBegin = -1;
          String currentArgType = null;
          for (int i = 0; i < words.size(); i++) {
            String argBit = words.get(i)[APRED + p];
            boolean beginning = argBit.startsWith("(");
            boolean ending = argBit.endsWith(")");

            // When an argument begins, we remember its type and where it began
            if (beginning) {
              // The argument starts with "(" and ends with either ")" or "*", so
              // we trim the first and last character
              currentArgType = argBit.substring(1, argBit.length() - 1);
              currentArgBegin = i;
            }

            // We need to create an annotation if the current token is the end of an
            // annotation
            if (ending) {
              // Determine begin and end of argument
              int begin = tokenById.get(currentArgBegin).getBegin();
              int end = tokenById.get(i).getEnd();

              // Add a semantic argument unless it is the (V*) span, which has the
              // same offsets as the predicate itself
              if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                SemArg arg = new SemArg(aJCas, begin, end);
                arg.addToIndexes();

                SemArgLink link = new SemArgLink(aJCas);
                link.setRole(currentArgType);
                link.setTarget(arg);
                args.add(link);
              }

              // Forget remembered arg
              currentArgBegin = -1;
              currentArgType = null;
            }
          }

          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      if (readConstituent) {
        converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));
      }

      // One sentence per line.
      doc.add("\n");
    }

    doc.close();
  }
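
Example #4 reads the CoNLL-2012 layout, where each line starts with document and part metadata and always ends with the coreference column (hence word[word.length - 1]). One plausible zero-based mapping, following the official CoNLL-2012 column order; the actual constants in the reader may differ:

  // Assumed CoNLL-2012 column indices (zero-based); coreference is always the last column:
  private static final int DOCUMENT_ID = 0, PART_NUMBER = 1, ID = 2, FORM = 3, POS = 4,
      PARSE = 5, LEMMA = 6, PRED = 7, WORD_SENSE = 8, SPEAKER = 9, NAMED_ENTITIES = 10,
      APRED = 11; // APRED + p addresses the p-th predicate-argument column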
Example #5
  public static Tree createStanfordTree(Annotation root, TreeFactory tFact) {
    JCas aJCas;
    try {
      aJCas = root.getCAS().getJCas();
    } catch (CASException e) {
      throw new IllegalStateException("Unable to get JCas from CAS", e);
    }

    // define the new (root) node
    Tree rootNode;

    // before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
      Constituent node = (Constituent) root;
      List<Tree> childNodes = new ArrayList<Tree>();

      // get childNodes from child annotations
      FSArray children = node.getChildren();
      for (int i = 0; i < children.size(); i++) {
        childNodes.add(createStanfordTree(node.getChildren(i), tFact));
      }

      // now create the node with its children
      rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);

    } else {
      // Handle leaf annotations.
      // Leaves are always Token annotations. We also have to insert a preterminal
      // node carrying the value of the POS annotation on the token, because the
      // POS is not stored directly within the tree.
      Token wordAnnotation = (Token) root;

      // create leaf-node for the tree
      Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

      // record the preceding and trailing whitespace in the leaf node
      StringBuilder preWhitespaces = new StringBuilder();
      StringBuilder trailWhitespaces = new StringBuilder();

      List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
      List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

      if (precedingTokenList.size() > 0) {
        Token precedingToken = precedingTokenList.get(0);
        int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
        for (int i = 0; i < precedingWhitespaces; i++) {
          preWhitespaces.append(" ");
        }
      }
      if (followingTokenList.size() > 0) {
        Token followingToken = followingTokenList.get(0);
        int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
        for (int i = 0; i < trailingWhitespaces; i++) {
          trailWhitespaces.append(" ");
        }
      }

      // store the whitespace information on the node label as
      // CoreAnnotations.BeforeAnnotation and CoreAnnotations.AfterAnnotation
      ((CoreLabel) wordNode.label())
          .set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
      ((CoreLabel) wordNode.label())
          .set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

      // get the POS annotation covering the token - there should be exactly one
      List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
      assert coveredPos.size() == 1;
      POS pos = coveredPos.get(0);

      // create POS-Node in the tree and attach word-node to it
      rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList(wordNode));
    }

    return rootNode;
  }
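
Because the leaf labels are cast to CoreLabel, the TreeFactory passed in must produce CoreLabel-backed nodes. A minimal usage sketch, assuming 'root' is the topmost Constituent annotation of an already-parsed sentence:

  // Hypothetical caller (sketch): rebuild a Stanford tree from a UIMA constituent.
  TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory());
  Tree tree = createStanfordTree(root, tFact);
  tree.pennPrint(); // print the reconstructed parse in Penn Treebank bracket notation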