Example no. 1
0
  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    try {
      if (readPos) {
        posMappingProvider.configure(aJCas.getCas());
      }

      if (readConstituent) {
        constituentMappingProvider.configure(aJCas.getCas());
      }
    } catch (AnalysisEngineProcessException e) {
      throw new IOException(e);
    }

    Map<String, CoreferenceLink> chains = new HashMap<>();

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      StringBuilder parse = new StringBuilder();

      // Tokens, Lemma, POS
      Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
      List<SemPred> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokenById.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

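        // Read semantic predicate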
        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

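        // Read constituent parse fragment ("*" is expanded to "(POS FORM)")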
        if (!UNUSED.equals(word[PARSE]) && readConstituent) {
          String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")");
          parse.append(fixed);
        }

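        // Read word sense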
        if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
          WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
          wordSense.setValue(word[WORD_SENSE]);
          wordSense.addToIndexes();
        }

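        // Read coreference chains from the last column; fragments are separated by "|"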
        if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
          String[] chainFragments = word[word.length - 1].split("\\|");
          for (String chainFragment : chainFragments) {
            boolean beginning = chainFragment.startsWith("(");
            boolean ending = chainFragment.endsWith(")");

            String chainId =
                chainFragment.substring(
                    beginning ? 1 : 0,
                    ending ? chainFragment.length() - 1 : chainFragment.length());

            CoreferenceLink link = chains.get(chainId);
            if (beginning) {
              if (link == null) {
                link = new CoreferenceLink(aJCas);
                CoreferenceChain chain = new CoreferenceChain(aJCas);
                chain.setFirst(link);
                chain.addToIndexes();
              } else {
                CoreferenceLink newLink = new CoreferenceLink(aJCas);
                link.setNext(newLink);
                link = newLink;
              }
              link.setReferenceType(chainId);
              link.setBegin(token.getBegin());
            }

            if (ending) {
              link.setEnd(token.getEnd());
              link.addToIndexes();
            }

            chains.put(chainId, link);
          }
        }

        sentenceEnd = token.getEnd();
      }

      // Named entities
      if (readNamedEntity) {
        int currentNeBegin = -1;
        String currentNeType = null;
        for (int i = 0; i < words.size(); i++) {
          String ne = words.get(i)[NAMED_ENTITIES];
          boolean beginning = ne.startsWith("(");
          boolean ending = ne.endsWith(")");

          // When an NE is beginning, we remember what the NE is and where it began
          if (beginning) {
            // The NE begins with "(" and ends with either ")" or "*", so we trim
            // the first and last character
            currentNeType = ne.substring(1, ne.length() - 1);
            currentNeBegin = i;
          }

          // We need to create an annotation if the current token is the end of an annotation
          if (ending) {
            // Determine begin and end of named entity
            int begin = tokenById.get(currentNeBegin).getBegin();
            int end = tokenById.get(i).getEnd();

            // Add named entity
            NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
            namedEntity.setValue(currentNeType);
            namedEntity.addToIndexes();

            // Forget remembered named entity
            currentNeBegin = -1;
            currentNeType = null;
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time
        for (int p = 0; p < preds.size(); p++) {
          SemPred pred = preds.get(p);
          List<SemArgLink> args = new ArrayList<>();

          int currentArgBegin = -1;
          String currentArgType = null;
          for (int i = 0; i < words.size(); i++) {
            String ne = words.get(i)[APRED + p];
            boolean beginning = ne.startsWith("(");
            boolean ending = ne.endsWith(")");

            // When an arg is beginning, we remember what the arg is and where it began
            if (beginning) {
              // The arg begins with "(" and ends with either ")" or "*", so
              // we trim the first and last character
              currentArgType = ne.substring(1, ne.length() - 1);
              currentArgBegin = i;
            }

            // We need to create an annotation if the current token is the end of an
            // annotation
            if (ending) {
              // Determine begin and end of argument
              int begin = tokenById.get(currentArgBegin).getBegin();
              int end = tokenById.get(i).getEnd();

              // Add the semantic argument unless it is a (V*) which has the same
              // offsets as the predicate
              if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                SemArg arg = new SemArg(aJCas, begin, end);
                arg.addToIndexes();

                SemArgLink link = new SemArgLink(aJCas);
                link.setRole(currentArgType);
                link.setTarget(arg);
                args.add(link);
              }

              // Forget remembered arg
              currentArgBegin = -1;
              currentArgType = null;
            }
          }

          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      // Constituents (only if a parse was actually assembled)
      if (readConstituent) {
        converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));
      }

      // One sentence per line.
      doc.add("\n");
    }

    doc.close();
  }
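
  // Second convert variant: besides tokens, lemmas and POS tags it also reads
  // morphological features, dependency relations and semantic role arguments.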
  public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
      try {
        posMappingProvider.configure(aJCas.getCas());
      } catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
      }
    }

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
      if (words.isEmpty()) {
        // Ignore empty sentences. This can happen when there are multiple end-of-sentence
        // markers following each other.
        continue;
      }

      int sentenceBegin = doc.getPosition();
      int sentenceEnd = sentenceBegin;

      // Tokens, Lemma, POS
      Map<Integer, Token> tokens = new HashMap<Integer, Token>();
      List<SemanticPredicate> preds = new ArrayList<>();
      for (String[] word : words) {
        // Read token
        Token token = doc.add(word[FORM], Token.class);
        tokens.put(Integer.valueOf(word[ID]), token);
        doc.add(" ");

        // Read lemma
        if (!UNUSED.equals(word[LEMMA]) && readLemma) {
          Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
          lemma.setValue(word[LEMMA]);
          lemma.addToIndexes();
          token.setLemma(lemma);
        }

        // Read part-of-speech tag
        if (!UNUSED.equals(word[POS]) && readPos) {
          Type posTag = posMappingProvider.getTagType(word[POS]);
          POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
          pos.setPosValue(word[POS]);
          pos.addToIndexes();
          token.setPos(pos);
        }

        // Read morphological features
        if (!UNUSED.equals(word[FEAT]) && readMorph) {
          MorphologicalFeatures morphtag =
              new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
          morphtag.setValue(word[FEAT]);
          morphtag.addToIndexes();
        }

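        // Read semantic predicate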
        if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
          SemanticPredicate pred = new SemanticPredicate(aJCas, token.getBegin(), token.getEnd());
          pred.setCategory(word[PRED]);
          pred.addToIndexes();
          preds.add(pred);
        }

        sentenceEnd = token.getEnd();
      }

      // Dependencies
      if (readDependency) {
        for (String[] word : words) {
          if (!UNUSED.equals(word[DEPREL])) {
            int depId = Integer.valueOf(word[ID]);
            int govId = Integer.valueOf(word[HEAD]);

            // Model the root as a loop onto itself
            if (govId == 0) {
              Dependency rel = new ROOT(aJCas);
              rel.setGovernor(tokens.get(depId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            } else {
              Dependency rel = new Dependency(aJCas);
              rel.setGovernor(tokens.get(govId));
              rel.setDependent(tokens.get(depId));
              rel.setDependencyType(word[DEPREL]);
              rel.setBegin(rel.getDependent().getBegin());
              rel.setEnd(rel.getDependent().getEnd());
              rel.addToIndexes();
            }
          }
        }
      }

      // Semantic arguments
      if (readSemanticPredicate) {
        // Get arguments for one predicate at a time
        for (int p = 0; p < preds.size(); p++) {
          List<SemanticArgument> args = new ArrayList<SemanticArgument>();
          for (String[] word : words) {
            if (!UNUSED.equals(word[APRED + p])) {
              Token token = tokens.get(Integer.valueOf(word[ID]));
              SemanticArgument arg = new SemanticArgument(aJCas, token.getBegin(), token.getEnd());
              arg.setRole(word[APRED + p]);
              arg.addToIndexes();
              args.add(arg);
            }
          }
          SemanticPredicate pred = preds.get(p);
          pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
        }
      }

      // Sentence
      Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
      sentence.addToIndexes();

      // One sentence per line.
      doc.add("\n");
    }

    doc.close();
  }
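
  // Runs TreeTagger over the tokens of the CAS and adds POS and lemma annotations.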
  @Override
  public void process(final JCas aJCas) throws AnalysisEngineProcessException {
    final CAS cas = aJCas.getCas();

    modelProvider.configure(cas);
    posMappingProvider.configure(cas);

    TreeTaggerWrapper<Token> treetagger = modelProvider.getResource();

    try {
      List<Token> tokens = new ArrayList<Token>(select(aJCas, Token.class));
      final POS[] pos = new POS[tokens.size()];
      final Lemma[] lemma = new Lemma[tokens.size()];

      // Set the handler creating new UIMA annotations from the analyzed
      // tokens
      final AtomicInteger count = new AtomicInteger(0);
      treetagger.setHandler(
          new TokenHandler<Token>() {
            @Override
            public void token(Token aToken, String aPos, String aLemma) {
              synchronized (cas) {
                // Add the Part of Speech
                if (writePos && aPos != null) {
                  Type posTag = posMappingProvider.getTagType(aPos);
                  POS posAnno =
                      (POS) cas.createAnnotation(posTag, aToken.getBegin(), aToken.getEnd());
                  posAnno.setPosValue(internTags ? aPos.intern() : aPos);
                  aToken.setPos(posAnno);
                  pos[count.get()] = posAnno;
                }

                // Add the lemma
                if (writeLemma && aLemma != null) {
                  Lemma lemmaAnno = new Lemma(aJCas, aToken.getBegin(), aToken.getEnd());
                  lemmaAnno.setValue(internTags ? aLemma.intern() : aLemma);
                  aToken.setLemma(lemmaAnno);
                  lemma[count.get()] = lemmaAnno;
                }

                count.getAndIncrement();
              }
            }
          });

      treetagger.process(tokens);

      // Add the annotations to the indexes
      for (int i = 0; i < count.get(); i++) {
        if (pos[i] != null) {
          pos[i].addToIndexes();
        }
        if (lemma[i] != null) {
          lemma[i].addToIndexes();
        }
      }
    } catch (TreeTaggerException e) {
      throw new AnalysisEngineProcessException(e);
    } catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }