// TODO: initialize document language from article metadata
private void initializeCas(JCas jCas, PubmedArticle article)
{
    JCasBuilder builder = new JCasBuilder(jCas);

    jCas.setDocumentLanguage("en");

    if (article.getMedlineCitation().getArticle().getAbstract() != null) {
        String abstractText = article.getMedlineCitation().getArticle().getAbstract()
                .getAbstractText();
        builder.add(abstractText);
    }

    String articleTitle = article.getMedlineCitation().getArticle().getArticleTitle();
    builder.add(articleTitle);

    builder.close();
}
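// A minimal, self-contained sketch of the JCasBuilder pattern used by initializeCas() above:
// text fragments are appended one after another, the builder tracks the growing offset, and
// close() sets the final document text on the CAS. This assumes uimaFIT's JCasBuilder and
// JCasFactory (org.apache.uima.fit.factory) are on the classpath; the example strings are
// made up and are not taken from an actual PubmedArticle.
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;

public class JCasBuilderSketch
{
    public static void main(String[] args) throws Exception
    {
        JCas jCas = JCasFactory.createJCas();
        jCas.setDocumentLanguage("en");

        JCasBuilder builder = new JCasBuilder(jCas);
        builder.add("A made-up abstract sentence."); // analogous to the abstract text
        builder.add("\n");
        builder.add("A made-up article title");      // analogous to the article title
        builder.close();                             // finalizes jCas.getDocumentText()

        System.out.println(jCas.getDocumentText());
    }
}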
public void convert(JCas aJCas, BufferedReader aReader)
    throws IOException
{
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences. This can happen when there are multiple end-of-sentence
            // markers following each other.
            continue;
        }

        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;

        // Tokens, Lemma, POS
        Map<Integer, Token> tokens = new HashMap<Integer, Token>();
        List<SemanticPredicate> preds = new ArrayList<>();

        for (String[] word : words) {
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(Integer.valueOf(word[ID]), token);
            doc.add(" ");

            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }

            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POS]);
                pos.addToIndexes();
                token.setPos(pos);
            }

            // Read morphological features
            if (!UNUSED.equals(word[FEAT]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
                        token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEAT]);
                morphtag.addToIndexes();
            }

            // Read semantic predicate
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemanticPredicate pred = new SemanticPredicate(aJCas, token.getBegin(),
                        token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }

            sentenceEnd = token.getEnd();
        }

        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.valueOf(word[ID]);
                    int govId = Integer.valueOf(word[HEAD]);

                    // Model the root as a loop onto itself
                    if (govId == 0) {
                        Dependency rel = new ROOT(aJCas);
                        rel.setGovernor(tokens.get(depId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        rel.addToIndexes();
                    }
                    else {
                        Dependency rel = new Dependency(aJCas);
                        rel.setGovernor(tokens.get(govId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        rel.addToIndexes();
                    }
                }
            }
        }

        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time
            for (int p = 0; p < preds.size(); p++) {
                List<SemanticArgument> args = new ArrayList<SemanticArgument>();
                for (String[] word : words) {
                    if (!UNUSED.equals(word[APRED + p])) {
                        Token token = tokens.get(Integer.valueOf(word[ID]));
                        SemanticArgument arg = new SemanticArgument(aJCas, token.getBegin(),
                                token.getEnd());
                        arg.setRole(word[APRED + p]);
                        arg.addToIndexes();
                        args.add(arg);
                    }
                }
                SemanticPredicate pred = preds.get(p);
                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }

        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();

        // One sentence per line.
        doc.add("\n");
    }

    doc.close();
}
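// The convert() method above references column-index constants (ID, FORM, LEMMA, POS, FEAT,
// HEAD, DEPREL, PRED, APRED) and a readSentence() helper that are defined elsewhere in the
// reader class. The following is only a sketch of what they plausibly look like, assuming the
// standard tab-separated CoNLL-2009 column layout; the actual definitions in the reader may
// differ.
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class Conll2009ColumnsSketch
{
    // Marker for an empty column in CoNLL-2009 (assumption)
    private static final String UNUSED = "_";

    // CoNLL-2009 columns: ID FORM LEMMA PLEMMA POS PPOS FEAT PFEAT HEAD PHEAD DEPREL PDEPREL
    // FILLPRED PRED APRED1..APREDn
    private static final int ID = 0;
    private static final int FORM = 1;
    private static final int LEMMA = 2;
    private static final int POS = 4;
    private static final int FEAT = 6;
    private static final int HEAD = 8;
    private static final int DEPREL = 10;
    private static final int PRED = 13;
    private static final int APRED = 14; // first of the per-predicate argument columns

    // Reads one sentence: one token per line, columns separated by tabs, sentences separated
    // by blank lines. Returns null at end of file.
    private static List<String[]> readSentence(BufferedReader aReader)
        throws IOException
    {
        List<String[]> words = new ArrayList<>();
        String line = null;
        while ((line = aReader.readLine()) != null) {
            if (line.trim().isEmpty()) {
                break; // end of sentence
            }
            words.add(line.split("\t"));
        }
        // Signal end of file when the stream is exhausted and nothing was read
        return (line == null && words.isEmpty()) ? null : words;
    }
}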
public void convert(JCas aJCas, BufferedReader aReader)
    throws IOException
{
    try {
        if (readPos) {
            posMappingProvider.configure(aJCas.getCas());
        }
        if (readConstituent) {
            constituentMappingProvider.configure(aJCas.getCas());
        }
    }
    catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
    }

    Map<String, CoreferenceLink> chains = new HashMap<>();

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences. This can happen when there are multiple end-of-sentence
            // markers following each other.
            continue;
        }

        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;

        StringBuilder parse = new StringBuilder();

        // Tokens, Lemma, POS
        Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
        List<SemPred> preds = new ArrayList<>();

        for (String[] word : words) {
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokenById.put(Integer.valueOf(word[ID]), token);
            doc.add(" ");

            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }

            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POS]);
                pos.addToIndexes();
                token.setPos(pos);
            }

            // Read semantic predicate
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }

            // Read constituent parse bit; "*" is a placeholder for the current token
            if (!UNUSED.equals(word[PARSE]) && readConstituent) {
                String fixed = word[PARSE].replace("*",
                        "(" + word[POS] + " " + word[FORM] + ")");
                parse.append(fixed);
            }

            // Read word sense
            if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
                WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
                wordSense.setValue(word[WORD_SENSE]);
                wordSense.addToIndexes();
            }

            // Read coreference (always the last column)
            if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
                String[] chainFragments = word[word.length - 1].split("\\|");
                for (String chainFragment : chainFragments) {
                    boolean beginning = chainFragment.startsWith("(");
                    boolean ending = chainFragment.endsWith(")");
                    String chainId = chainFragment.substring(beginning ? 1 : 0,
                            ending ? chainFragment.length() - 1 : chainFragment.length());
                    CoreferenceLink link = chains.get(chainId);
                    if (beginning) {
                        if (link == null) {
                            link = new CoreferenceLink(aJCas);
                            CoreferenceChain chain = new CoreferenceChain(aJCas);
                            chain.setFirst(link);
                            chain.addToIndexes();
                        }
                        else {
                            CoreferenceLink newLink = new CoreferenceLink(aJCas);
                            link.setNext(newLink);
                            link = newLink;
                        }
                        link.setReferenceType(chainId);
                        link.setBegin(token.getBegin());
                    }
                    if (ending) {
                        link.setEnd(token.getEnd());
                        link.addToIndexes();
                    }
                    chains.put(chainId, link);
                }
            }

            sentenceEnd = token.getEnd();
        }

        // Named entities
        if (readNamedEntity) {
            int currentNeBegin = -1;
            String currentNeType = null;
            for (int i = 0; i < words.size(); i++) {
                String ne = words.get(i)[NAMED_ENTITIES];
                boolean beginning = ne.startsWith("(");
                boolean ending = ne.endsWith(")");

                // When a NE is beginning, we remember what the NE is and where it began
                if (beginning) {
                    // The NE begins with "(" and ends with either ")" or "*", so we trim
                    // the first and last character
                    currentNeType = ne.substring(1, ne.length() - 1);
                    currentNeBegin = i;
                }

                // We need to create an annotation if the current token is the end of an
                // annotation
                if (ending) {
                    // Determine begin and end of named entity
                    int begin = tokenById.get(currentNeBegin).getBegin();
                    int end = tokenById.get(i).getEnd();

                    // Add named entity
                    NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
                    namedEntity.setValue(currentNeType);
                    namedEntity.addToIndexes();

                    // Forget remembered named entity
                    currentNeBegin = -1;
                    currentNeType = null;
                }
            }
        }

        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time
            for (int p = 0; p < preds.size(); p++) {
                SemPred pred = preds.get(p);
                List<SemArgLink> args = new ArrayList<>();

                int currentArgBegin = -1;
                String currentArgType = null;
                for (int i = 0; i < words.size(); i++) {
                    String ne = words.get(i)[APRED + p];
                    boolean beginning = ne.startsWith("(");
                    boolean ending = ne.endsWith(")");

                    // When an argument is beginning, we remember what it is and where it began
                    if (beginning) {
                        // The argument begins with "(" and ends with either ")" or "*", so
                        // we trim the first and last character
                        currentArgType = ne.substring(1, ne.length() - 1);
                        currentArgBegin = i;
                    }

                    // We need to create an annotation if the current token is the end of an
                    // annotation
                    if (ending) {
                        // Determine begin and end of argument
                        int begin = tokenById.get(currentArgBegin).getBegin();
                        int end = tokenById.get(i).getEnd();

                        // Add the argument unless it is a (V*), which has the same offsets as
                        // the predicate
                        if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                            SemArg arg = new SemArg(aJCas, begin, end);
                            arg.addToIndexes();

                            SemArgLink link = new SemArgLink(aJCas);
                            link.setRole(currentArgType);
                            link.setTarget(arg);
                            args.add(link);
                        }

                        // Forget remembered argument
                        currentArgBegin = -1;
                        currentArgType = null;
                    }
                }

                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }

        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();

        // Constituents (the parse buffer is only filled when the parse column is read)
        if (readConstituent) {
            converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));
        }

        // One sentence per line.
        doc.add("\n");
    }

    doc.close();
}
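// As with the CoNLL-2009 reader, the column-index constants used above (ID, FORM, POS, PARSE,
// LEMMA, PRED, WORD_SENSE, NAMED_ENTITIES, APRED) and the UNUSED marker are defined elsewhere
// in the reader class. The sketch below shows the CoNLL-2012 / OntoNotes *_conll column layout
// they are assumed to correspond to; verify against the actual reader before relying on it.
public class Conll2012ColumnsSketch
{
    // Marker for an empty column (assumption; CoNLL-2012 typically uses "-")
    private static final String UNUSED = "-";

    // CoNLL-2012 columns: document id, part number, word number, word, POS, parse bit,
    // predicate lemma, predicate frameset id, word sense, speaker/author, named entities,
    // one argument column per predicate, and the coreference chain ids as the last column.
    private static final int ID = 2;              // word number within the sentence
    private static final int FORM = 3;            // surface form
    private static final int POS = 4;             // part-of-speech tag
    private static final int PARSE = 5;           // parse bit, e.g. "(S(NP*)" or "(VP*))"
    private static final int LEMMA = 6;           // predicate lemma
    private static final int PRED = 7;            // predicate frameset id
    private static final int WORD_SENSE = 8;      // word sense
    private static final int NAMED_ENTITIES = 10; // named entity column, e.g. "(PERSON*"
    private static final int APRED = 11;          // first per-predicate argument column
    // Coreference has no fixed index: the reader accesses it as word[word.length - 1].

    // Example of how convert() reassembles the parse bits: for the tokens "John" (NNP, parse
    // bit "(S(NP*)") and "sleeps" (VBZ, parse bit "(VP*))"), replacing "*" with "(POS FORM)"
    // and concatenating yields "(S(NP(NNP John))(VP(VBZ sleeps)))", a complete Penn tree.
}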