private void convert(JCas aJCas, PrintWriter aOut) { Type chunkType = JCasUtil.getType(aJCas, Chunk.class); Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue"); for (Sentence sentence : select(aJCas, Sentence.class)) { HashMap<Token, Row> ctokens = new LinkedHashMap<Token, Row>(); // Tokens List<Token> tokens = selectCovered(Token.class, sentence); // Chunks IobEncoder encoder = new IobEncoder(aJCas.getCas(), chunkType, chunkValue); for (int i = 0; i < tokens.size(); i++) { Row row = new Row(); row.id = i + 1; row.token = tokens.get(i); row.chunk = encoder.encode(tokens.get(i)); ctokens.put(row.token, row); } // Write sentence in CONLL 2006 format for (Row row : ctokens.values()) { String pos = UNUSED; if (writePos && (row.token.getPos() != null)) { POS posAnno = row.token.getPos(); pos = posAnno.getPosValue(); } String chunk = UNUSED; if (writeChunk && (row.chunk != null)) { chunk = encoder.encode(row.token); } aOut.printf("%s %s %s\n", row.token.getCoveredText(), pos, chunk); } aOut.println(); } }
/**
 * Writes the annotations of the CAS in CONLL 2009 format: one token per line with 15
 * tab-separated columns (ID, FORM, LEMMA, PLEMMA, POS, PPOS, FEAT, PFEAT, HEAD, PHEAD,
 * DEPREL, PDEPREL, FILLPRED, PRED, APREDs), sentences separated by a blank line.
 * Predicted columns (PLEMMA, PPOS, ...) are simply copies of the gold columns.
 *
 * @param aJCas
 *            the CAS to serialize.
 * @param aOut
 *            target writer (not closed by this method).
 */
private void convert(JCas aJCas, PrintWriter aOut)
{
    // CAS-wide lookups: token -> covering predicates, argument -> covered tokens.
    Map<Token, Collection<SemanticPredicate>> predIdx = indexCovered(aJCas, Token.class,
            SemanticPredicate.class);
    Map<SemanticArgument, Collection<Token>> argIdx = indexCovered(aJCas,
            SemanticArgument.class, Token.class);
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        // LinkedHashMap keeps token/row order = surface order for the output loop below.
        HashMap<Token, Row> ctokens = new LinkedHashMap<Token, Row>();

        // Tokens
        List<Token> tokens = selectCovered(Token.class, sentence);

        // Check if we should try to include the FEATS in output. FEATS are only written
        // when there is exactly one MorphologicalFeatures annotation per token (matched
        // positionally below).
        List<MorphologicalFeatures> morphology = selectCovered(MorphologicalFeatures.class,
                sentence);
        boolean useFeats = tokens.size() == morphology.size();

        List<SemanticPredicate> preds = selectCovered(SemanticPredicate.class, sentence);

        for (int i = 0; i < tokens.size(); i++) {
            Row row = new Row();
            row.id = i + 1; // CONLL token ids are 1-based
            row.token = tokens.get(i);
            row.args = new SemanticArgument[preds.size()]; // one APRED slot per predicate
            if (useFeats) {
                row.feats = morphology.get(i);
            }

            // If there are multiple semantic predicates for the current token, then
            // we keep only the first
            Collection<SemanticPredicate> predsForToken = predIdx.get(row.token);
            if (predsForToken != null && !predsForToken.isEmpty()) {
                row.pred = predsForToken.iterator().next();
            }
            ctokens.put(row.token, row);
        }

        // Dependencies - attach each relation to the row of its dependent token.
        for (Dependency rel : selectCovered(Dependency.class, sentence)) {
            ctokens.get(rel.getDependent()).deprel = rel;
        }

        // Semantic arguments - fill column p of every token covered by an argument of
        // predicate p.
        for (int p = 0; p < preds.size(); p++) {
            FSArray args = preds.get(p).getArguments();
            for (SemanticArgument arg : select(args, SemanticArgument.class)) {
                for (Token t : argIdx.get(arg)) {
                    Row row = ctokens.get(t);
                    row.args[p] = arg;
                }
            }
        }

        // Write sentence in CONLL 2009 format
        for (Row row : ctokens.values()) {
            int id = row.id;

            String form = row.token.getCoveredText();

            String lemma = UNUSED;
            if (writeLemma && (row.token.getLemma() != null)) {
                lemma = row.token.getLemma().getValue();
            }
            String plemma = lemma; // predicted lemma column mirrors gold lemma

            String pos = UNUSED;
            if (writePos && (row.token.getPos() != null)) {
                POS posAnno = row.token.getPos();
                pos = posAnno.getPosValue();
            }
            String ppos = pos; // predicted POS column mirrors gold POS

            String feat = UNUSED;
            if (writeMorph && (row.feats != null)) {
                feat = row.feats.getValue();
            }
            String pfeat = feat;

            int headId = UNUSED_INT;
            String deprel = UNUSED;
            if (writeDependency && (row.deprel != null)) {
                deprel = row.deprel.getDependencyType();
                headId = ctokens.get(row.deprel.getGovernor()).id;
                if (headId == row.id) {
                    // ROOT dependencies may be modeled as a loop, ignore these.
                    headId = 0;
                }
            }

            String head = UNUSED;
            if (headId != UNUSED_INT) {
                head = Integer.toString(headId);
            }

            String phead = head;
            String pdeprel = deprel;

            String fillpred = UNUSED;
            String pred = UNUSED;
            StringBuilder apreds = new StringBuilder();
            if (writeSemanticPredicate) {
                if (row.pred != null) {
                    fillpred = "Y";
                    pred = row.pred.getCategory();
                }

                // One tab-separated APRED column per predicate in the sentence.
                for (SemanticArgument arg : row.args) {
                    if (apreds.length() > 0) {
                        apreds.append('\t');
                    }
                    apreds.append(arg != null ? arg.getRole() : UNUSED);
                }
            }

            aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id,
                    form, lemma, plemma, pos, ppos, feat, pfeat, head, phead, deprel,
                    pdeprel, fillpred, pred, apreds);
        }

        aOut.println();
    }
}
public void convert(JCas aJCas, BufferedReader aReader) throws IOException { if (readPos) { try { posMappingProvider.configure(aJCas.getCas()); } catch (AnalysisEngineProcessException e) { throw new IOException(e); } } JCasBuilder doc = new JCasBuilder(aJCas); List<String[]> words; while ((words = readSentence(aReader)) != null) { if (words.isEmpty()) { // Ignore empty sentences. This can happen when there are multiple end-of-sentence // markers following each other. continue; } int sentenceBegin = doc.getPosition(); int sentenceEnd = sentenceBegin; // Tokens, Lemma, POS Map<Integer, Token> tokens = new HashMap<Integer, Token>(); List<SemanticPredicate> preds = new ArrayList<>(); for (String[] word : words) { // Read token Token token = doc.add(word[FORM], Token.class); tokens.put(Integer.valueOf(word[ID]), token); doc.add(" "); // Read lemma if (!UNUSED.equals(word[LEMMA]) && readLemma) { Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); lemma.setValue(word[LEMMA]); lemma.addToIndexes(); token.setLemma(lemma); } // Read part-of-speech tag if (!UNUSED.equals(word[POS]) && readPos) { Type posTag = posMappingProvider.getTagType(word[POS]); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); pos.setPosValue(word[POS]); pos.addToIndexes(); token.setPos(pos); } // Read morphological features if (!UNUSED.equals(word[FEAT]) && readMorph) { MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd()); morphtag.setValue(word[FEAT]); morphtag.addToIndexes(); } if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) { SemanticPredicate pred = new SemanticPredicate(aJCas, token.getBegin(), token.getEnd()); pred.setCategory(word[PRED]); pred.addToIndexes(); preds.add(pred); } sentenceEnd = token.getEnd(); } // Dependencies if (readDependency) { for (String[] word : words) { if (!UNUSED.equals(word[DEPREL])) { int depId = Integer.valueOf(word[ID]); int govId = 
Integer.valueOf(word[HEAD]); // Model the root as a loop onto itself if (govId == 0) { Dependency rel = new ROOT(aJCas); rel.setGovernor(tokens.get(depId)); rel.setDependent(tokens.get(depId)); rel.setDependencyType(word[DEPREL]); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); rel.addToIndexes(); } else { Dependency rel = new Dependency(aJCas); rel.setGovernor(tokens.get(govId)); rel.setDependent(tokens.get(depId)); rel.setDependencyType(word[DEPREL]); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); rel.addToIndexes(); } } } } // Semantic arguments if (readSemanticPredicate) { // Get arguments for one predicate at a time for (int p = 0; p < preds.size(); p++) { List<SemanticArgument> args = new ArrayList<SemanticArgument>(); for (String[] word : words) { if (!UNUSED.equals(word[APRED + p])) { Token token = tokens.get(Integer.valueOf(word[ID])); SemanticArgument arg = new SemanticArgument(aJCas, token.getBegin(), token.getEnd()); arg.setRole(word[APRED + p]); arg.addToIndexes(); args.add(arg); } } SemanticPredicate pred = preds.get(p); pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args)); } } // Sentence Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); sentence.addToIndexes(); // Once sentence per line. doc.add("\n"); } doc.close(); }
/**
 * Reads CONLL 2012 style data from the given reader into the CAS: tokens, lemmas, POS
 * tags, constituency parse fragments, word senses, named entities, semantic role labels
 * and coreference chains, each controlled by the corresponding {@code read*} flag.
 *
 * @param aJCas
 *            the CAS to populate.
 * @param aReader
 *            reader positioned at the start of the data.
 * @throws IOException
 *             if reading fails or a mapping provider cannot be configured.
 */
public void convert(JCas aJCas, BufferedReader aReader)
    throws IOException
{
    try {
        if (readPos) {
            posMappingProvider.configure(aJCas.getCas());
        }
        if (readConstituent) {
            constituentMappingProvider.configure(aJCas.getCas());
        }
    }
    catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
    }

    // Open coreference chains, keyed by chain id; persists ACROSS sentences so that a
    // chain can span the whole document.
    Map<String, CoreferenceLink> chains = new HashMap<>();

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences. This can happen when there are multiple end-of-sentence
            // markers following each other.
            continue;
        }

        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;

        // Accumulates the bracketed parse string for the whole sentence.
        StringBuilder parse = new StringBuilder();

        // Tokens, Lemma, POS
        Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
        List<SemPred> preds = new ArrayList<>();
        for (String[] word : words) {
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokenById.put(Integer.valueOf(word[ID]), token);
            doc.add(" ");

            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }

            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POS]);
                pos.addToIndexes();
                token.setPos(pos);
            }

            // Read semantic predicate
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }

            // Read constituency parse fragment; "*" marks the position of the current
            // terminal, which is expanded to "(POS FORM)".
            if (!UNUSED.equals(word[PARSE]) && readConstituent) {
                String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM]
                        + ")");
                parse.append(fixed);
            }

            // Read word sense
            if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
                WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
                wordSense.setValue(word[WORD_SENSE]);
                wordSense.addToIndexes();
            }

            // Read coreference - the last column holds "|"-separated fragments such as
            // "(42", "42)" or "(42)" marking mention boundaries for chain id 42.
            if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
                String[] chainFragments = word[word.length - 1].split("\\|");
                for (String chainFragment : chainFragments) {
                    boolean beginning = chainFragment.startsWith("(");
                    boolean ending = chainFragment.endsWith(")");

                    // Strip the bracket characters to obtain the bare chain id.
                    String chainId = chainFragment.substring(beginning ? 1 : 0,
                            ending ? chainFragment.length() - 1 : chainFragment.length());

                    CoreferenceLink link = chains.get(chainId);
                    if (beginning) {
                        if (link == null) {
                            // First mention of this chain: start a new chain.
                            link = new CoreferenceLink(aJCas);
                            CoreferenceChain chain = new CoreferenceChain(aJCas);
                            chain.setFirst(link);
                            chain.addToIndexes();
                        }
                        else {
                            // Subsequent mention: append a new link to the chain.
                            CoreferenceLink newLink = new CoreferenceLink(aJCas);
                            link.setNext(newLink);
                            link = newLink;
                        }
                        link.setReferenceType(chainId);
                        link.setBegin(token.getBegin());
                    }

                    if (ending) {
                        // NOTE(review): if a ")" fragment appears without a prior "("
                        // for this chain id, link is null here and this throws an NPE -
                        // presumably well-formed input guarantees matched brackets;
                        // confirm against the format specification.
                        link.setEnd(token.getEnd());
                        link.addToIndexes();
                    }

                    // Remember the most recent link of this chain for the next mention.
                    chains.put(chainId, link);
                }
            }

            sentenceEnd = token.getEnd();
        }

        // Named entities - encoded as bracketed spans over the token sequence, e.g.
        // "(PERSON*" ... "*)"; nesting is not tracked (a single begin/type is remembered).
        if (readNamedEntity) {
            int currentNeBegin = -1;
            String currentNeType = null;
            for (int i = 0; i < words.size(); i++) {
                String ne = words.get(i)[NAMED_ENTITIES];
                boolean beginning = ne.startsWith("(");
                boolean ending = ne.endsWith(")");

                // When a NE is beginning, we remember what the NE is and where it began
                if (beginning) {
                    // The NE is beginning with "(" and either ending with "(" or "*", so we trim
                    // the first and last character
                    currentNeType = ne.substring(1, ne.length() - 1);
                    // NOTE(review): the word index i is used as a key into tokenById,
                    // which is keyed by the parsed ID column - this only lines up if the
                    // ID column is 0-based and gap-free; confirm for this corpus format.
                    currentNeBegin = i;
                }

                // We need to create an annotation if the current token is the end of an annotation
                if (ending) {
                    // Determine begin and end of named entity
                    int begin = tokenById.get(currentNeBegin).getBegin();
                    int end = tokenById.get(i).getEnd();

                    // Add named entity
                    NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
                    namedEntity.setValue(currentNeType);
                    namedEntity.addToIndexes();

                    // Forget remembered named entity
                    currentNeBegin = -1;
                    currentNeType = null;
                }
            }
        }

        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time
            for (int p = 0; p < preds.size(); p++) {
                SemPred pred = preds.get(p);
                List<SemArgLink> args = new ArrayList<>();

                int currentArgBegin = -1;
                String currentArgType = null;
                for (int i = 0; i < words.size(); i++) {
                    String ne = words.get(i)[APRED + p];
                    boolean beginning = ne.startsWith("(");
                    boolean ending = ne.endsWith(")");

                    // When a arg is beginning, we remember what the NE is and where it began
                    if (beginning) {
                        // The arg is beginning with "(" and either ending with "(" or "*", so
                        // we trim the first and last character
                        currentArgType = ne.substring(1, ne.length() - 1);
                        currentArgBegin = i;
                    }

                    // We need to create an annotation if the current token is the end of an
                    // annotation
                    if (ending) {
                        // Determine begin and end of argument
                        int begin = tokenById.get(currentArgBegin).getBegin();
                        int end = tokenById.get(i).getEnd();

                        // Add named entity unless it is a (V*) which has the same offsets as
                        // the predicate
                        if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                            SemArg arg = new SemArg(aJCas, begin, end);
                            arg.addToIndexes();

                            SemArgLink link = new SemArgLink(aJCas);
                            link.setRole(currentArgType);
                            link.setTarget(arg);
                            args.add(link);
                        }

                        // Forget remembered arg
                        currentArgBegin = -1;
                        currentArgType = null;
                    }
                }

                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }

        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();

        // Convert the accumulated bracketed parse into constituent annotations.
        converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));

        // One sentence per line.
        doc.add("\n");
    }

    doc.close();
}
/**
 * Recursively converts a UIMA constituency annotation (a {@link Constituent} tree whose
 * leaves are {@link Token}s) into a Stanford {@link Tree} built with the given factory.
 * For a leaf token, a preterminal node labeled with the token's POS value is inserted
 * above the word node, and the surrounding whitespace is recorded on the word label via
 * {@code CoreAnnotations.BeforeAnnotation}/{@code AfterAnnotation}.
 *
 * @param root
 *            a {@link Constituent} (inner node) or {@link Token} (leaf).
 * @param tFact
 *            factory used to create tree nodes and leaves.
 * @return the corresponding Stanford tree node.
 * @throws IllegalStateException
 *             if the JCas cannot be obtained from the annotation's CAS.
 */
public static Tree createStanfordTree(Annotation root, TreeFactory tFact)
{
    JCas aJCas;
    try {
        aJCas = root.getCAS().getJCas();
    }
    catch (CASException e) {
        // Chain the original exception; the previous code discarded it, destroying
        // the diagnostic information.
        throw new IllegalStateException("Unable to get JCas from JCas wrapper", e);
    }

    // define the new (root) node
    Tree rootNode;

    // before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
        Constituent node = (Constituent) root;
        List<Tree> childNodes = new ArrayList<Tree>();

        // get childNodes from child annotations
        FSArray children = node.getChildren();
        for (int i = 0; i < children.size(); i++) {
            childNodes.add(createStanfordTree(node.getChildren(i), tFact));
        }

        // now create the node with its children
        rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);
    }
    else {
        // Handle leaf annotations. Leafs are always Token-annotations. We also have to
        // insert a preterminal node with the value of the POS-annotation on the token,
        // because the POS is not directly stored within the tree.
        Token wordAnnotation = (Token) root;

        // create leaf-node for the tree
        Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

        // create information about preceding and trailing whitespaces in the leaf node
        StringBuilder preWhitespaces = new StringBuilder();
        StringBuilder trailWhitespaces = new StringBuilder();

        List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
        List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

        if (precedingTokenList.size() > 0) {
            Token precedingToken = precedingTokenList.get(0);
            int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
            for (int i = 0; i < precedingWhitespaces; i++) {
                preWhitespaces.append(" ");
            }
        }
        if (followingTokenList.size() > 0) {
            Token followingToken = followingTokenList.get(0);
            int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
            for (int i = 0; i < trailingWhitespaces; i++) {
                trailWhitespaces.append(" ");
            }
        }

        // write whitespace information as CoreAnnotation.BeforeAnnotation and
        // CoreAnnotation.AfterAnnotation to the node add annotation to list and write back
        // to node label
        ((CoreLabel) wordNode.label())
                .set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
        ((CoreLabel) wordNode.label())
                .set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

        // get the POS annotation covering this token
        List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);

        // the POS should only cover one token
        assert coveredPos.size() == 1;
        POS pos = coveredPos.get(0);

        // create POS-node in the tree and attach the word-node to it
        // (varargs form replaces the explicit single-element Tree[] array)
        rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList(wordNode));
    }

    return rootNode;
}