/**
 * Builds a conditional frequency distribution of n-grams from the given CAS and passes it to
 * {@code add(cfd)}.
 *
 * <p>For every annotation of {@code sentenceType} and every input path, the path is split at its
 * first {@code '/'} into at most two segments. The first segment is resolved to a type through
 * {@code getInputType}, the annotations of that type covered by the current sentence are selected,
 * and {@code createStringList} turns them into strings. For each length from {@code
 * minNgramLength} to {@code maxNgramLength} (inclusive), an {@code NGramStringIterable} over those
 * strings is fed to the distribution under that length as condition.
 *
 * @param jcas the JCas whose CAS is scanned
 * @param inputPaths paths whose first {@code '/'}-separated segment names the type to select
 * @param sentenceType the annotation type whose instances delimit the spans to process
 * @throws IOException wrapping an {@code AnalysisEngineProcessException} thrown by {@code
 *     createStringList}
 */
public void add(JCas jcas, Set<String> inputPaths, Type sentenceType) throws IOException {
  ConditionalFrequencyDistribution<Integer, String> ngramCounts =
      new ConditionalFrequencyDistribution<Integer, String>();
  CAS cas = jcas.getCas();

  for (AnnotationFS sentence : CasUtil.select(cas, sentenceType)) {
    for (String inputPath : inputPaths) {
      // Limit of 2: everything after the first '/' stays together in the second segment.
      String[] pathParts = inputPath.split("/", 2);
      Type coveredType = getInputType(cas, pathParts[0]);
      List<AnnotationFS> covered = CasUtil.selectCovered(cas, coveredType, sentence);

      List<String> strings;
      try {
        strings = createStringList(covered, pathParts);
      } catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
      }

      // One pass per n-gram length; min and max of the iterable are both set to that length.
      int n = minNgramLength;
      while (n <= maxNgramLength) {
        ngramCounts.incAll(n, new NGramStringIterable(strings, n, n));
        n++;
      }
    }
  }

  add(ngramCounts);
}
/**
 * Creates a new link annotation over the given span, sets the given feature on it and adds the
 * link to the CAS indexes before returning it.
 *
 * <p>The link type name is derived from {@code getAnnotationTypeName()}: the part before the last
 * occurrence of {@code CHAIN} (via {@code StringUtils.substringBeforeLast}) with {@code LINK}
 * appended.
 *
 * @param aJCas the JCas in whose CAS the link is created and indexed
 * @param aBegin begin offset of the new annotation
 * @param aEnd end offset of the new annotation
 * @param aFeature the feature to set on the new link
 * @param aLabelValue the value assigned to {@code aFeature}
 * @return the newly created link annotation, already added to the indexes
 */
private AnnotationFS newLink(
    JCas aJCas, int aBegin, int aEnd, AnnotationFeature aFeature, String aLabelValue) {
  String chainTypePrefix = StringUtils.substringBeforeLast(getAnnotationTypeName(), CHAIN);
  String linkTypeName = chainTypePrefix + LINK;
  Type typeOfLink = CasUtil.getType(aJCas.getCas(), linkTypeName);

  AnnotationFS link = aJCas.getCas().createAnnotation(typeOfLink, aBegin, aEnd);
  BratAjaxCasUtil.setFeature(link, aFeature, aLabelValue);
  aJCas.getCas().addFsToIndexes(link);

  return link;
}
/**
 * Builds a token sequence from the text covered by the given annotation.
 *
 * <p>Every annotation of {@code tokenType} covered by {@code annotation} is passed to {@code
 * getTokensFromAnnotation} together with the {@code useLemma} and {@code minTokenLength} settings
 * of this instance; each string it yields is appended to the result in iteration order.
 *
 * @param annotation an annotation representing a document segment, e.g. {@link Sentence}
 * @param tokenType the type used to represent tokens, usually {@link Token}, but any other type
 *     may be used
 * @return a new {@link TokenSequence} holding the extracted token strings; empty if nothing was
 *     extracted
 */
private TokenSequence generateTokenSequence(AnnotationFS annotation, Type tokenType) {
  TokenSequence sequence = new TokenSequence();

  for (AnnotationFS coveredToken : CasUtil.selectCovered(tokenType, annotation)) {
    for (String text : getTokensFromAnnotation(coveredToken, useLemma, minTokenLength)) {
      sequence.add(text);
    }
  }

  return sequence;
}
/**
 * Builds a single {@link TokenSequence} covering the whole document.
 *
 * <p>All annotations of {@code tokenType} in the CAS are visited; each one is handed to {@code
 * getTokensFromAnnotation} and every string it yields is appended to the result in iteration
 * order.
 *
 * @param aJCas a CAS holding the document
 * @param tokenType the type treated as token, e.g. Token, N-gram etc.
 * @param useLemma if {@code true}, use lemmas
 * @param minTokenLength the minimum token length to use
 * @return a new {@link TokenSequence}; empty if nothing was extracted
 * @throws FeaturePathException if the annotation type specified in {@code PARAM_TYPE_NAME} cannot
 *     be extracted
 */
protected static TokenSequence generateTokenSequence(
    JCas aJCas, Type tokenType, boolean useLemma, int minTokenLength)
    throws FeaturePathException {
  TokenSequence sequence = new TokenSequence();

  for (AnnotationFS documentToken : CasUtil.select(aJCas.getCas(), tokenType)) {
    for (String text : getTokensFromAnnotation(documentToken, useLemma, minTokenLength)) {
      sequence.add(text);
    }
  }

  return sequence;
}
/**
 * Generates one or several token sequences from the given document.
 *
 * <p>If {@code PARAM_MODEL_ENTITY_TYPE} is set (i.e. {@code modelEntityType} is not {@code null}),
 * one sequence is generated for each annotation of that type. Otherwise a single sequence is
 * generated from the whole document.
 *
 * @param aJCas the JCas holding the document
 * @return a collection with one {@link TokenSequence} for the whole document, or one per segment
 *     annotation (possibly empty if the document contains no such segment)
 * @throws FeaturePathException declared by the whole-document variant of {@code
 *     generateTokenSequence}
 */
protected Collection<TokenSequence> generateTokenSequences(JCas aJCas)
    throws FeaturePathException {
  CAS cas = aJCas.getCas();
  Type tokenType = CasUtil.getType(cas, typeName);

  if (modelEntityType == null) {
    /* no segment type configured: the whole document yields exactly one sequence */
    Collection<TokenSequence> wholeDocument = new ArrayList<>(1);
    wholeDocument.add(generateTokenSequence(aJCas, tokenType, useLemma, minTokenLength));
    return wholeDocument;
  }

  /* one sequence for every segment (e.g. sentence) */
  Collection<TokenSequence> perSegment = new ArrayList<>();
  Type segmentType = CasUtil.getType(cas, modelEntityType);
  for (AnnotationFS segment : CasUtil.select(cas, segmentType)) {
    perSegment.add(generateTokenSequence(segment, tokenType));
  }
  return perSegment;
}
/**
 * Loads document {@code 62007.txt} into a fresh CAS twice, once with {@code "1"} and once with
 * {@code "5"} as the second argument of {@code getDocumentCas}, and checks the document text, the
 * total number of feature structures and the number of {@code Weapon} annotations in each case.
 */
@Test
public void testGetDocumentCas()
    throws ResourceInitializationException, IOException, SAXException, URISyntaxException,
        ParserConfigurationException {
  String weaponTypeName = "ru.kfu.itis.issst.evex.Weapon";

  // Variant "1": expects 6 feature structures, one of them a Weapon.
  CAS firstCas =
      CasCreationUtils.createCas(
          XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
  corpusDAO.getDocumentCas(new URI("62007.txt"), "1", firstCas);
  assertThat(firstCas.getDocumentText(), containsString("РИА Новости"));
  assertEquals(6, CasUtil.selectAll(firstCas).size());
  assertEquals(
      1, CasUtil.select(firstCas, CasUtil.getAnnotationType(firstCas, weaponTypeName)).size());

  // Variant "5": same document text, but 5 feature structures and no Weapon.
  CAS secondCas =
      CasCreationUtils.createCas(
          XmiFileTreeCorpusDAO.getTypeSystem(corpusPathString), null, null, null);
  corpusDAO.getDocumentCas(new URI("62007.txt"), "5", secondCas);
  assertThat(secondCas.getDocumentText(), containsString("РИА Новости"));
  assertThat(CasUtil.selectAll(secondCas).size(), equalTo(5));
  assertEquals(
      0, CasUtil.select(secondCas, CasUtil.getAnnotationType(secondCas, weaponTypeName)).size());
}
/**
 * Returns the single annotation of the given type that is covered by another annotation.
 *
 * <p>If the number of covered annotations of {@code type} is anything other than exactly one
 * (none, or several), a warning is logged and {@code null} is returned. Note that the warning
 * text reads "Could not find matching annotation" in both cases.
 *
 * @param type the type of the annotation to look up
 * @param annotation the annotation whose span is searched; its CAS is used for the lookup
 * @return the one covered annotation of {@code type}, or {@code null} if there is not exactly one
 */
private AnnotationFS getAnnotation(Type type, AnnotationFS annotation) {
  List<AnnotationFS> covered = CasUtil.selectCovered(annotation.getCAS(), type, annotation);

  if (covered.size() == 1) {
    return covered.get(0);
  }

  getLogger()
      .warn(
          "Could not find matching annotation of type "
              + type
              + " for annotation: "
              + annotation.getCoveredText());
  return null;
}
@Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); for (AnnotationFS cover : CasUtil.select(cas, CasUtil.getAnnotationType(cas, annotationType))) { // If there is a constraint, check if it matches if (constraint != null) { JXPathContext ctx = JXPathContext.newContext(cover); boolean match = ctx.iterate(constraint).hasNext(); if (!match) { continue; } } // If the target type is a token, use it directly, otherwise select the covered tokens Collection<Token> tokens; if (cover instanceof Token) { tokens = Collections.singleton((Token) cover); } else { tokens = JCasUtil.selectCovered(aJCas, Token.class, cover); } for (Token token : tokens) { try { String semanticField = semanticFieldResource.getSemanticTag(token); SemanticField semanticFieldAnnotation = new SemanticField(aJCas, token.getBegin(), token.getEnd()); semanticFieldAnnotation.setValue(semanticField); semanticFieldAnnotation.addToIndexes(); } catch (ResourceAccessException e) { throw new AnalysisEngineProcessException(e); } } } }
/**
 * Looks up the type named by {@code annotationTypeName} in the type system of the given CAS.
 *
 * @param cas the CAS in which the type is resolved
 * @return the type returned by {@code CasUtil.getType} for {@code annotationTypeName}
 */
@Override
public Type getAnnotationType(CAS cas) {
  Type resolved = CasUtil.getType(cas, annotationTypeName);
  return resolved;
}