private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
    throws ExecutionException {
  for (Annotation annot : toTransfer) {
    Mapping m = mappings.get(annot.getType());
    String name = (m == null || m.newName == null ? annot.getType() : m.newName);
    try {
      FeatureMap params = Factory.newFeatureMap();
      params.putAll(annot.getFeatures());
      if (newID) {
        to.add(annot.getStartNode().getOffset(), annot.getEndNode().getOffset(), name, params);
      } else {
        to.add(
            annot.getId(),
            annot.getStartNode().getOffset(),
            annot.getEndNode().getOffset(),
            name,
            params);
      }
    } catch (InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
  }
}
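// NOTE: the mappings field and the Mapping type used above are not part of this excerpt.
// A minimal sketch of what they are assumed to be: a map from an input annotation type to an
// entry carrying an optional replacement name. The real class may hold more fields.
private Map<String, Mapping> mappings = new HashMap<String, Mapping>();

static class Mapping {
  String oldName; // the annotation type to transfer
  String newName; // the type to use in the target set, or null to keep the original name
}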
// Generate annotations for n-grams over a larger span, e.g. all pairs inside a
// window of 5 tokens. This allows matching more variants, e.g. with adjectives
// in the middle. We do not generate n-grams of intermediate length here; only
// bigrams over the window for the moment.
private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
    throws ExecutionException {
  List<List> boxes = generateBoxes(list, outputAS);
  try {
    for (int b = 0; b < boxes.size(); b++) {
      List<String> tempAnnotationsStartingHere = new ArrayList<String>();
      Long loStart = null;
      Long hiEnd = null;
      // create a temporary list containing all the annotations at the head of
      // the window (relative position 0)
      List<Annotation> headannots = boxes.get(b);
      for (Annotation newAnn : headannots) {
        // remember positions
        loStart = newAnn.getStartNode().getOffset();
        if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
        else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
          hiEnd = newAnn.getEndNode().getOffset();
        String string = (String) newAnn.getFeatures().get(inputAnnotationFeature);
        tempAnnotationsStartingHere.add(string);
        if (this.generateIntermediateAnnotations) {
          FeatureMap fm = Factory.newFeatureMap();
          fm.put(this.outputAnnotationFeature, string);
          outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
        }
      }
      for (int z = 1; z < window && (b + z < boxes.size()); z++) {
        // generate all possible bigrams
        List<Annotation> current = boxes.get(b + z);
        for (Annotation newAnn : current) {
          // remember positions
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();
          String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          // take what is in the buffer and make a new annotation out of it
          for (String s : tempAnnotationsStartingHere) {
            String combination = s + getNgramSeparator() + newString;
            // create an annotation for the combination
            FeatureMap fm = Factory.newFeatureMap();
            fm.put(this.outputAnnotationFeature, combination);
            outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
          }
        }
      }
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
private void generateNGrams(List<Annotation> list, AnnotationSet outputAS)
    throws ExecutionException {
  List<List> boxes = generateBoxes(list, outputAS);
  try {
    // now do the actual n-grams
    for (int b = 0; b < boxes.size(); b++) {
      List<String> tempAnnotationsStartingHere = new ArrayList<String>();
      Long loStart = null;
      Long hiEnd = null;
      for (int z = 0; z < this.ngram.intValue() && (b + z < boxes.size()); z++) {
        // do the combination and dump what we've done at every step,
        // e.g. generate 1-grams as well as 2-grams
        List<Annotation> current = boxes.get(b + z);
        List<String> temptemp = new ArrayList<String>();
        for (Annotation newAnn : current) {
          // remember positions
          if (loStart == null) loStart = newAnn.getStartNode().getOffset();
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();
          String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          // TODO: what if there is no such value?
          if (tempAnnotationsStartingHere.size() == 0) {
            // create an annotation for the current annotation
            if (this.generateIntermediateAnnotations) {
              FeatureMap fm = Factory.newFeatureMap();
              fm.put(this.outputAnnotationFeature, newString);
              outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
            }
            // add it to the temporary list
            temptemp.add(newString);
          } else
            for (String existing : tempAnnotationsStartingHere) {
              String combination = existing + getNgramSeparator() + newString;
              temptemp.add(combination);
              if (this.generateIntermediateAnnotations || z == this.ngram.intValue() - 1) {
                // create an annotation for the combination
                FeatureMap fm = Factory.newFeatureMap();
                fm.put(this.outputAnnotationFeature, combination);
                outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
              }
            }
        }
        tempAnnotationsStartingHere = temptemp;
      }
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
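// NOTE: generateBoxes is used by both n-gram methods above but is not shown in this excerpt.
// A minimal sketch of what such a helper might look like, assuming it simply groups the
// offset-sorted input annotations into "boxes" of annotations sharing the same start offset,
// in document order. The real implementation may differ (e.g. it might also filter annotations
// or use the outputAS parameter).
private List<List> generateBoxes(List<Annotation> list, AnnotationSet outputAS) {
  List<List> boxes = new ArrayList<List>();
  Long currentStart = null;
  List<Annotation> currentBox = null;
  for (Annotation ann : list) {
    Long start = ann.getStartNode().getOffset();
    if (currentBox == null || !start.equals(currentStart)) {
      // a new start offset opens a new box
      currentBox = new ArrayList<Annotation>();
      boxes.add(currentBox);
      currentStart = start;
    }
    currentBox.add(ann);
  }
  return boxes;
}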
public void execute() throws ExecutionException {
  AnnotationSet outputAS = document.getAnnotations(outputASName);
  List<Annotation> tokens = new ArrayList<Annotation>(
      document.getAnnotations(inputASName).get(ANNIEConstants.TOKEN_ANNOTATION_TYPE));
  Collections.sort(tokens, new OffsetComparator());
  String[] strings = new String[tokens.size()];
  for (int i = 0; i < tokens.size(); ++i) {
    strings[i] = (String) tokens.get(i).getFeatures().get("string");
  }
  try {
    TagList tags = tagger.tag(strings);
    Iterator<Tag> it = tags.iterator();
    while (it.hasNext()) {
      Tag tag = it.next();
      outputAS.add(
          tokens.get(tag.getTokenStartIndex()).getStartNode().getOffset(),
          tokens.get(tag.getTokenEndIndex()).getEndNode().getOffset(),
          tag.getTagname(),
          Factory.newFeatureMap());
    }
  } catch (Exception ioe) {
    throw new ExecutionException("Tagger Failed", ioe);
  }
}
public void tokenize() {
  AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
  AnnotationSet defaultAs = gateDocument.getAnnotations("");
  for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) {
    Annotation currentTokenAnnotation = it.next();
    FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures();
    FeatureMap curFeaturesMap = Factory.newFeatureMap();
    if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) {
      curFeaturesMap.put("string", tokenFeaturesMap.get("string"));
      curFeaturesMap.put("root", tokenFeaturesMap.get("lemma"));
      curFeaturesMap.put("category", tokenFeaturesMap.get("POS"));
      // Add the new Token to the default annotation set
      defaultAs.add(
          currentTokenAnnotation.getStartNode(),
          currentTokenAnnotation.getEndNode(),
          currentTokenAnnotation.getType(),
          curFeaturesMap);
    }
  }
  gateDocument.removeAnnotationSet("Tokenization");
}
/**
 * Generates a GATE document from a Behemoth one.
 *
 * @param inputDoc the input Behemoth document
 * @return the corresponding GATE document
 * @throws ResourceInstantiationException
 * @throws InvalidOffsetException
 * @throws IOException
 */
public gate.Document generateGATEDoc(BehemothDocument inputDoc)
    throws ResourceInstantiationException, InvalidOffsetException, IOException {
  gate.Document gatedocument = null;
  // if no text is available (e.g. Tika has not extracted it)
  // let GATE do the parsing itself from the binary content
  if (inputDoc.getText() == null) {
    try {
      gatedocument = generateGATEDocFromLocalDump(inputDoc);
      // transfer the text from GATE to Behemoth
      String textContent = gatedocument.getContent().toString();
      inputDoc.setText(textContent);
      return gatedocument;
    } catch (Exception e) {
      LOG.error("Can't generate GATE doc from byte dump", e);
    }
  }
  // if the input document still does not have any text, create a doc with an
  // empty text
  String text = inputDoc.getText();
  if (text == null) text = "";
  gatedocument = Factory.newDocument(text);
  // then the metadata as document features
  FeatureMap docFeatures = gatedocument.getFeatures();
  String docUrl = inputDoc.getUrl();
  if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
  if (inputDoc.getMetadata() != null) {
    Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
    while (iter.hasNext()) {
      Entry<Writable, Writable> entry = iter.next();
      String skey = entry.getKey().toString().trim();
      String svalue = null;
      if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
      docFeatures.put(skey, svalue);
    }
  }
  // finally the annotations as original markups
  // TODO change the name of the annotation set via config
  AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
  for (Annotation annot : inputDoc.getAnnotations()) {
    // add to outputAS as a GATE annotation
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
  }
  return gatedocument;
}
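// NOTE: generateGATEDocFromLocalDump is referenced above but not shown in this excerpt.
// A minimal sketch of what such a helper might do, assuming BehemothDocument exposes its raw
// binary content via getContent(): dump the bytes to a temporary local file and let GATE's
// format detection parse it. The real implementation may differ.
private gate.Document generateGATEDocFromLocalDump(BehemothDocument inputDoc)
    throws ResourceInstantiationException, IOException {
  // dump the binary content into a local temp file
  File tempFile = File.createTempFile("behemoth", ".bin");
  FileOutputStream fos = new FileOutputStream(tempFile);
  try {
    fos.write(inputDoc.getContent());
  } finally {
    fos.close();
  }
  try {
    // let GATE guess the format (HTML, PDF, ...) from the local copy
    return Factory.newDocument(tempFile.toURI().toURL());
  } finally {
    // the temp copy is no longer needed once the document has been created
    tempFile.delete();
  }
}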
public void splitter() {
  AnnotationSet sDetectionAS = gateDocument.getAnnotations("SentenceDetection");
  AnnotationSet defaultAs = gateDocument.getAnnotations("");
  for (Iterator<Annotation> it = sDetectionAS.iterator(); it.hasNext(); ) {
    Annotation currentSentenceAnnotation = it.next();
    // Add the Sentence to the default annotation set
    defaultAs.add(
        currentSentenceAnnotation.getStartNode(),
        currentSentenceAnnotation.getEndNode(),
        "Sentence",
        null);
  }
  gateDocument.removeAnnotationSet("SentenceDetection");
}
/**
 * Creates the Lookup annotations according to a gazetteer match.
 *
 * @param matchingState the final FSMState that was reached while matching.
 * @param matchedRegionStart the start of the matched text region.
 * @param matchedRegionEnd the end of the matched text region.
 * @param annotationSet the annotation set where the new annotations should be added.
 */
protected void createLookups(
    FSMState matchingState,
    long matchedRegionStart,
    long matchedRegionEnd,
    AnnotationSet annotationSet) {
  Iterator lookupIter = matchingState.getLookupSet().iterator();
  while (lookupIter.hasNext()) {
    Lookup currentLookup = (Lookup) lookupIter.next();
    FeatureMap fm = Factory.newFeatureMap();
    fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
    if (null != currentLookup.oClass && null != currentLookup.ontology) {
      fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
      fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
    }
    if (null != currentLookup.minorType)
      fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
    if (null != currentLookup.languages)
      fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
    if (null != currentLookup.features) {
      fm.putAll(currentLookup.features);
    }
    try {
      // if (currentLookup.annotationType == null || "".equals(currentLookup.annotationType)) {
      //   annotationSet.add(new Long(matchedRegionStart),
      //       new Long(matchedRegionEnd + 1),
      //       LOOKUP_ANNOTATION_TYPE,
      //       fm);
      // } else {
      annotationSet.add(
          new Long(matchedRegionStart),
          new Long(matchedRegionEnd + 1),
          currentLookup.annotationType, // this attribute will have Lookup as a default value
          fm);
      // }
    } catch (InvalidOffsetException ioe) {
      throw new GateRuntimeException(ioe.toString());
    }
  } // while(lookupIter.hasNext())
}
public void execute() throws ExecutionException {
  AnnotationSet outputAS = document.getAnnotations(annotationSetName);
  String text = document.getContent().toString();
  Span[] tokens = tokenizer.getTokens(text);
  try {
    for (Span token : tokens) {
      FeatureMap features = Factory.newFeatureMap();
      features.put(
          ANNIEConstants.TOKEN_STRING_FEATURE_NAME,
          text.substring(token.getStart(), token.getEnd()));
      outputAS.add(
          (long) token.getStart(),
          (long) token.getEnd(),
          ANNIEConstants.TOKEN_ANNOTATION_TYPE,
          features);
    }
  } catch (Exception e) {
    throw new ExecutionException("error running tokenizer", e);
  }
}
/**
 * Rename annotations of one type to another.
 *
 * @param outputAS output annotation set
 * @param oldType old annotation name
 * @param newType new annotation name
 */
private void renameAnnotations(AnnotationSet outputAS, String oldType, String newType) {
  AnnotationSet tmpAnatomyAS = outputAS.get(oldType);
  for (Annotation tmpAnn : tmpAnatomyAS) {
    Long startOffset = tmpAnn.getStartNode().getOffset();
    Long endOffset = tmpAnn.getEndNode().getOffset();
    AnnotationSet existingAS = outputAS.getCovering(newType, startOffset, endOffset);
    // If we've already got an annotation of the same name in the same place,
    // don't add a new one; just delete the old one.
    if (existingAS.isEmpty()) {
      FeatureMap tmpFm = tmpAnn.getFeatures();
      FeatureMap fm = Factory.newFeatureMap();
      fm.putAll(tmpFm);
      try {
        outputAS.add(startOffset, endOffset, newType, fm);
        outputAS.remove(tmpAnn);
      } catch (InvalidOffsetException ie) {
        // shouldn't happen
      }
    } else {
      outputAS.remove(tmpAnn);
    }
  }
}
/**
 * Adds an annotation for a gazetteer match, unless an annotation of the same type already
 * covers the span, in which case the matched term is appended to its "match" feature.
 *
 * @param inputAS input annotation set
 * @param outputAS output annotation set
 * @param term String matched
 * @param outputASType annotation type to create
 * @param startOffset match start offset
 * @param endOffset match end offset
 * @param useNounChunk whether to extend the match to the covering noun chunk
 */
private void addLookup(
    AnnotationSet inputAS,
    AnnotationSet outputAS,
    String term,
    String outputASType,
    Long startOffset,
    Long endOffset,
    boolean useNounChunk) {
  if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
    AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
    if (!nounChunkAS.isEmpty()) {
      startOffset = nounChunkAS.firstNode().getOffset();
      endOffset = nounChunkAS.lastNode().getOffset();
    }
  }
  try {
    AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
    if (diseaseAS.isEmpty()) {
      FeatureMap fm = Factory.newFeatureMap();
      fm.put("match", term);
      outputAS.add(startOffset, endOffset, outputASType, fm);
    } else {
      // an annotation already exists: append the new match to its "match" feature
      Annotation disease = diseaseAS.iterator().next();
      FeatureMap fm = disease.getFeatures();
      String meta = (String) fm.get("match");
      if (meta != null) {
        meta = meta + " " + term;
      } else {
        // no existing match feature: use the matched term on its own
        meta = term;
      }
      fm.put("match", meta);
    }
  } catch (InvalidOffsetException ie) {
    // shouldn't happen
    gate.util.Err.println(ie);
  }
}
public void execute() throws ExecutionException {
  // get the sentence splitter file from the URL provided
  File splitter = Files.fileFromURL(splitterBinary);
  // get the document content and replace non-breaking spaces with spaces
  // TODO replace new-lines with spaces so we don't get a sentence per line
  String docContent = document.getContent().toString().replace((char) 160, ' ');
  try {
    // create temporary files to use with the external sentence splitter
    File tmpIn = File.createTempFile("GENIA", ".txt");
    File tmpOut = File.createTempFile("GENIA", ".txt");
    // store the document content in the input file
    FileOutputStream fos = new FileOutputStream(tmpIn);
    fos.write(docContent.getBytes("utf8"));
    fos.close();
    // set up the command line to run the sentence splitter
    String[] args =
        new String[] {
          splitter.getAbsolutePath(), tmpIn.getAbsolutePath(), tmpOut.getAbsolutePath()
        };
    // run the sentence splitter over the document
    manager.runProcess(
        args, splitter.getParentFile(), (debug ? System.out : null), (debug ? System.err : null));
    // get the annotation set we are going to store results in
    AnnotationSet annotationSet = document.getAnnotations(annotationSetName);
    // we haven't found any sentence yet so start looking for the next one
    // from the beginning of the document
    int end = 0;
    // read in the output from the sentence splitter one line at a time
    BufferedReader in = new BufferedReader(new FileReader(tmpOut));
    String sentence = in.readLine();
    while (sentence != null) {
      // trim the sentence so we don't annotate extraneous white space,
      // this isn't python code after all :)
      sentence = sentence.trim();
      // find the start of the sentence
      // TODO throw a sensible exception if the sentence can't be found?
      int start = docContent.indexOf(sentence, end);
      // work out where the sentence ends
      end = start + sentence.length();
      if (end > start) {
        // the sentence has a non-zero length so annotate it
        annotationSet.add((long) start, (long) end, "Sentence", Factory.newFeatureMap());
      }
      // get the next line of output from the splitter
      sentence = in.readLine();
    }
    // close the reader before cleaning up
    in.close();
    // delete the temp files
    if (!debug && !tmpIn.delete()) tmpIn.deleteOnExit();
    if (!debug && !tmpOut.delete()) tmpOut.deleteOnExit();
  } catch (Exception ioe) {
    throw new ExecutionException("An error occurred running the splitter", ioe);
  }
}
/**
 * This method annotates paragraphs in a GATE document. The investigated text spans between the
 * start and end offsets and the paragraph annotations are created in annotSetName. If
 * annotSetName is null then they are created in the default annotation set.
 *
 * @param aDoc is the gate document on which the paragraph detection would be performed. If it is
 *     null or its content is null then the method would simply return, doing nothing.
 * @param startOffset is the index in the document content from which the paragraph detection
 *     will start
 * @param endOffset is the offset where the detection will end.
 * @param annotSetName is the name of the set in which paragraph annotations would be created.
 *     The annotation type created will be "paragraph".
 */
public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName)
    throws DocumentFormatException {
  // Simply return if the document or its content is null
  if (aDoc == null || aDoc.getContent() == null) return;
  // Simply return if the start is > than the end
  if (startOffset > endOffset) return;
  // Decide where to put the newly detected annotations
  AnnotationSet annotSet = null;
  if (annotSetName == null) annotSet = aDoc.getAnnotations();
  else annotSet = aDoc.getAnnotations(annotSetName);
  // Extract the document content
  String content = aDoc.getContent().toString();
  // This is the offset marking the start of a para
  int startOffsetPara = startOffset;
  // This marks the end of a para
  int endOffsetPara = endOffset;
  // The initial state of the FSA
  int state = 1;
  // This field marks that a BR entity was read
  // A BR entity can be NL or NL CR, depending on the operating system (UNIX or DOS)
  boolean readBR = false;
  int index = startOffset;
  while (index < endOffset) {
    // Read the current char
    char ch = content.charAt(index);
    // Test if a BR entity was read
    if (ch == '\n') {
      readBR = true;
      // If \n is followed by a \r then advance the index in order to read a
      // full BR entity
      while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) index++;
    } // End if
    switch (state) {
      // It is the initial and also a final state
      // Stay in state 1 while reading whitespace
      case 1:
        {
          // If a non-whitespace char is read then move to state 2 and record
          // the beginning of a paragraph
          if (!Character.isWhitespace(ch)) {
            state = 2;
            startOffsetPara = index;
          } // End if
        }
        break;
      // It can also be a final state.
      case 2:
        {
          // Stay in state 2 while reading chars != BR entities
          if (readBR) {
            // If a BR char is found, go to state 3. The possible end of the para
            // can be index. This will be confirmed by state 3. So, this is why
            // the end of a para is recorded here.
            readBR = false;
            endOffsetPara = index;
            state = 3;
          } // End if
        }
        break;
      // It can also be a final state
      // From state 3 there are only 2 possible ways: (state 2 or state 1)
      // To go to state 1 it needs to read a BR
      // For state 2 it needs to read something different from a BR
      case 3:
        {
          if (readBR) {
            // A BR was read. Go to state 1
            readBR = false;
            state = 1;
            // Create an annotation of type paragraph
            try {
              annotSet.add(
                  new Long(startOffsetPara),
                  new Long(endOffsetPara),
                  "paragraph",
                  Factory.newFeatureMap());
            } catch (gate.util.InvalidOffsetException ioe) {
              throw new DocumentFormatException("Couldn't create a paragraph annotation", ioe);
            } // End try
          } else {
            // Go to state 2 and keep reading chars
            state = 2;
          } // End if
        }
        break;
    } // End switch
    // Prepare to read the next char
    index++;
  } // End while
  endOffsetPara = index;
  // Investigate where the finite automaton has stopped
  if (state == 2 || state == 3) {
    // Create an annotation of type paragraph
    try {
      annotSet.add(
          new Long(startOffsetPara),
          // Create the final annotation using the endOffset
          new Long(endOffsetPara),
          "paragraph",
          Factory.newFeatureMap());
    } catch (gate.util.InvalidOffsetException ioe) {
      throw new DocumentFormatException("Couldn't create a paragraph annotation", ioe);
    } // End try
  } // End if
} // End annotateParagraphs();
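// Hypothetical usage sketch (not part of the original class): run paragraph detection over the
// whole content of a document, writing the "paragraph" annotations into the default annotation
// set. Assumes this method lives next to annotateParagraphs above and that the GATE library has
// already been initialised.
public void annotateWholeDocument(Document aDoc) throws DocumentFormatException {
  if (aDoc == null || aDoc.getContent() == null) return;
  // DocumentContent.size() returns the content length as a Long
  int docEnd = aDoc.getContent().size().intValue();
  // a null set name means the paragraphs go into the default annotation set
  annotateParagraphs(aDoc, 0, docEnd, null);
}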