public void annotationUpdated(AnnotationEvent e) {
  if (e.getType() == AnnotationEvent.FEATURES_UPDATED) {
    if (!disableListener) {
      Annotation annot = (Annotation) e.getSource();
      // let's find out which annotation set it belongs to
      AnnotationSet as = null;
      if (getAnnotations().contains(annot)) {
        as = getAnnotations();
      } else {
        Map<String, AnnotationSet> namedSets = getNamedAnnotationSets();
        if (namedSets == null) return;
        Iterator<String> namesIter = namedSets.keySet().iterator();
        while (namesIter.hasNext()) {
          String name = namesIter.next();
          as = namedSets.get(name);
          if (as.contains(annot)) {
            break;
          } else {
            as = null;
          }
        }
      }
      if (as == null) return;
      // propagate the updated features to the original annotation in each
      // combined document
      for (String docID : combinedDocumentIds) {
        OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
        if (od == null) continue;
        Annotation toUse = od.getOriginalAnnotation();
        toUse.setFeatures(annot.getFeatures());
      }
    }
  }
}
private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
    throws ExecutionException {
  for (Annotation annot : toTransfer) {
    Mapping m = mappings.get(annot.getType());
    String name = (m == null || m.newName == null) ? annot.getType() : m.newName;
    try {
      FeatureMap params = Factory.newFeatureMap();
      params.putAll(annot.getFeatures());
      if (newID) {
        to.add(annot.getStartNode().getOffset(), annot.getEndNode().getOffset(), name, params);
      } else {
        to.add(
            annot.getId(),
            annot.getStartNode().getOffset(),
            annot.getEndNode().getOffset(),
            name,
            params);
      }
    } catch (InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
  }
}
@Override
public void execute() throws ExecutionException {
  initBeforeExecute();
  AnnotationSet tokensAndDependenciesAS = inputAS;
  TreeIndex index = new GateAwareTreeIndex(
      tokensAndDependenciesAS.get(null, Utils.setFromArray(new String[] {"args"})));
  QueryData data =
      new QueryData(index, new GateAnnotationsNodeAttributes(tokensAndDependenciesAS));
  Iterable<QueryMatch> results = queryObject.evaluate(data);
  int queryMatchOrd = 0;
  for (QueryMatch result : results) {
    queryMatchOrd++;
    for (NodeMatch match : result.getMatchingNodes()) {
      String name = match.getQueryNode().getName();
      if (name != null) {
        Annotation matchingAnnot = tokensAndDependenciesAS.get(match.getNodeId());
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("matchingNodeId", match.getNodeId());
        fm.put(
            "queryMatchId",
            String.format("%s_%03d", buildQueryStringHash(getQueryString()), queryMatchOrd));
        outputAS.add(matchingAnnot.getStartNode(), matchingAnnot.getEndNode(), name, fm);
      }
    }
  }
}
public void execute() throws ExecutionException {
  AnnotationSet outputAS = document.getAnnotations(outputASName);
  List<Annotation> tokens = new ArrayList<Annotation>(
      document.getAnnotations(inputASName).get(ANNIEConstants.TOKEN_ANNOTATION_TYPE));
  Collections.sort(tokens, new OffsetComparator());
  String[] strings = new String[tokens.size()];
  for (int i = 0; i < tokens.size(); ++i) {
    strings[i] = (String) tokens.get(i).getFeatures().get("string");
  }
  try {
    TagList tags = tagger.tag(strings);
    Iterator<Tag> it = tags.iterator();
    while (it.hasNext()) {
      Tag tag = it.next();
      outputAS.add(
          tokens.get(tag.getTokenStartIndex()).getStartNode().getOffset(),
          tokens.get(tag.getTokenEndIndex()).getEndNode().getOffset(),
          tag.getTagname(),
          Factory.newFeatureMap());
    }
  } catch (Exception e) {
    throw new ExecutionException("Tagger Failed", e);
  }
}
public void tokenize() {
  AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
  AnnotationSet defaultAs = gateDocument.getAnnotations("");
  for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) {
    Annotation currentTokenAnnotation = it.next();
    FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures();
    FeatureMap curFeaturesMap = Factory.newFeatureMap();
    if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) {
      curFeaturesMap.put("string", tokenFeaturesMap.get("string"));
      curFeaturesMap.put("root", tokenFeaturesMap.get("lemma"));
      curFeaturesMap.put("category", tokenFeaturesMap.get("POS"));
      // Add the new Token to the Annotation Set
      defaultAs.add(
          currentTokenAnnotation.getStartNode(),
          currentTokenAnnotation.getEndNode(),
          currentTokenAnnotation.getType(),
          curFeaturesMap);
    }
  }
  gateDocument.removeAnnotationSet("Tokenization");
}
/** Annotation remove event */
public void annotationRemoved(AnnotationSetEvent ase) {
  if (!disableListener && ase.getSourceDocument() == this) {
    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    boolean defaultAS = as.getName() == null;
    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);
      // find out the details which refer to the deleted annotation
      OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
      if (od == null) continue;
      if (defaultAS) {
        aDoc.getAnnotations().remove(od.getOriginalAnnotation());
      } else {
        aDoc.getAnnotations(as.getName()).remove(od.getOriginalAnnotation());
      }
      removeOffsetDetails(docID, od);
      break;
    }
  }
}
// Generate annotations for n-grams over a larger span, e.g. all pairs inside
// a span of 5 tokens. This allows matching of more variants, e.g. with
// adjectives in the middle. Intermediate annotations are only generated when
// generateIntermediateAnnotations is set; only bigrams are produced for the moment.
private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
    throws ExecutionException {
  List<List> boxes = generateBoxes(list, outputAS);
  try {
    for (int b = 0; b < boxes.size(); b++) {
      List<String> tempAnnotationsStartingHere = new ArrayList<String>();
      Long loStart = null;
      Long hiEnd = null;
      // create a temporary list containing all the annotations at position 0
      List<Annotation> headannots = boxes.get(b);
      for (Annotation newAnn : headannots) {
        // remember positions
        loStart = newAnn.getStartNode().getOffset();
        if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
        else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
          hiEnd = newAnn.getEndNode().getOffset();
        String string = (String) newAnn.getFeatures().get(inputAnnotationFeature);
        tempAnnotationsStartingHere.add(string);
        if (this.generateIntermediateAnnotations) {
          FeatureMap fm = Factory.newFeatureMap();
          fm.put(this.outputAnnotationFeature, string);
          outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
        }
      }
      for (int z = 1; z < window && (b + z < boxes.size()); z++) {
        // generate all possible bigrams
        List<Annotation> current = boxes.get(b + z);
        for (Annotation newAnn : current) {
          // remember positions
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();
          String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          // take what is in the buffer and make a new annotation out of that
          for (String s : tempAnnotationsStartingHere) {
            String combination = s + getNgramSeparator() + newString;
            // create an annotation for the combination
            FeatureMap fm = Factory.newFeatureMap();
            fm.put(this.outputAnnotationFeature, combination);
            outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
          }
        }
      }
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
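// A minimal, self-contained sketch of the window-based pairing performed
// above, using plain strings instead of GATE annotations. The class name,
// the window value and the "_" separator are illustrative assumptions, not
// part of the original code.
import java.util.ArrayList;
import java.util.List;

public class WindowBigramSketch {
  static List<String> bigramsOverWindow(List<String> tokens, int window) {
    List<String> out = new ArrayList<>();
    for (int b = 0; b < tokens.size(); b++) {
      // pair the head token with each of the next (window - 1) tokens
      for (int z = 1; z < window && b + z < tokens.size(); z++) {
        out.add(tokens.get(b) + "_" + tokens.get(b + z));
      }
    }
    return out;
  }

  public static void main(String[] args) {
    // [big, red, balloon] with window 5 -> [big_red, big_balloon, red_balloon]
    System.out.println(bigramsOverWindow(List.of("big", "red", "balloon"), 5));
  }
}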
/**
 * Generation of a GATE document from a Behemoth one.
 *
 * @param inputDoc the input Behemoth document
 * @return the corresponding GATE document
 * @throws ResourceInstantiationException
 * @throws InvalidOffsetException
 * @throws IOException
 */
public gate.Document generateGATEDoc(BehemothDocument inputDoc)
    throws ResourceInstantiationException, InvalidOffsetException, IOException {
  gate.Document gatedocument = null;
  // if no text is available (e.g. Tika has not extracted it)
  // let GATE do the parsing itself from the binary content
  if (inputDoc.getText() == null) {
    try {
      gatedocument = generateGATEDocFromLocalDump(inputDoc);
      // transfer the text from GATE to Behemoth
      String textContent = gatedocument.getContent().toString();
      inputDoc.setText(textContent);
      return gatedocument;
    } catch (Exception e) {
      LOG.error("Can't generate GATE doc from byte dump", e);
    }
  }
  // if the input document still does not have any text, create a doc with
  // an empty text
  String text = inputDoc.getText() == null ? "" : inputDoc.getText();
  gatedocument = Factory.newDocument(text);
  // then the metadata as document features
  FeatureMap docFeatures = gatedocument.getFeatures();
  String docUrl = inputDoc.getUrl();
  if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
  if (inputDoc.getMetadata() != null) {
    Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
    while (iter.hasNext()) {
      Entry<Writable, Writable> entry = iter.next();
      String skey = entry.getKey().toString().trim();
      String svalue = null;
      if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
      docFeatures.put(skey, svalue);
    }
  }
  // finally the annotations as original markups
  // TODO change the name of the annotation set via config
  AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
  for (Annotation annot : inputDoc.getAnnotations()) {
    // add to outputAS as a GATE annotation
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
  }
  return gatedocument;
}
private void generateNGrams(List<Annotation> list, AnnotationSet outputAS)
    throws ExecutionException {
  List<List> boxes = generateBoxes(list, outputAS);
  try {
    // now do the actual n-grams
    for (int b = 0; b < boxes.size(); b++) {
      List<String> tempAnnotationsStartingHere = new ArrayList<String>();
      Long loStart = null;
      Long hiEnd = null;
      for (int z = 0; z < this.ngram.intValue() && (b + z < boxes.size()); z++) {
        // do the combination and dump what we've done at every step,
        // e.g. generate 1-grams as well as 2-grams
        List<Annotation> current = boxes.get(b + z);
        List<String> temptemp = new ArrayList<String>();
        for (Annotation newAnn : current) {
          // remember positions
          if (loStart == null) loStart = newAnn.getStartNode().getOffset();
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();
          String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          // TODO: what if there is no such value?
          if (tempAnnotationsStartingHere.size() == 0) {
            // create an annotation for the current annotation
            if (this.generateIntermediateAnnotations) {
              FeatureMap fm = Factory.newFeatureMap();
              fm.put(this.outputAnnotationFeature, newString);
              outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
            }
            // add it to the temp
            temptemp.add(newString);
          } else {
            for (String existing : tempAnnotationsStartingHere) {
              String combination = existing + getNgramSeparator() + newString;
              temptemp.add(combination);
              if (this.generateIntermediateAnnotations || z == this.ngram.intValue() - 1) {
                // create an annotation for the combination
                FeatureMap fm = Factory.newFeatureMap();
                fm.put(this.outputAnnotationFeature, combination);
                outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
              }
            }
          }
        }
        tempAnnotationsStartingHere = temptemp;
      }
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
protected AnnotationDiffer calculateDocumentDiff(Document document, String annotTypeName) {
  AnnotationSet responses = responseAS.get(annotTypeName);
  if (getKeyAnnotationsAreInDocumentFeatures()) {
    return DocumentFeaturesDiff.computeDiffWithDocFeatures(document, featureNames, responses);
  }
  AnnotationSet keys = keyAS.get(annotTypeName);
  AnnotationDiffer differ = new AnnotationDiffer();
  differ.setSignificantFeaturesSet(new HashSet<String>(featureNames));
  differ.calculateDiff(keys, responses); // compare
  return differ;
}
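// Hedged follow-up sketch: how the differ returned above is typically read.
// getPrecisionStrict(), getRecallStrict() and getFMeasureStrict(beta) are
// standard gate.util.AnnotationDiffer accessors; the "Person" type and the
// printf layout are illustrative only.
AnnotationDiffer differ = calculateDocumentDiff(document, "Person");
System.out.printf(
    "P=%.3f R=%.3f F1=%.3f%n",
    differ.getPrecisionStrict(),
    differ.getRecallStrict(),
    differ.getFMeasureStrict(1.0));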
/** Annotation added event */
public void annotationAdded(AnnotationSetEvent ase) {
  if (!disableListener && ase.getSourceDocument() == this) {
    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    annot.addAnnotationListener(this);
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    boolean defaultAS = as.getName() == null;
    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);
      long stOffset =
          getOffsetInSrcDocument(docID, annot.getStartNode().getOffset().longValue());
      if (stOffset == -1) continue;
      long enOffset = getOffsetInSrcDocument(docID, annot.getEndNode().getOffset().longValue());
      if (enOffset == -1) continue;
      Annotation originalAnnot = null;
      try {
        Integer id = annot.getId();
        if (defaultAS) {
          aDoc.getAnnotations()
              .add(id, Long.valueOf(stOffset), Long.valueOf(enOffset), annot.getType(), features);
          originalAnnot = aDoc.getAnnotations().get(id);
        } else {
          aDoc.getAnnotations(as.getName())
              .add(id, Long.valueOf(stOffset), Long.valueOf(enOffset), annot.getType(), features);
          originalAnnot = aDoc.getAnnotations(as.getName()).get(id);
        }
      } catch (InvalidOffsetException ioe) {
        System.out.println(aDoc.getName() + "=" + stOffset + "=" + enOffset);
        throw new GateRuntimeException(ioe);
      }
      OffsetDetails od = new OffsetDetails();
      od.setOldStartOffset(stOffset);
      od.setOldEndOffset(enOffset);
      od.setNewStartOffset(annot.getStartNode().getOffset().longValue());
      od.setNewEndOffset(annot.getEndNode().getOffset().longValue());
      od.setOriginalAnnotation(originalAnnot);
      od.setNewAnnotation(annot);
      addNewOffsetDetails(docID, od);
      break;
    }
  }
}
public void splitter() {
  AnnotationSet sDetectionAS = gateDocument.getAnnotations("SentenceDetection");
  AnnotationSet defaultAs = gateDocument.getAnnotations("");
  for (Iterator<Annotation> it = sDetectionAS.iterator(); it.hasNext(); ) {
    Annotation currentSentenceAnnotation = it.next();
    // Add the Sentence to the Annotation Set
    defaultAs.add(
        currentSentenceAnnotation.getStartNode(),
        currentSentenceAnnotation.getEndNode(),
        "Sentence",
        null);
  }
  gateDocument.removeAnnotationSet("SentenceDetection");
}
@Override
public void execute() throws ExecutionException {
  Document doc = getDocument();
  AnnotationSet as = doc.getAnnotations(getAnnotationSetName());
  AnnotationSet toks = as.get(getTokenAnnotationTypeName());
  try {
    for (Annotation t : toks) {
      String content = Utils.stringFor(doc, t);
      String val = getOrthographyValue(content);
      if (val != null) t.getFeatures().put("orth", val);
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
/**
 * Sub-range access for annotation sets (mapping to getContained). Allows <code>
 * someAnnotationSet[15..20]</code>. This works with ranges whose end points are any numeric type,
 * so as well as using integer literals you can do <code>someAnnotationSet[ann.start()..ann.end()]
 * </code> (as start and end return Long).
 *
 * @see AnnotationSet#getContained(Long, Long)
 */
@SuppressWarnings("unchecked")
public static AnnotationSet getAt(AnnotationSet self, Range<?> range) {
  if (range.getFrom() instanceof Number) {
    return self.getContained(
        Long.valueOf(((Number) range.getFrom()).longValue()),
        Long.valueOf(((Number) range.getTo()).longValue()));
  } else if (range.getFrom() instanceof String) {
    return getAt(self, (List<String>) range);
  } else {
    throw new IllegalArgumentException("AnnotationSet.getAt expects a numeric or string range");
  }
}
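// Hedged usage sketch: from a Groovy script the Javadoc example is simply
// someAnnotationSet[15..20]; from plain Java the same call can be spelled
// out with groovy.lang.IntRange (which implements Range<Integer>). 'anns'
// is assumed to come from a loaded GATE document.
AnnotationSet contained = getAt(anns, new groovy.lang.IntRange(15, 20));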
/**
 * Two AnnotationSets are equal if their names, the documents they belong to,
 * and the annotations they contain are the same.
 */
public static boolean annotationSetsEqual(AnnotationSet as1, AnnotationSet as2) {
  if (as1 == null ^ as2 == null) return false;
  if (as1 == null) return true;
  // Set equality
  if (as1.size() != as2.size()) return false;
  try {
    if (!as1.containsAll(as2)) return false;
  } catch (ClassCastException unused) {
    return false;
  } catch (NullPointerException unused) {
    return false;
  }
  // removed to prevent infinite looping in testDocumentsEqual()
  // // verify the documents which they belong to
  // if (!check(as1.getDocument(), as2.getDocument())) return false;
  // verify the names of the AnnotationSets
  if (!check(as1.getName(), as2.getName())) return false;
  return true;
} // annotationSetsEqual
protected void doExecute(Document theDocument) throws ExecutionException {
  interrupted = false;
  if (theDocument == null) {
    throw new ExecutionException("No document to process!");
  }
  AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());
  if (containingType == null || containingType.isEmpty()) {
    annotateText(theDocument, outputAS, 0, theDocument.getContent().size());
  } else {
    AnnotationSet inputAS = null;
    if (inputASName == null || inputASName.isEmpty()) {
      inputAS = theDocument.getAnnotations();
    } else {
      inputAS = theDocument.getAnnotations(inputASName);
    }
    AnnotationSet containingAnns = inputAS.get(containingType);
    for (Annotation containingAnn : containingAnns) {
      annotateText(
          theDocument, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
    }
  }
}
public void execute() throws ExecutionException {
  AnnotationSet outputAS = document.getAnnotations(annotationSetName);
  String text = document.getContent().toString();
  Span[] tokens = tokenizer.getTokens(text);
  try {
    for (Span token : tokens) {
      FeatureMap features = Factory.newFeatureMap();
      features.put(
          ANNIEConstants.TOKEN_STRING_FEATURE_NAME,
          text.substring(token.getStart(), token.getEnd()));
      outputAS.add(
          (long) token.getStart(),
          (long) token.getEnd(),
          ANNIEConstants.TOKEN_ANNOTATION_TYPE,
          features);
    }
  } catch (Exception e) {
    throw new ExecutionException("error running tokenizer", e);
  }
}
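// Independent sketch of the Span-to-offset handling above. It assumes only
// opennlp-tools on the classpath; SimpleTokenizer stands in for whatever
// tokenizer the original code actually configures.
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

public class SpanOffsetsSketch {
  public static void main(String[] args) {
    String text = "GATE adds one Token annotation per span.";
    Span[] tokens = SimpleTokenizer.INSTANCE.tokenizePos(text);
    for (Span t : tokens) {
      // each Span carries begin/end character offsets into the original text
      System.out.println(
          t.getStart() + "-" + t.getEnd() + ": " + text.substring(t.getStart(), t.getEnd()));
    }
  }
}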
public void doit(
    gate.Document doc,
    Map<String, AnnotationSet> bindings,
    gate.AnnotationSet annotations,
    gate.AnnotationSet inputAS,
    gate.AnnotationSet outputAS,
    gate.creole.ontology.Ontology ontology)
    throws JapeException {
  // your RHS Java code will be embedded here ...
  String category = (String) doc.getFeatures().get("category");
  Annotation mention = tagAnnots.iterator().next();
  String pName = (String) mention.getFeatures().get("class");
  OntoManager om = OntoManager.getInstance();
  ProductClass catClass = om.getProductClass(category);
  ProductClass pClass = om.getProductClass(pName);
  // TODO: refactor to use a direct method
  boolean sameCat = pClass.getOntClass().hasSuperClass(catClass.getOntClass());
  if (!sameCat) {
    outputAS.remove(mention);
  }
}
/**
 * @param inputAS input annotation set
 * @param outputAS output annotation set
 * @param term String matched
 * @param outputASType annotation type to create
 * @param startOffset match start offset
 * @param endOffset match end offset
 * @param useNounChunk whether to extend the match to the covering noun chunk
 */
private void addLookup(
    AnnotationSet inputAS,
    AnnotationSet outputAS,
    String term,
    String outputASType,
    Long startOffset,
    Long endOffset,
    boolean useNounChunk) {
  if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
    AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
    if (!nounChunkAS.isEmpty()) {
      startOffset = nounChunkAS.firstNode().getOffset();
      endOffset = nounChunkAS.lastNode().getOffset();
    }
  }
  try {
    AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
    if (diseaseAS.isEmpty()) {
      FeatureMap fm = Factory.newFeatureMap();
      fm.put("match", term);
      outputAS.add(startOffset, endOffset, outputASType, fm);
    } else {
      // append the new term to the existing match feature
      Annotation disease = diseaseAS.iterator().next();
      FeatureMap fm = disease.getFeatures();
      String meta = (String) fm.get("match");
      meta = (meta != null) ? meta + " " + term : term;
      fm.put("match", meta);
    }
  } catch (InvalidOffsetException ie) {
    // shouldn't happen
    gate.util.Err.println(ie);
  }
}
/** Returns a list of annotations to be added to the Behemoth document from the GATE one */
private List<com.digitalpebble.behemoth.Annotation> convertGATEAnnotationsToBehemoth(
    AnnotationSet gateAnnotationSet, com.digitalpebble.behemoth.BehemothDocument behemoth) {
  List<com.digitalpebble.behemoth.Annotation> beheannotations =
      new ArrayList<com.digitalpebble.behemoth.Annotation>();
  AnnotationSet resultAS = gateAnnotationSet.get(filters.getTypes());
  // sort the GATE annotations
  List<gate.Annotation> annotationList = new ArrayList<gate.Annotation>(resultAS);
  Collections.sort(annotationList, new OffsetComparator());
  Iterator<gate.Annotation> inputASIter = annotationList.iterator();
  while (inputASIter.hasNext()) {
    gate.Annotation source = inputASIter.next();
    com.digitalpebble.behemoth.Annotation target = new com.digitalpebble.behemoth.Annotation();
    target.setType(source.getType());
    target.setStart(source.getStartNode().getOffset().longValue());
    target.setEnd(source.getEndNode().getOffset().longValue());
    // now do the features
    // is the type listed?
    Set<String> expectedfeatnames = filters.getFeatureFilters().get(source.getType());
    if (expectedfeatnames != null) {
      Iterator<Object> featurenames = source.getFeatures().keySet().iterator();
      while (featurenames.hasNext()) {
        // convert the feature name to a string, which will be right in 99% of cases
        String featurename = featurenames.next().toString();
        // if this feature name is not wanted just ignore it
        if (!expectedfeatnames.contains(featurename)) continue;
        // we know that we want to keep this feature;
        // let's see what the best way of representing the value would be
        // TODO later => find a better way of mapping when not a string
        Object originalvalue = source.getFeatures().get(featurename);
        if (originalvalue == null) originalvalue = "null";
        target.getFeatures().put(featurename, originalvalue.toString());
      }
    }
    beheannotations.add(target);
  }
  return beheannotations;
}
/**
 * Creates the Lookup annotations according to a gazetteer match.
 *
 * @param matchingState the final FSMState that was reached while matching.
 * @param matchedRegionStart the start of the matched text region.
 * @param matchedRegionEnd the end of the matched text region.
 * @param annotationSet the annotation set where the new annotations should be added.
 */
protected void createLookups(
    FSMState matchingState,
    long matchedRegionStart,
    long matchedRegionEnd,
    AnnotationSet annotationSet) {
  Iterator<Lookup> lookupIter = matchingState.getLookupSet().iterator();
  while (lookupIter.hasNext()) {
    Lookup currentLookup = lookupIter.next();
    FeatureMap fm = Factory.newFeatureMap();
    fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
    if (null != currentLookup.oClass && null != currentLookup.ontology) {
      fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
      fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
    }
    if (null != currentLookup.minorType)
      fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
    if (null != currentLookup.languages)
      fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
    if (null != currentLookup.features) {
      fm.putAll(currentLookup.features);
    }
    try {
      // if(currentLookup.annotationType==null || "".equals(currentLookup.annotationType)){
      //   annotationSet.add(new Long(matchedRegionStart), new Long(matchedRegionEnd + 1),
      //       LOOKUP_ANNOTATION_TYPE, fm);
      // } else {
      annotationSet.add(
          Long.valueOf(matchedRegionStart),
          Long.valueOf(matchedRegionEnd + 1),
          // this attribute defaults to Lookup when not set
          currentLookup.annotationType,
          fm);
      // }
    } catch (InvalidOffsetException ioe) {
      throw new GateRuntimeException(ioe.toString());
    }
  } // while(lookupIter.hasNext())
}
/**
 * Rename annotations.
 *
 * @param outputAS output annotation set
 * @param oldType old annotation name
 * @param newType new annotation name
 */
private void renameAnnotations(AnnotationSet outputAS, String oldType, String newType) {
  AnnotationSet tmpAnatomyAS = outputAS.get(oldType);
  for (Annotation tmpAnn : tmpAnatomyAS) {
    Long startOffset = tmpAnn.getStartNode().getOffset();
    Long endOffset = tmpAnn.getEndNode().getOffset();
    AnnotationSet existingAS = outputAS.getCovering(newType, startOffset, endOffset);
    // If we've already got an annotation of the new type covering the same
    // span, don't add a new one: just delete the old one.
    if (existingAS.isEmpty()) {
      FeatureMap tmpFm = tmpAnn.getFeatures();
      FeatureMap fm = Factory.newFeatureMap();
      fm.putAll(tmpFm);
      try {
        outputAS.add(startOffset, endOffset, newType, fm);
        outputAS.remove(tmpAnn);
      } catch (InvalidOffsetException ie) {
        // shouldn't happen
      }
    } else {
      outputAS.remove(tmpAnn);
    }
  }
}
public void execute() throws ExecutionException {
  // get all the annotations we need from the input AS
  AnnotationSet inputAS =
      inputAnnotationSet == null || inputAnnotationSet.trim().length() == 0
          ? document.getAnnotations()
          : document.getAnnotations(inputAnnotationSet);
  AnnotationSet outputAS =
      outputAnnotationSet == null || outputAnnotationSet.trim().length() == 0
          ? document.getAnnotations()
          : document.getAnnotations(outputAnnotationSet);
  // no spans?
  if (getSpanAnnotationType() == null || getSpanAnnotationType().equals("")) {
    AnnotationSet inputs = inputAS.get(inputAnnotationType);
    List<Annotation> list = new ArrayList<Annotation>();
    list.addAll(inputs);
    Collections.sort(list, new OffsetComparator());
    // use window or normal
    if (window == -1) generateNGrams(list, outputAS);
    else generateNGramsOverWindow(list, outputAS);
  } else {
    // use the spans
    AnnotationSet spans = inputAS.get(getSpanAnnotationType());
    Iterator<Annotation> spaniter = spans.iterator();
    while (spaniter.hasNext()) {
      Annotation span = spaniter.next();
      AnnotationSet inputs =
          inputAS.get(
              inputAnnotationType,
              span.getStartNode().getOffset(),
              span.getEndNode().getOffset());
      List<Annotation> list = new ArrayList<Annotation>();
      list.addAll(inputs);
      Collections.sort(list, new OffsetComparator());
      if (window == -1) generateNGrams(list, outputAS);
      else generateNGramsOverWindow(list, outputAS);
    }
  }
}
public void execute() throws ExecutionException {
  // get the sentence splitter file from the URL provided
  File splitter = Files.fileFromURL(splitterBinary);
  // get the document content and replace non-breaking spaces with spaces
  // TODO replace new-lines with spaces so we don't get a sentence per line
  String docContent = document.getContent().toString().replace((char) 160, ' ');
  try {
    // create temporary files to use with the external sentence splitter
    File tmpIn = File.createTempFile("GENIA", ".txt");
    File tmpOut = File.createTempFile("GENIA", ".txt");
    // store the document content in the input file
    FileOutputStream fos = new FileOutputStream(tmpIn);
    fos.write(docContent.getBytes("UTF-8"));
    fos.close();
    // set up the command line to run the sentence splitter
    String[] args =
        new String[] {
          splitter.getAbsolutePath(), tmpIn.getAbsolutePath(), tmpOut.getAbsolutePath()
        };
    // run the sentence splitter over the document
    manager.runProcess(
        args, splitter.getParentFile(), (debug ? System.out : null), (debug ? System.err : null));
    // get the annotation set we are going to store results in
    AnnotationSet annotationSet = document.getAnnotations(annotationSetName);
    // we haven't found any sentence yet, so start looking for the next one
    // from the beginning of the document
    int end = 0;
    // read in the output from the sentence splitter one line at a time
    BufferedReader in = new BufferedReader(new FileReader(tmpOut));
    String sentence = in.readLine();
    while (sentence != null) {
      // trim the sentence so we don't annotate extraneous white space;
      // this isn't Python code after all :)
      sentence = sentence.trim();
      // find the start of the sentence
      // TODO throw a sensible exception if the sentence can't be found?
      int start = docContent.indexOf(sentence, end);
      // work out where the sentence ends
      end = start + sentence.length();
      if (end > start) {
        // the sentence has a length, so annotate it
        annotationSet.add((long) start, (long) end, "Sentence", Factory.newFeatureMap());
      }
      // get the next line from the output of the splitter
      sentence = in.readLine();
    }
    // delete the temp files
    if (!debug && !tmpIn.delete()) tmpIn.deleteOnExit();
    if (!debug && !tmpOut.delete()) tmpOut.deleteOnExit();
  } catch (Exception ioe) {
    throw new ExecutionException("An error occurred running the splitter", ioe);
  }
}
/**
 * The main entry point. First we parse the command line options (see usage() method for details),
 * then we take all remaining command line parameters to be file names to process. Each file is
 * loaded, processed using the application and the results written to the output file
 * (inputFile.out.xml).
 */
public static void main(String[] args) throws Exception {
  parseCommandLine(args);
  // initialise GATE - this must be done before calling any GATE APIs
  Gate.init();
  // load the saved application
  CorpusController application =
      (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);
  // Create a Corpus to use. We recycle the same Corpus object for each iteration.
  ArrayList<String> files = getFilesFromDir(inputDir);
  gate.Corpus corpus = createCorpus(files);
  application.setCorpus(corpus);
  System.out.println("Processing " + files.size() + " files");
  // process the files one by one
  for (int i = 0; i < files.size(); i++) {
    // load the document (using the specified encoding if one was given)
    File docFile = new File(files.get(i));
    System.out.print("Processing document " + docFile + " (" + i + ") ...");
    Document doc = Factory.newDocument(docFile.toURI().toURL(), encoding);
    // put the document in the corpus
    corpus.add(doc);
    // run the application
    application.execute();
    // remove the document from the corpus again
    corpus.clear();
    String docXMLString = null;
    // if we want to just write out specific annotation types, we must
    // extract the annotations into a Set
    if (annotTypesToWrite != null) {
      // Create a temporary Set to hold the annotations we wish to write out
      Set<Annotation> annotationsToWrite = new HashSet<Annotation>();
      // in this example we only extract annotations from the "Output" annotation set
      AnnotationSet outputAnnots = doc.getAnnotations("Output");
      Iterator annotTypesIt = annotTypesToWrite.iterator();
      while (annotTypesIt.hasNext()) {
        // extract all the annotations of each requested type and add them to
        // the temporary set
        AnnotationSet annotsOfThisType = outputAnnots.get((String) annotTypesIt.next());
        if (annotsOfThisType != null) {
          annotationsToWrite.addAll(annotsOfThisType);
        }
      }
      // create the XML string using these annotations
      docXMLString = doc.toXml(annotationsToWrite, true);
    }
    // otherwise, just write out the whole document as GateXML
    else {
      docXMLString = doc.toXml();
    }
    // Release the document, as it is no longer needed
    Factory.deleteResource(doc);
    // output the XML to <inputFile>.out.xml
    System.out.println("Writing file " + docFile.getName());
    String outputFileName = docFile.getName() + ".out.xml";
    File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName);
    // Write output files using the same encoding as the original
    FileOutputStream fos = new FileOutputStream(outputFile);
    BufferedOutputStream bos = new BufferedOutputStream(fos);
    OutputStreamWriter out;
    if (encoding == null) {
      out = new OutputStreamWriter(bos);
    } else {
      out = new OutputStreamWriter(bos, encoding);
    }
    out.write(docXMLString);
    out.close();
    System.out.println("done");
  } // for each file
  System.out.println("All done");
} // void main(String[] args)
@Override
public void execute() throws ExecutionException {
  interrupted = false;
  // quit if setup failed
  if (gracefulExit) {
    gracefulExit("Plugin was not initialised correctly. Exiting gracefully ... ");
    return;
  }
  AnnotationSet inputAS =
      (inputASName == null || inputASName.trim().length() == 0)
          ? document.getAnnotations()
          : document.getAnnotations(inputASName);
  AnnotationSet outputAS =
      (outputASName == null || outputASName.trim().length() == 0)
          ? document.getAnnotations()
          : document.getAnnotations(outputASName);
  AnnotationSet sentenceAS = null;
  if (sentenceType != null && !sentenceType.isEmpty()) {
    sentenceAS = inputAS.get(sentenceType);
  }
  // Document content
  String docContent = document.getContent().toString();
  int docLen = docContent.length();
  // For matching purposes replace all whitespace characters with a single space
  docContent = docContent.replaceAll("[\\s\\xA0\\u2007\\u202F]", " ");
  fireStatusChanged("Locating anatomy, disease and procedure mentions in " + document.getName());
  fireProgressChanged(0);
  if (sentenceAS != null) {
    for (Annotation sentence : sentenceAS) {
      Long sentStartOffset = sentence.getStartNode().getOffset();
      Long sentEndOffset = sentence.getEndNode().getOffset();
      // Converting the sentence to lower case removes the need for
      // case-insensitive regex matching, which should give a small performance boost
      String sentenceContent =
          docContent
              .substring(sentStartOffset.intValue(), sentEndOffset.intValue())
              .toLowerCase(Locale.ENGLISH);
      if (diseaseType != null && !diseaseType.isEmpty()) {
        doMatch(patternMap.get("disease_suffix"), sentenceContent, inputAS, outputAS, "suffDisease", sentStartOffset, docLen);
        doMatch(patternMap.get("disease_abbrevs"), sentenceContent, inputAS, outputAS, "preDisease", sentStartOffset, docLen);
        doMatch(patternMap.get("disease_named_syndrome"), sentenceContent, inputAS, outputAS, "namedDisease", sentStartOffset, docLen);
        doMatch(patternMap.get("disease_sense"), sentenceContent, inputAS, outputAS, "tmpDiseaseSense", sentStartOffset, docLen);
        doMatch(patternMap.get("disease_sense_context"), sentenceContent, inputAS, outputAS, "tmpDiseaseSenseContext", sentStartOffset, docLen);
        doMatch(patternMap.get("disease_generic_context"), sentenceContent, inputAS, outputAS, "poDisease", sentStartOffset, docLen);
        doMatch(patternMap.get("disease_anatomy_context"), sentenceContent, inputAS, outputAS, "tmpDisease", sentStartOffset, docLen);
      }
      if (procedureType != null && !procedureType.isEmpty()) {
        doMatch(patternMap.get("procedure_suffix"), sentenceContent, inputAS, outputAS, "poProcedure", sentStartOffset, docLen);
        doMatch(patternMap.get("procedure_key"), sentenceContent, inputAS, outputAS, "poProcedure", sentStartOffset, docLen);
        doMatch(patternMap.get("procedure_anatomy_context"), sentenceContent, inputAS, outputAS, "tmpProcedure", sentStartOffset, docLen);
      }
      if (symptomType != null && !symptomType.isEmpty()) {
        doMatch(patternMap.get("symptom_key"), sentenceContent, inputAS, outputAS, "poSymptom", sentStartOffset, docLen);
      }
      if (testType != null && !testType.isEmpty()) {
        doMatch(patternMap.get("test_key"), sentenceContent, inputAS, outputAS, "poTest", sentStartOffset, docLen);
      }
      if (anatomyType != null && !anatomyType.isEmpty()) {
        doMatch(patternMap.get("anatomy_suffix_adjective"), sentenceContent, inputAS, outputAS, "tmpAnatSuffAdj", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_suffix"), sentenceContent, inputAS, outputAS, "tmpAnatSuff", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_prefix"), sentenceContent, inputAS, outputAS, "tmpAnatPre", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_position"), sentenceContent, inputAS, outputAS, "tmpAnatPos", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_space_region_junction"), sentenceContent, inputAS, outputAS, "tmpAnatSpace", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_part_adjective"), sentenceContent, inputAS, outputAS, "tmpAnatAdj", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_latin_noun"), sentenceContent, inputAS, outputAS, "tmpAnatLatin", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_muscle"), sentenceContent, inputAS, outputAS, "tmpAnatMuscle", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_part"), sentenceContent, inputAS, outputAS, "tmpAnatPart", sentStartOffset, docLen);
        doMatch(patternMap.get("anatomy_fluid"), sentenceContent, inputAS, outputAS, "tmpAnatFluid", sentStartOffset, docLen);
      }
    }
    // Run JAPE transducer to clean up the output
    fireStatusChanged(
        "Processing anatomical, disease and procedure mentions in " + document.getName());
    try {
      japeTransducer.setDocument(document);
      japeTransducer.setInputASName(inputASName);
      japeTransducer.setOutputASName(outputASName);
      japeTransducer.addProgressListener(this);
      japeTransducer.execute();
    } catch (ExecutionException re) {
      gate.util.Err.println("Unable to run " + japeURL);
      gracefulExit = true;
    } finally {
      japeTransducer.setDocument(null);
    }
    // rename temporary annotations
    if (!debug) {
      renameAnnotations(outputAS, "tmpAnatomicalTerm", anatomyType);
      renameAnnotations(outputAS, "suffDisease", diseaseType);
      renameAnnotations(outputAS, "poDisease", diseaseType);
      renameAnnotations(outputAS, "preDisease", diseaseType);
      renameAnnotations(outputAS, "poProcedure", procedureType);
      renameAnnotations(outputAS, "poSymptom", symptomType);
      renameAnnotations(outputAS, "poTest", testType);
    }
  } else {
    gracefulExit("No sentences to process!");
  }
  // want list of disease key words plus symptoms such as oedema? or just diseases
  fireProcessFinished();
} // end execute()
/**
 * Run from the command-line, with a list of URLs as argument.
 *
 * <p><B>NOTE:</B><br>
 * This code will run with all the documents in memory - if you want to unload each from memory
 * after use, add code to store the corpus in a DataStore.
 */
public static void main(String args[]) throws GateException, IOException {
  // initialise the GATE library
  Out.prln("Initialising GATE...");
  Gate.init();
  Out.prln("...GATE initialised");
  // initialise ANNIE (this may take several minutes)
  StandAloneAnnie annie = new StandAloneAnnie();
  annie.initAnnie();
  // create a GATE corpus and add a document for each command-line argument
  Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
  for (int i = 0; i < args.length; i++) {
    URL u = new URL(args[i]);
    FeatureMap params = Factory.newFeatureMap();
    params.put("sourceUrl", u);
    params.put("preserveOriginalContent", Boolean.TRUE);
    params.put("collectRepositioningInfo", Boolean.TRUE);
    Out.prln("Creating doc for " + u);
    Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
    corpus.add(doc);
  } // for each of args
  // tell the pipeline about the corpus and run it
  annie.setCorpus(corpus);
  annie.execute();
  // for each document, get an XML document with the person and location names added
  Iterator iter = corpus.iterator();
  int count = 0;
  String startTagPart_1 = "<span GateID=\"";
  String startTagPart_2 = "\" title=\"";
  String startTagPart_3 = "\" style=\"background:Red;\">";
  String endTag = "</span>";
  while (iter.hasNext()) {
    Document doc = (Document) iter.next();
    AnnotationSet defaultAnnotSet = doc.getAnnotations();
    Set annotTypesRequired = new HashSet();
    annotTypesRequired.add("Person");
    annotTypesRequired.add("Location");
    Set<Annotation> peopleAndPlaces =
        new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));
    FeatureMap features = doc.getFeatures();
    String originalContent =
        (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    RepositioningInfo info =
        (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
    ++count;
    File file = new File("StANNIE_" + count + ".HTML");
    Out.prln("File name: '" + file.getAbsolutePath() + "'");
    if (originalContent != null && info != null) {
      Out.prln("OrigContent and reposInfo existing. Generate file...");
      Iterator it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
      while (it.hasNext()) {
        currAnnot = (Annotation) it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while
      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;
      // insert annotation tags backwards
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionStart = info.getOriginalPos(insertPositionStart);
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for
      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } // if - should generate
    else if (originalContent != null) {
      Out.prln("OrigContent existing. Generate file...");
      Iterator it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
      while (it.hasNext()) {
        currAnnot = (Annotation) it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while
      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;
      // insert annotation tags backwards
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for
      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } else {
      Out.prln("Content : " + originalContent);
      Out.prln("Repositioning: " + info);
    }
    String xmlDocument = doc.toXml(peopleAndPlaces, false);
    String fileName = "StANNIE_toXML_" + count + ".HTML";
    FileWriter writer = new FileWriter(fileName);
    writer.write(xmlDocument);
    writer.close();
  } // for each doc
} // main
@Override
public void execute() throws ExecutionException {
  AnnotationSet inputAS = document.getAnnotations(inputASName);
  AnnotationSet outputAS = document.getAnnotations(outputASName);
  AnnotationSet tagAS = document.getAnnotations(tagASName);
  AnnotationSet annotsToTransfer = null;
  boolean newID = copyAnnotations && inputAS.equals(outputAS);
  mappings.clear();
  // TODO clean this up so we don't have to repeat ourselves
  if (configURL != null) {
    BufferedReader in = null;
    try {
      in = new BomStrippingInputStreamReader(configURL.openStream());
      String line = in.readLine();
      while (line != null) {
        if (!line.trim().equals("")) {
          String[] data = line.split("=", 2);
          String oldName = data[0].trim();
          String newName = data.length == 2 ? data[1].trim() : null;
          mappings.put(oldName, new Mapping(oldName, newName));
        }
        line = in.readLine();
      }
    } catch (IOException ioe) {
      ioe.printStackTrace();
    } finally {
      IOUtils.closeQuietly(in);
    }
  } else if (annotationTypes != null) {
    for (String type : annotationTypes) {
      String[] data = type.split("=", 2);
      String oldName = data[0].trim();
      String newName = data.length == 2 ? data[1].trim() : null;
      mappings.put(oldName, new Mapping(oldName, newName));
    }
  }
  // else
  //   throw new ExecutionException("The annotation list and URL cannot both be null");
  if (mappings.size() > 0) {
    annotsToTransfer = inputAS.get(mappings.keySet());
  } else {
    // transfer everything
    annotsToTransfer = inputAS.get();
  }
  // nothing to do if none of the requested annotation types are present
  if (annotsToTransfer == null || annotsToTransfer.size() == 0) return;
  // check if we have a BODY annotation; if not, just copy all
  if (textTagName == null || textTagName.equals("")) {
    // remove from the input set unless we only copy
    if (!copyAnnotations) inputAS.removeAll(annotsToTransfer);
    transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);
    return;
  }
  // get the BODY annotation
  bodyAnnotations = tagAS.get(textTagName);
  if (bodyAnnotations == null || bodyAnnotations.isEmpty()) {
    if (transferAllUnlessFound) {
      // remove from the input set unless we only copy
      if (!copyAnnotations) inputAS.removeAll(annotsToTransfer);
      transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);
    }
    return;
  }
  List<Annotation> annots2Move = new ArrayList<Annotation>();
  Iterator<Annotation> bodyIter = bodyAnnotations.iterator();
  while (bodyIter.hasNext()) {
    Annotation bodyAnn = bodyIter.next();
    Long start = bodyAnn.getStartNode().getOffset();
    Long end = bodyAnn.getEndNode().getOffset();
    // get all annotations we want transferred
    AnnotationSet annots2Copy = annotsToTransfer.getContained(start, end);
    // copy them to the new set and delete them from the old one
    annots2Move.addAll(annots2Copy);
  }
  if (!copyAnnotations) inputAS.removeAll(annots2Move);
  transferAnnotations(annots2Move, outputAS, newID);
}
/**
 * This method annotates paragraphs in a GATE document. The investigated text spans between the
 * start and end offsets, and the paragraph annotations are created in the annotSetName annotation
 * set. If annotSetName is null then they are created in the default annotation set.
 *
 * @param aDoc is the GATE document on which the paragraph detection is performed. If it or its
 *     content is null then the method simply returns without doing anything.
 * @param startOffset is the index in the document content from which the paragraph detection
 *     will start
 * @param endOffset is the offset where the detection will end.
 * @param annotSetName is the name of the set in which paragraph annotations will be created. The
 *     annotation type created will be "paragraph".
 */
public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName)
    throws DocumentFormatException {
  // Simply return if the document or its content is null
  if (aDoc == null || aDoc.getContent() == null) return;
  // Simply return if the start is > than the end
  if (startOffset > endOffset) return;
  // Decide where to put the newly detected annotations
  AnnotationSet annotSet = null;
  if (annotSetName == null) annotSet = aDoc.getAnnotations();
  else annotSet = aDoc.getAnnotations(annotSetName);
  // Extract the document content
  String content = aDoc.getContent().toString();
  // This is the offset marking the start of a para
  int startOffsetPara = startOffset;
  // This marks the end of a para
  int endOffsetPara = endOffset;
  // The initial state of the FSA
  int state = 1;
  // This field marks that a BR entity was read.
  // A BR entity can be NL or NL CR, depending on the operating system (UNIX or DOS)
  boolean readBR = false;
  int index = startOffset;
  while (index < endOffset) {
    // Read the current char
    char ch = content.charAt(index);
    // Test if a BR entity was read
    if (ch == '\n') {
      readBR = true;
      // If \n is followed by a \r then advance the index in order to read a BR entity
      while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) index++;
    } // End if
    switch (state) {
      // It is the initial and also a final state.
      // Stay in state 1 while reading whitespace
      case 1:
        {
          // If a non-whitespace char is read then move to state 2 and record
          // the beginning of a paragraph
          if (!Character.isWhitespace(ch)) {
            state = 2;
            startOffsetPara = index;
          } // End if
        }
        break;
      // It can also be a final state.
      case 2:
        {
          // Stay in state 2 while reading chars != BR entities
          if (readBR) {
            // If a BR char is found go to state 3. The possible end of the para
            // can be index. This will be confirmed by state 3, which is why
            // the end of a para is recorded here.
            readBR = false;
            endOffsetPara = index;
            state = 3;
          } // End if
        }
        break;
      // It can also be a final state.
      // From state 3 there are only 2 possible ways: state 2 or state 1.
      // To reach state 1 it needs to read a BR;
      // for state 2 it needs to read something different from a BR.
      case 3:
        {
          if (readBR) {
            // A BR was read. Go to state 1
            readBR = false;
            state = 1;
            // Create an annotation of type paragraph
            try {
              annotSet.add(
                  Long.valueOf(startOffsetPara),
                  Long.valueOf(endOffsetPara),
                  "paragraph",
                  Factory.newFeatureMap());
            } catch (gate.util.InvalidOffsetException ioe) {
              throw new DocumentFormatException("Couldn't create a paragraph annotation", ioe);
            } // End try
          } else {
            // Go to state 2 and keep reading chars
            state = 2;
          } // End if
        }
        break;
    } // End switch
    // Prepare to read the next char
    index++;
  } // End while
  endOffsetPara = index;
  // Investigate where the finite automaton has stopped
  if (state == 2 || state == 3) {
    // Create an annotation of type paragraph
    try {
      annotSet.add(
          Long.valueOf(startOffsetPara),
          // Create the final annotation using the endOffset
          Long.valueOf(endOffsetPara),
          "paragraph",
          Factory.newFeatureMap());
    } catch (gate.util.InvalidOffsetException ioe) {
      throw new DocumentFormatException("Couldn't create a paragraph annotation", ioe);
    } // End try
  } // End if
} // End annotateParagraphs();
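// Hedged usage sketch for the paragraph FSA above: 'format' is assumed to be
// an instance of the class declaring annotateParagraphs, and GATE is assumed
// to be initialised. Two blank-line-separated paragraphs should yield two
// "paragraph" annotations in the default set (annotSetName == null).
Document d = Factory.newDocument("First paragraph.\n\nSecond paragraph.\n");
format.annotateParagraphs(d, 0, d.getContent().size().intValue(), null);
System.out.println(d.getAnnotations().get("paragraph").size()); // expected: 2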
private RetObj ProcessRecords() throws Exception {
  // Create a Corpus to use. We recycle the same Corpus object for each iteration.
  Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
  this.application.setCorpus(corpus);
  // objects for returned data
  List<String> processedlines = new ArrayList<String>();
  List<String> processedText = new ArrayList<String>();
  for (int record_num = 0; record_num < this.recs.size(); ++record_num) {
    // first, split title from body and get embedded age in title
    String title_age = "-1";
    String sep = "..THIS IS MY SEPARATION STRING..";
    String title = "";
    String body = this.recs.get(record_num);
    boolean trimmed = false;
    int age_end = body.indexOf(",> ");
    if (age_end >= 0 && age_end < body.length()) {
      int age_start = body.lastIndexOf("-", age_end);
      if (age_start >= 0 && age_start < age_end) {
        title_age = body.substring(age_start + 1, age_end).trim();
        if (!isInteger(title_age)) title_age = "-1";
        else {
          title = body.substring(0, age_start);
          body = body.substring(age_end + 2, body.length());
          body = title + sep + body;
          trimmed = true;
        }
      }
      if (!trimmed) {
        title = body.substring(0, age_end);
        body = body.substring(age_end + 2, body.length());
        body = title + sep + body;
        trimmed = true;
      }
    }
    // --------------------
    org.jsoup.nodes.Document htmldoc =
        Jsoup.parseBodyFragment(body.replaceAll("COMMA_GOES_HERE", ","));
    Elements links = htmldoc.select("a[href]");
    Elements media = htmldoc.select("[src]");
    Elements imports = htmldoc.select("link[href]");
    processedText.add(htmldoc.text().replace(sep, " "));
    Document doc = Factory.newDocument(htmldoc.text());
    // put the document in the corpus
    corpus.add(doc);
    // run the application
    this.application.execute();
    // remove the document from the corpus again
    corpus.clear();
    // extract annotations
    String line = "";
    AnnotationSet Annots = doc.getAnnotations("");
    Integer FirstPersonCount = 0, ThirdPersonCount = 0;
    AnnotationSet FirstPerson = Annots.get("FirstPerson");
    if (FirstPerson != null) FirstPersonCount = FirstPerson.size();
    AnnotationSet ThirdPerson = Annots.get("ThirdPerson");
    if (ThirdPerson != null) ThirdPersonCount = ThirdPerson.size();
    line += FirstPersonCount.toString() + "," + ThirdPersonCount.toString() + ",";
    AnnotationSet Names = Annots.get("Name");
    if (Names == null || Names.size() < 1) line += ",";
    else {
      Iterator<Annotation> Iter = Names.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("name");
        if (Feat != null) line += Feat.toString();
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet Age = Annots.get("Age");
    if (Age == null || Age.size() < 1) line += title_age + ",";
    else {
      Iterator<Annotation> Iter = Age.inDocumentOrder().iterator();
      line += title_age + ";";
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("age");
        if (Feat != null) line += Feat.toString();
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet Cost = Annots.get("Cost");
    if (Cost == null || Cost.size() < 1) line += ",";
    else {
      Iterator<Annotation> Iter = Cost.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("value");
        if (Feat != null) line += Feat.toString();
        else line += "none";
        line += "/";
        Feat = Ann.getFeatures().get("target_value");
        if (Feat != null) line += Feat.toString();
        else line += "none";
        line += "/";
        Feat = Ann.getFeatures().get("target_type");
        if (Feat != null) line += Feat.toString();
        else line += "none";
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet height = Annots.get("height");
    if (height == null || height.size() < 1) line += ",,";
    else {
      String ft = "";
      String inch = "";
      Iterator<Annotation> Iter = height.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("feet");
        if (Feat != null) ft += Feat.toString();
        else ft += "none";
        Feat = Ann.getFeatures().get("inches");
        if (Feat != null) inch += Feat.toString();
        else inch += "none";
        if (Iter.hasNext()) {
          ft += ";";
          inch += ";";
        }
      }
      line += ft + "," + inch + ",";
    }
    AnnotationSet weight = Annots.get("weight");
    if (weight == null || weight.size() < 1) line += ",";
    else {
      Iterator<Annotation> Iter = weight.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("pounds");
        if (Feat != null) line += Feat.toString();
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet measurement = Annots.get("measurement");
    if (measurement == null || measurement.size() < 1) line += ",,,,";
    else {
      String cup = "";
      String chest = "";
      String waist = "";
      String hip = "";
      Iterator<Annotation> Iter = measurement.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("cup");
        if (Feat != null) cup += Feat.toString();
        else cup += "none";
        Feat = Ann.getFeatures().get("chest");
        if (Feat != null) chest += Feat.toString();
        else chest += "none";
        Feat = Ann.getFeatures().get("waist");
        if (Feat != null) waist += Feat.toString();
        else waist += "none";
        Feat = Ann.getFeatures().get("hip");
        if (Feat != null) hip += Feat.toString();
        else hip += "none";
        if (Iter.hasNext()) {
          cup += ";";
          chest += ";";
          waist += ";";
          hip += ";";
        }
      }
      line += cup + "," + chest + "," + waist + "," + hip + ",";
    }
    AnnotationSet Ethnicity = Annots.get("Ethnicity");
    if (Ethnicity == null || Ethnicity.size() < 1) line += ",";
    else {
      Iterator<Annotation> Iter = Ethnicity.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("ethnicity");
        if (Feat != null)
          line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet SkinColor = Annots.get("SkinColor");
    if (SkinColor == null || SkinColor.size() < 1) line += ",";
    else {
      Iterator<Annotation> Iter = SkinColor.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("color");
        if (Feat != null)
          line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet EyeColor = Annots.get("EyeColor");
    if (EyeColor == null || EyeColor.size() < 1) line += ",";
    else {
      Iterator<Annotation> Iter = EyeColor.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("color");
        if (Feat != null)
          line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet HairColor = Annots.get("HairColor");
    if (HairColor == null || HairColor.size() < 1) line += ",";
    else {
      Iterator<Annotation> Iter = HairColor.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("color");
        if (Feat != null)
          line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
        if (Iter.hasNext()) line += ";";
      }
      line += ",";
    }
    AnnotationSet Restriction = Annots.get("Restriction");
    if (Restriction == null || Restriction.size() < 1) line += ",,,";
    else {
      String type = "";
      String ethnicity = "";
      String age = "";
      Iterator<Annotation> Iter = Restriction.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("type");
        if (Feat != null) type += Feat.toString();
        else type += "none";
        Feat = Ann.getFeatures().get("ethnicity");
        if (Feat != null)
          ethnicity += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
        else ethnicity += "none";
        Feat = Ann.getFeatures().get("age");
        if (Feat != null) age += Feat.toString();
        else age += "none";
        if (Iter.hasNext()) {
          type += ";";
          ethnicity += ";";
          age += ";";
        }
      }
      line += type + "," + ethnicity + "," + age + ",";
    }
    AnnotationSet Phone = Annots.get("PhoneNumber");
    if (Phone == null || Phone.size() < 1) line += ",,,";
    else {
      String value = "";
      String state = "";
      String city = "";
      Iterator<Annotation> Iter = Phone.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("value");
        if (Feat != null) value += Feat.toString();
        else value += "none";
        Feat = Ann.getFeatures().get("state");
        if (Feat != null) state += Feat.toString();
        else state += "none";
        Feat = Ann.getFeatures().get("area");
        if (Feat != null)
          city += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
        else city += "none";
        if (Iter.hasNext()) {
          value += ";";
          state += ";";
          city += ";";
        }
      }
      line += value + "," + state + "," + city + ",";
    }
    String Emails = "";
    AnnotationSet Email = Annots.get("Email");
    if (Email == null || Email.size() < 1) Emails = "";
    else {
      Iterator<Annotation> Iter = Email.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("email");
        if (Feat != null)
          Emails += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
      }
    }
    if (links != null) {
      for (Element l : links) {
        String href = l.attr("abs:href");
        if (href == null) continue;
        if (href.length() > 7 && href.substring(0, 7).toLowerCase().equals("mailto:")) {
          Emails +=
              href.substring(7, href.length()).replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
    }
    if (Emails.length() > 0 && Emails.substring(Emails.length() - 1, Emails.length()).equals(";"))
      Emails = Emails.substring(0, Emails.length() - 1);
    line += Emails + ",";
    String Urls = "";
    AnnotationSet Url = Annots.get("Url");
    if (Url == null || Url.size() < 1) Urls = "";
    else {
      Iterator<Annotation> Iter = Url.inDocumentOrder().iterator();
      while (Iter.hasNext()) {
        Annotation Ann = Iter.next();
        Object Feat = Ann.getFeatures().get("url");
        if (Feat != null)
          Urls += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
      }
    }
    if (links != null) {
      for (Element l : links) {
        String href = l.attr("abs:href");
        if (href == null) continue;
        if (href.length() <= 7 || !href.substring(0, 7).toLowerCase().equals("mailto:")) {
          Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
    }
    if (imports != null) {
      for (Element l : imports) {
        String href = l.attr("abs:href");
        if (href == null) continue;
        Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
      }
    }
    if (Urls.length() > 0 && Urls.substring(Urls.length() - 1, Urls.length()).equals(";"))
      Urls = Urls.substring(0, Urls.length() - 1);
    line += Urls + ",";
    String Medias = "";
    if (media != null) {
      for (Element l : media) {
        String src = l.attr("abs:src");
        if (src == null) continue;
        Medias += src.replaceAll(",", " ").replaceAll(";", " ") + ";";
      }
    }
    if (Medias.length() > 0
        && Medias.substring(Medias.length() - 1, Medias.length()).equals(";"))
      Medias = Medias.substring(0, Medias.length() - 1);
    line += Medias;
    processedlines.add(line);
    // Release the document, as it is no longer needed
    Factory.deleteResource(doc);
  }
  Factory.deleteResource(corpus);
  RetObj out = new RetObj(processedlines, processedText);
  return out;
}