/**
 * Duplicates this gazetteer by creating a {@link SharedDefaultGazetteer} that is bootstrapped
 * from this instance, so the copy shares the internal FSM rather than re-loading the lists.
 *
 * @param ctx the duplication context, passed on when duplicating this resource's features
 * @return the newly created resource, carrying a duplicate of this resource's features and the
 *     same name
 * @throws ResourceInstantiationException if the factory fails to create the resource
 */
@Override
public Resource duplicate(Factory.DuplicationContext ctx) throws ResourceInstantiationException {
  final String sharedGazetteerClass = SharedDefaultGazetteer.class.getName();
  return Factory.createResource(
      sharedGazetteerClass,
      // Init parameter: points the shared gazetteer back at this instance.
      Utils.featureMap(SharedDefaultGazetteer.SDEF_GAZ_BOOTSTRAP_GAZETTEER_PROPERTY_NAME, this),
      // Features of the new resource: a duplicate of this resource's features.
      Factory.duplicate(getFeatures(), ctx),
      getName());
}
/**
 * Annotates the given document, either as a whole or only inside the spans of the configured
 * containing annotations.
 *
 * <p>When {@code containingType} is null or empty, the whole document content is passed to
 * {@code annotateText}. Otherwise {@code annotateText} is called once for each annotation of
 * type {@code containingType} found in the input annotation set (the default set when
 * {@code inputASName} is null or empty). New annotations go to the set named by
 * {@code getOutputAnnotationSet()}.
 *
 * @param theDocument the document to process; must not be null
 * @throws ExecutionException if {@code theDocument} is null
 */
protected void doExecute(Document theDocument) throws ExecutionException {
  interrupted = false;
  if (theDocument == null) {
    throw new ExecutionException("No document to process!");
  }

  AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());

  // Use the parameter throughout, never the "document" field: the text that is read and the
  // annotation sets that are written must belong to the same document.
  if (containingType == null || containingType.isEmpty()) {
    annotateText(theDocument, outputAS, 0, theDocument.getContent().size());
    return;
  }

  AnnotationSet inputAS;
  if (inputASName == null || inputASName.isEmpty()) {
    inputAS = theDocument.getAnnotations();
  } else {
    inputAS = theDocument.getAnnotations(inputASName);
  }

  AnnotationSet containingAnns = inputAS.get(containingType);
  for (Annotation containingAnn : containingAnns) {
    annotateText(
        theDocument, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
  }
}
/**
 * Creates a {@code gate.corpora.DocumentImpl} resource from a string.
 *
 * <p>The string is supplied as the {@code stringContent} parameter, with {@code mimeType} set to
 * {@code text/xml} and {@code encoding} set to {@code UTF-8}.
 *
 * @param gateDocumentString the document content to load
 * @return the document created by the factory
 * @throws ResourceInstantiationException if the factory fails to create the document
 */
private static Document readDocument(String gateDocumentString)
    throws ResourceInstantiationException {
  final String documentClassName = "gate.corpora.DocumentImpl";
  return (Document)
      Factory.createResource(
          documentClassName,
          Utils.featureMap(
              "stringContent", gateDocumentString,
              "mimeType", "text/xml",
              "encoding", "UTF-8"));
}
/**
 * Sets an {@code orth} feature on the token annotations of the current document.
 *
 * <p>Tokens are the annotations of type {@code getTokenAnnotationTypeName()} in the set named by
 * {@code getAnnotationSetName()}. For each token, the covered text is passed to
 * {@code getOrthographyValue}; a non-null result is stored as the token's {@code orth} feature,
 * a null result leaves the token untouched.
 *
 * @throws ExecutionException wrapping any exception raised while processing the tokens
 */
@Override
public void execute() throws ExecutionException {
  Document currentDoc = getDocument();
  AnnotationSet tokens =
      currentDoc.getAnnotations(getAnnotationSetName()).get(getTokenAnnotationTypeName());
  try {
    for (Annotation token : tokens) {
      String orthography = getOrthographyValue(Utils.stringFor(currentDoc, token));
      if (orthography == null) {
        continue;
      }
      token.getFeatures().put("orth", orthography);
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
/** * Method is executed after the init() method has finished its execution. <br> * * @throws ExecutionException */ public void execute() throws ExecutionException { // lets start the progress and initialize the progress counter fireProgressChanged(0); // If no document provided to process throw an exception if (document == null) { fireProcessFinished(); throw new GateRuntimeException("No document to process!"); } // langugage ID feature Name if (languageIdFeatureName == null || languageIdFeatureName.trim().length() == 0) languageIdFeatureName = "lang"; /* Default behaviour: classify the text of the whole document and * store the result as a document feature. */ if ((annotationType == null) || (annotationType.length() == 0)) { String docText = document.getContent().toString(); Classification classification = classifier.classify(docText); document.getFeatures().put(languageIdFeatureName, classification.bestCategory()); } /* Optional behaviour: classify the text underlying each annotation * and store each results as an annotation feature. */ else { AnnotationSet annotations = document.getAnnotations(annotationSetName).get(annotationType); for (Annotation annotation : annotations) { String text = Utils.stringFor(document, annotation); Classification classification = classifier.classify(text); annotation.getFeatures().put(languageIdFeatureName, classification.bestCategory()); } } // process finished, acknowledge user about this. fireProcessFinished(); }
// carry out the actual annotations on the given span of text in the // document. protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) { String text = ""; try { text = doc.getContent().getContent(from, to).toString(); } catch (InvalidOffsetException ex) { throw new GateRuntimeException("Unexpected offset exception, offsets are " + from + "/" + to); } // send the text to the service and get back the response // System.out.println("Annotating text: "+text); // System.out.println("Starting offset is "+from); // NOTE: there is a bug in the TagMe service which causes offset errors // if we use the tweet mode and there are certain patterns in the tweet. // The approach recommended by Francesco Piccinno is to replace those // patterns by spaces. if (getIsTweet()) { logger.debug("Text before cleaning: >>" + text + "<<"); // replace text = text.replaceAll(patternStringRT3, " "); text = text.replaceAll(patternStringRT2, " "); text = text.replaceAll(patternHashTag, " $1"); // now replace the remaining patterns by spaces StringBuilder sb = new StringBuilder(text); Matcher m = patternUrl.matcher(text); while (m.find()) { int start = m.start(); int end = m.end(); sb.replace(start, end, nSpaces(end - start)); } m = patternUser.matcher(text); while (m.find()) { int start = m.start(); int end = m.end(); sb.replace(start, end, nSpaces(end - start)); } text = sb.toString(); logger.debug("Text after cleaning: >>" + text + "<<"); } TagMeAnnotation[] tagmeAnnotations = getTagMeAnnotations(text); for (TagMeAnnotation tagmeAnn : tagmeAnnotations) { if (tagmeAnn.rho >= minrho) { FeatureMap fm = Factory.newFeatureMap(); fm.put("tagMeId", tagmeAnn.id); fm.put("title", tagmeAnn.title); fm.put("rho", tagmeAnn.rho); fm.put("spot", tagmeAnn.spot); fm.put("link_probability", tagmeAnn.link_probability); if (tagmeAnn.title == null) { throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn); } else { fm.put("inst", 
"http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title)); } try { gate.Utils.addAnn( outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(), fm); } catch (Exception ex) { System.err.println( "Got an exception in document " + doc.getName() + ": " + ex.getLocalizedMessage()); ex.printStackTrace(System.err); System.err.println("from=" + from + ", to=" + to + " TagMeAnn=" + tagmeAnn); } } } }