Пример #1
0
 /**
  * Duplicates this gazetteer by creating a {@link SharedDefaultGazetteer} that is bootstrapped
  * from this instance, so the copy shares the internal FSM instead of re-loading the lists.
  *
  * @param ctx the duplication context, used to duplicate this resource's features
  * @return the newly created resource, carrying the same name as this one
  * @throws ResourceInstantiationException if the resource cannot be created
  */
 @Override
 public Resource duplicate(Factory.DuplicationContext ctx) throws ResourceInstantiationException {
   String sharedGazetteerClassName = SharedDefaultGazetteer.class.getName();
   return Factory.createResource(
       sharedGazetteerClassName,
       // the only init parameter: this instance, from which the copy is bootstrapped
       Utils.featureMap(SharedDefaultGazetteer.SDEF_GAZ_BOOTSTRAP_GAZETTEER_PROPERTY_NAME, this),
       Factory.duplicate(getFeatures(), ctx),
       getName());
 }
 /**
  * Annotates the given document. When no containing annotation type is configured the whole
  * document text is annotated in one go; otherwise only the spans covered by annotations of the
  * containing type (taken from the input annotation set) are annotated.
  *
  * <p>All reads and writes go through {@code theDocument}, so the text that is annotated and the
  * annotation set that receives the results always belong to the same document.
  *
  * @param theDocument the document to annotate; must not be {@code null}
  * @throws ExecutionException if {@code theDocument} is {@code null}
  */
 protected void doExecute(Document theDocument) throws ExecutionException {
   interrupted = false;
   if (theDocument == null) {
     throw new ExecutionException("No document to process!");
   }
   AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());

   // No containing type configured: process the full document content.
   if (containingType == null || containingType.isEmpty()) {
     annotateText(theDocument, outputAS, 0, theDocument.getContent().size());
     return;
   }

   // A missing or empty input set name selects the default annotation set.
   AnnotationSet inputAS =
       (inputASName == null || inputASName.isEmpty())
           ? theDocument.getAnnotations()
           : theDocument.getAnnotations(inputASName);

   for (Annotation containingAnn : inputAS.get(containingType)) {
     annotateText(
         theDocument, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
   }
 }
Пример #3
0
 /**
  * Creates a GATE document from the given string, passing it as the {@code stringContent}
  * parameter together with mime type {@code text/xml} and encoding {@code UTF-8}.
  *
  * @param gateDocumentString the document content to load
  * @return the document resource created by the factory
  * @throws ResourceInstantiationException if the factory fails to create the resource
  */
 private static Document readDocument(String gateDocumentString)
     throws ResourceInstantiationException {
   return (Document)
       Factory.createResource(
           "gate.corpora.DocumentImpl",
           Utils.featureMap(
               "stringContent", gateDocumentString,
               "mimeType", "text/xml",
               "encoding", "UTF-8"));
 }
  /**
   * Computes an orthography value for the text under every token annotation and stores it in
   * the token's {@code orth} feature. Tokens for which no value is produced are left unchanged.
   *
   * @throws ExecutionException wrapping any exception raised while processing the tokens
   */
  @Override
  public void execute() throws ExecutionException {
    Document currentDoc = getDocument();
    AnnotationSet tokens =
        currentDoc.getAnnotations(getAnnotationSetName()).get(getTokenAnnotationTypeName());

    try {
      for (Annotation token : tokens) {
        String orthValue = getOrthographyValue(Utils.stringFor(currentDoc, token));
        if (orthValue == null) {
          // nothing to record for this token
          continue;
        }
        token.getFeatures().put("orth", orthValue);
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Пример #5
0
  /**
   * Classifies text with the configured classifier and records the best category under the
   * language id feature name (which falls back to {@code "lang"} when unset or blank).
   *
   * <p>If no annotation type is configured, the text of the whole document is classified and
   * the result is stored as a document feature. Otherwise the text under each annotation of
   * that type is classified and the result is stored as a feature of that annotation.
   *
   * @throws ExecutionException declared by the signature; a missing document is reported with
   *     a {@code GateRuntimeException} instead
   */
  public void execute() throws ExecutionException {
    // start the progress reporting
    fireProgressChanged(0);

    // without a document there is nothing to do
    if (document == null) {
      fireProcessFinished();
      throw new GateRuntimeException("No document to process!");
    }

    // fall back to the default feature name
    if (languageIdFeatureName == null || languageIdFeatureName.trim().isEmpty()) {
      languageIdFeatureName = "lang";
    }

    boolean classifyPerAnnotation = annotationType != null && annotationType.length() != 0;
    if (classifyPerAnnotation) {
      // optional behaviour: one classification per annotation of the requested type
      AnnotationSet targets = document.getAnnotations(annotationSetName).get(annotationType);
      for (Annotation target : targets) {
        Classification result = classifier.classify(Utils.stringFor(document, target));
        target.getFeatures().put(languageIdFeatureName, result.bestCategory());
      }
    } else {
      // default behaviour: one classification for the whole document
      Classification result = classifier.classify(document.getContent().toString());
      document.getFeatures().put(languageIdFeatureName, result.bestCategory());
    }

    // tell listeners that processing is complete
    fireProcessFinished();
  }
  /**
   * Carries out the actual annotation of the given span of text in the document: the text
   * between {@code from} and {@code to} is sent to the TagMe service and every returned
   * annotation whose rho is at least {@code minrho} is added to {@code outputAS}.
   *
   * <p>Offsets returned by the service are relative to the submitted text, so {@code from} is
   * added to them before the annotation is created. A failure to add a single annotation is
   * logged and does not stop processing of the remaining ones.
   *
   * @param doc the document the text is taken from
   * @param outputAS the annotation set that receives the new annotations
   * @param from start offset of the span to annotate
   * @param to end offset of the span to annotate
   * @throws GateRuntimeException if the offsets are invalid for the document, or if the service
   *     returns an annotation without a title
   */
  protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) {
    String text;
    try {
      text = doc.getContent().getContent(from, to).toString();
    } catch (InvalidOffsetException ex) {
      // keep the original exception as the cause so the failure can be traced
      throw new GateRuntimeException(
          "Unexpected offset exception, offsets are " + from + "/" + to, ex);
    }

    // NOTE: there is a bug in the TagMe service which causes offset errors
    // if we use the tweet mode and there are certain patterns in the tweet.
    // The approach recommended by Francesco Piccinno is to replace those
    // patterns by spaces.
    if (getIsTweet()) {
      logger.debug("Text before cleaning: >>" + text + "<<");
      text = text.replaceAll(patternStringRT3, "    ");
      text = text.replaceAll(patternStringRT2, "   ");
      text = text.replaceAll(patternHashTag, " $1");
      // Now blank out the remaining patterns. Both matchers run over the same snapshot of the
      // text; each match is overwritten with nSpaces(length), presumably a run of that many
      // spaces, so the match offsets stay valid for the builder.
      StringBuilder cleaned = new StringBuilder(text);
      blankOutMatches(patternUrl.matcher(text), cleaned);
      blankOutMatches(patternUser.matcher(text), cleaned);
      text = cleaned.toString();
      logger.debug("Text after cleaning:  >>" + text + "<<");
    }

    // send the text to the service and get back the response
    TagMeAnnotation[] tagmeAnnotations = getTagMeAnnotations(text);
    for (TagMeAnnotation tagmeAnn : tagmeAnnotations) {
      if (tagmeAnn.rho >= minrho) {
        // a missing title makes it impossible to build the instance URI
        if (tagmeAnn.title == null) {
          throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn);
        }
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("tagMeId", tagmeAnn.id);
        fm.put("title", tagmeAnn.title);
        fm.put("rho", tagmeAnn.rho);
        fm.put("spot", tagmeAnn.spot);
        fm.put("link_probability", tagmeAnn.link_probability);
        fm.put("inst", "http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title));
        try {
          gate.Utils.addAnn(
              outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(), fm);
        } catch (Exception ex) {
          // best effort: report the problem and carry on with the next annotation
          logger.error(
              "Got an exception in document "
                  + doc.getName()
                  + ": "
                  + ex.getLocalizedMessage()
                  + "; from="
                  + from
                  + ", to="
                  + to
                  + " TagMeAnn="
                  + tagmeAnn,
              ex);
        }
      }
    }
  }

  /** Overwrites every match of {@code matcher} in {@code target} with {@code nSpaces(length)}. */
  private void blankOutMatches(Matcher matcher, StringBuilder target) {
    while (matcher.find()) {
      int start = matcher.start();
      int end = matcher.end();
      target.replace(start, end, nSpaces(end - start));
    }
  }