예제 #1
0
  @Override
  public void execute() throws ExecutionException {
    initBeforeExecute();

    AnnotationSet tokensAndDependenciesAS = inputAS;
    TreeIndex index =
        new GateAwareTreeIndex(
            tokensAndDependenciesAS.get(null, Utils.setFromArray(new String[] {"args"})));

    QueryData data =
        new QueryData(index, new GateAnnotationsNodeAttributes(tokensAndDependenciesAS));

    Iterable<QueryMatch> results = queryObject.evaluate(data);

    int queryMatchOrd = 0;

    for (QueryMatch result : results) {
      queryMatchOrd++;
      for (NodeMatch match : result.getMatchingNodes()) {
        String name = match.getQueryNode().getName();
        if (name != null) {
          Annotation matchingAnnot = tokensAndDependenciesAS.get(match.getNodeId());
          FeatureMap fm = Factory.newFeatureMap();
          fm.put("matchingNodeId", match.getNodeId());
          fm.put(
              "queryMatchId",
              String.format("%s_%03d", buildQueryStringHash(getQueryString()), queryMatchOrd));
          outputAS.add(matchingAnnot.getStartNode(), matchingAnnot.getEndNode(), name, fm);
        }
      }
    }
  }
예제 #2
0
  private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
      throws ExecutionException {
    for (Annotation annot : toTransfer) {
      Mapping m = mappings.get(annot.getType());

      String name = (m == null || m.newName == null ? annot.getType() : m.newName);

      try {
        FeatureMap params = Factory.newFeatureMap();
        params.putAll(annot.getFeatures());
        if (newID) {
          to.add(annot.getStartNode().getOffset(), annot.getEndNode().getOffset(), name, params);
        } else {
          to.add(
              annot.getId(),
              annot.getStartNode().getOffset(),
              annot.getEndNode().getOffset(),
              name,
              params);
        }
      } catch (InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
    }
  }
예제 #3
0
  /** Annotation remove event */
  public void annotationRemoved(AnnotationSetEvent ase) {
    if (!disableListener && ase.getSourceDocument() == this) {
      AnnotationSet as = (AnnotationSet) ase.getSource();
      Annotation annot = ase.getAnnotation();
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());

      boolean defaultAS = as.getName() == null;
      for (String docID : combinedDocumentIds) {
        Document aDoc = compoundDocument.getDocument(docID);

        // find out the details which refer to the deleted annotation
        OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
        if (od == null) continue;

        if (defaultAS) {
          aDoc.getAnnotations().remove(od.getOriginalAnnotation());
        } else {
          aDoc.getAnnotations(as.getName()).remove(od.getOriginalAnnotation());
        }
        removeOffsetDetails(docID, od);
        break;
      }
    }
  }
예제 #4
0
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) return null;

    Document res = documents.get(index);

    if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    // if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap parameters = Factory.newFeatureMap();
      parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
        Document lr =
            (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
        if (DEBUG) Out.prln("Loaded document :" + lr.getName());
        // change the result to the newly loaded doc
        res = lr;

        // finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex);
      }
    }

    return res;
  }
  @SuppressWarnings("unchecked")
  public static AnnotationDiffer computeDiffWithDocFeatures(
      Document document, List<String> featureNames, AnnotationSet responsesAnnotations) {
    FeatureMap doc_fm = document.getFeatures();
    // Logger log = Logger.getLogger(DocumentFeaturesDiff.class);

    int correct = 0;
    int missing = 0;
    int spurious = 0;

    for (String feature_name : featureNames) {
      // int cur_correct = 0;

      List<String> f = (List<String>) doc_fm.get(feature_name);
      if (f == null) {
        f = (List<String>) doc_fm.get(feature_name + "s");
      }

      AnnotationDiffer diff =
          computeDiffWithGoldStandardDataForSingleFeature(
              feature_name, Utils.setFromList(f), responsesAnnotations);

      spurious += diff.getSpurious();
      correct += diff.getCorrectMatches();
      missing += diff.getMissing();
    }

    return new AnnotationDifferDocumentFeaturesImpl(correct, missing, spurious);
  }
예제 #6
0
  public JSONObject persian_sentiment(String text) throws Exception {

    oncreate();

    File PersianGapp = new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp");
    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application

    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(PersianGapp);

    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.  The string parameter to newCorpus() is simply the
    // GATE-internal name to use for the corpus.  It has no particular
    // significance.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    // process the files one by one

    // load the document (using the specified encoding if one was given)

    Document doc = Factory.newDocument(text);

    // put the document in the corpus
    corpus.add(doc);

    // run the application
    application.execute();

    String featureName = "Doc_sentiment";
    FeatureMap features = doc.getFeatures();
    // remove the document from the corpus again
    corpus.clear();

    // doc.getFeatures().
    // Release the document, as it is no longer needed
    Factory.deleteResource(doc);

    LinkedHashMap originalContent = (LinkedHashMap) features.get(featureName);

    String obj = (String) originalContent.get("sentiment");
    // BigDecimal pos =(BigDecimal) originalContent.get("positive");
    // BigDecimal neg =(BigDecimal) originalContent.get("negative");
    // System.out.println(obj);
    // create Json for response to user
    JSONObject obj1 = new JSONObject();
    obj1.put("sentiment", obj);
    /*obj1.put("positive",pos);
    //obj1.put("negative",neg);
    System.out.print("----------");
    System.out.print(obj1);
    System.out.print("----------");*/
    // application.cleanup();
    return obj1;
  }
예제 #7
0
  // generate annotations for ngrams over a larger span e.g all couples inside
  // a span of 5 tokens
  // this allows to match more variants e.g. with adjectives in the middle
  // we do not generate intermediate annotations here
  // do with only bigrams for the moment
  private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);
    try {
      for (int b = 0; b < boxes.size(); b++) {
        List<String> tempAnnotationsStartingHere = new ArrayList<String>();
        Long loStart = null;
        Long hiEnd = null;

        // create a temporary list containing all the annotations
        // at position 0
        List<Annotation> headannots = boxes.get(b);
        for (Annotation newAnn : headannots) {
          // remembering positions
          loStart = newAnn.getStartNode().getOffset();
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();

          String string = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          tempAnnotationsStartingHere.add(string);

          if (this.generateIntermediateAnnotations) {
            FeatureMap fm = Factory.newFeatureMap();
            fm.put(this.outputAnnotationFeature, string);
            outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
          }
        }

        for (int z = 1; z < window && (b + z < boxes.size()); z++) {
          // generate all possible bi-grams
          List<Annotation> current = boxes.get(b + z);
          for (Annotation newAnn : current) {
            // remembering positions
            if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
            else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
              hiEnd = newAnn.getEndNode().getOffset();

            String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);

            // take what is in the buffer
            // and make a new annotation out of that
            for (String s : tempAnnotationsStartingHere) {
              String combination = s + getNgramSeparator() + newString;

              // create an annotation for the combination
              FeatureMap fm = Factory.newFeatureMap();
              fm.put(this.outputAnnotationFeature, combination);
              outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
            }
          }
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
예제 #8
0
  /**
   * Generation of a GATE document from a Behemoth one
   *
   * @param key URL of the input doc
   * @param inputDoc
   * @return
   * @throws ResourceInstantiationException
   * @throws InvalidOffsetException
   * @throws IOException
   */
  public gate.Document generateGATEDoc(BehemothDocument inputDoc)
      throws ResourceInstantiationException, InvalidOffsetException, IOException {

    gate.Document gatedocument = null;

    // if no text is available (e.g. Tika has not extracted it)
    // let GATE do the parsing itself from the binary content
    if (inputDoc.getText() == null) {
      try {
        gatedocument = generateGATEDocFromLocalDump(inputDoc);

        // transfer the text from GATE to Behemoth
        String textContent = gatedocument.getContent().toString();
        inputDoc.setText(textContent);

        return gatedocument;
      } catch (Exception e) {
        LOG.error("Can't generate GATE doc from byte dump", e);
      }
    }

    // if the input document does not have any text -> create a doc with an
    // empty text

    String text = inputDoc.getText();
    if (inputDoc.getText() == null) text = "";
    else text = inputDoc.getText();

    gatedocument = Factory.newDocument(text);

    // then the metadata as document features
    FeatureMap docFeatures = gatedocument.getFeatures();
    String docUrl = inputDoc.getUrl();
    if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
    if (inputDoc.getMetadata() != null) {
      Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
      while (iter.hasNext()) {
        Entry<Writable, Writable> entry = iter.next();
        String skey = entry.getKey().toString().trim();
        String svalue = null;
        if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
        docFeatures.put(skey, svalue);
      }
    }

    // finally the annotations as original markups
    // TODO change the name of the annotation set via config
    AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
    for (Annotation annot : inputDoc.getAnnotations()) {
      // add to outputAS as a GATE annotation
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());
      outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
    }
    return gatedocument;
  }
예제 #9
0
  private void generateNGrams(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);

    try {
      // now do the actual n-grams
      for (int b = 0; b < boxes.size(); b++) {
        List<String> tempAnnotationsStartingHere = new ArrayList<String>();
        Long loStart = null;
        Long hiEnd = null;
        for (int z = 0; z < this.ngram.intValue() && (b + z < boxes.size()); z++) {
          // do the combination and dump what we've done at every step
          // e.g generate 1 grams as well as 2-grams
          List<Annotation> current = boxes.get(b + z);
          List<String> temptemp = new ArrayList<String>();
          for (Annotation newAnn : current) {
            // remembering positions
            if (loStart == null) loStart = newAnn.getStartNode().getOffset();
            if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
            else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
              hiEnd = newAnn.getEndNode().getOffset();

            String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
            // TODO : what if there is no such value????
            if (tempAnnotationsStartingHere.size() == 0) {
              // create an annotation for the current annotation
              if (this.generateIntermediateAnnotations) {
                FeatureMap fm = Factory.newFeatureMap();
                fm.put(this.outputAnnotationFeature, newString);
                outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
              }
              // add it to the temp
              temptemp.add(newString);
            } else
              for (String existing : tempAnnotationsStartingHere) {
                String combination = existing + getNgramSeparator() + newString;
                temptemp.add(combination);

                if (this.generateIntermediateAnnotations | z == this.ngram.intValue() - 1) {
                  // create an annotation for the combination
                  FeatureMap fm = Factory.newFeatureMap();
                  fm.put(this.outputAnnotationFeature, combination);
                  outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
                }
              }
          }
          tempAnnotationsStartingHere = temptemp;
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
  public void tokenize() {
    AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
    AnnotationSet defaultAs = gateDocument.getAnnotations("");

    for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) {

      Annotation currentTokenAnnotation = it.next();
      FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures();
      FeatureMap curFeaturesMap = Factory.newFeatureMap();

      if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) {
        curFeaturesMap.put("string", tokenFeaturesMap.get("string"));
        curFeaturesMap.put("root", tokenFeaturesMap.get("lemma"));
        curFeaturesMap.put("category", tokenFeaturesMap.get("POS"));

        // Add the new Token to the Annotation Set

        defaultAs.add(
            currentTokenAnnotation.getStartNode(),
            currentTokenAnnotation.getEndNode(),
            currentTokenAnnotation.getType(),
            curFeaturesMap);
      }
    }
    gateDocument.removeAnnotationSet("Tokenization");
  }
예제 #11
0
  /** Annotation added event */
  public void annotationAdded(AnnotationSetEvent ase) {

    if (!disableListener && ase.getSourceDocument() == this) {
      AnnotationSet as = (AnnotationSet) ase.getSource();
      Annotation annot = ase.getAnnotation();
      annot.addAnnotationListener(this);

      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());

      boolean defaultAS = as.getName() == null;
      for (String docID : combinedDocumentIds) {
        Document aDoc = compoundDocument.getDocument(docID);
        long stOffset = getOffsetInSrcDocument(docID, annot.getStartNode().getOffset().longValue());
        if (stOffset == -1) continue;
        long enOffset = getOffsetInSrcDocument(docID, annot.getEndNode().getOffset().longValue());
        if (enOffset == -1) continue;
        Annotation originalAnnot = null;
        try {
          Integer id = annot.getId();
          if (defaultAS) {

            aDoc.getAnnotations()
                .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
            originalAnnot = aDoc.getAnnotations().get(id);
          } else {
            aDoc.getAnnotations(as.getName())
                .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
            originalAnnot = aDoc.getAnnotations(as.getName()).get(id);
          }
        } catch (InvalidOffsetException ioe) {
          System.out.println(aDoc.getName() + "=" + stOffset + "=" + enOffset);
          throw new GateRuntimeException(ioe);
        }

        OffsetDetails od = new OffsetDetails();
        od.setOldStartOffset(stOffset);
        od.setOldEndOffset(enOffset);
        od.setNewStartOffset(annot.getStartNode().getOffset().longValue());
        od.setNewEndOffset(annot.getEndNode().getOffset().longValue());
        od.setOriginalAnnotation(originalAnnot);
        od.setNewAnnotation(annot);
        addNewOffsetDetails(docID, od);
        break;
      }
    }
  }
예제 #12
0
  /**
   * This method is called when the HTML parser encounts the beginning of a tag that means that the
   * tag is paired by an end tag and it's not an empty one.
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    // Fire the status listener if the elements processed exceded the rate
    if (0 == (++elements % ELEMENTS_RATE))
      fireStatusChangedEvent("Processed elements : " + elements);

    // Start of STYLE tag
    if (HTML.Tag.STYLE.equals(t)) {
      isInsideStyleTag = true;
    } // if

    // Construct a feature map from the attributes list
    FeatureMap fm = Factory.newFeatureMap();

    // Take all the attributes an put them into the feature map
    if (0 != a.getAttributeCount()) {
      Enumeration<?> enumeration = a.getAttributeNames();
      while (enumeration.hasMoreElements()) {
        Object attribute = enumeration.nextElement();
        fm.put(attribute.toString(), (a.getAttribute(attribute)).toString());
      } // while
    } // if

    // Just analize the tag t and add some\n chars and spaces to the
    // tmpDocContent.The reason behind is that we need to have a readable form
    // for the final document.
    customizeAppearanceOfDocumentWithStartTag(t);

    // If until here the "tmpDocContent" ends with a NON whitespace char,
    // then we add a space char before calculating the START index of this
    // tag.
    // This is done in order not to concatenate the content of two separate tags
    // and obtain a different NEW word.
    int tmpDocContentSize = tmpDocContent.length();
    if (tmpDocContentSize != 0
        && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1)))
      tmpDocContent.append(" ");

    // create the start index of the annotation
    Long startIndex = new Long(tmpDocContent.length());

    // initialy the start index is equal with the End index
    CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex);

    // put it into the stack
    stack.push(obj);
  } // handleStartTag
예제 #13
0
  private gate.Document generateGATEDocFromLocalDump(BehemothDocument inputDoc)
      throws ResourceInstantiationException, IOException {

    // can't get that to work
    // File tempDirectory = new
    // File(this.config.get("hadoop.tmp.dir","/tmp"),this.config.get("user.name",
    // "./tmp"));
    // LOG.info("tempDirectory "+tempDirectory);
    //
    // tempDirectory.mkdirs();
    //
    // File tempInputFile = File.createTempFile("gateInput-",
    // inputDoc.getUrl(),tempDirectory);
    //
    // FileOutputStream fos = new FileOutputStream(tempInputFile);
    // OutputStream bout = new BufferedOutputStream(fos);
    // bout.write(inputDoc.getContent());
    // bout.flush();
    // bout.close();
    //
    // URL url;
    // try {
    // url = tempInputFile.toURI().toURL();
    // } catch (MalformedURLException e) {
    // // delete the input doc
    // tempInputFile.delete();
    // throw e;
    // }

    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, new String(inputDoc.getContent()));
    String ct = inputDoc.getContentType();
    if (ct != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, ct);

    gate.Document gatedocument;
    try {
      gatedocument = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
    } finally {
      // delete the input doc
      // tempInputFile.delete();
    }

    return gatedocument;
  }
예제 #14
0
 @Override
 public int hashCode() {
   final int prime = 31;
   int result = 1;
   result = prime * result + ((features == null) ? 0 : features.hashCode());
   result = prime * result + Arrays.hashCode(members);
   result = prime * result + ((type == null) ? 0 : type.hashCode());
   result = prime * result + ((userData == null) ? 0 : userData.hashCode());
   return result;
 }
예제 #15
0
  @Override
  public Object getObject() throws IOException {
    ensureGateInit();
    FeatureMap fm = Factory.newFeatureMap();
    if (sourceMap != null) {
      for (Map.Entry<Object, Object> entry : sourceMap.entrySet()) {
        Object key = entry.getKey();
        Object value = entry.getValue();

        // convert Spring resources to URLs
        if (value instanceof Resource) {
          value = SpringFactory.resourceToUrl((Resource) value);
        }

        fm.put(key, value);
      }
    }

    return fm;
  }
예제 #16
0
 /**
  * Loading the configurationg file and corpus for testing. And make settings as in the GATE Gui.
  */
 void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
     throws GateException, IOException {
   LogService.minVerbosityLevel = 0;
   if (LogService.minVerbosityLevel > 0)
     System.out.println("Learning Home : " + learningHome.getAbsolutePath());
   FeatureMap parameters = Factory.newFeatureMap();
   URL configFileURL = new File(configFileName).toURI().toURL();
   parameters.put("configFileURL", configFileURL);
   learningApi =
       (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);
   // Load the corpus
   corpus = Factory.newCorpus("DataSet");
   ExtensionFileFilter fileFilter = new ExtensionFileFilter();
   fileFilter.addExtension("xml");
   File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
   Arrays.sort(
       xmlFiles,
       new Comparator<File>() {
         public int compare(File a, File b) {
           return a.getName().compareTo(b.getName());
         }
       });
   for (File f : xmlFiles) {
     if (!f.isDirectory()) {
       Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
       doc.setName(f.getName());
       corpus.add(doc);
     }
   }
   //    URL tempURL = new File(corpusDirName).toURI().toURL();
   //    corpus.populate(tempURL, fileFilter, "UTF-8", false);
   // Set the inputAS
   learningApi.setInputASName(inputasN);
   learningApi.setOutputASName(outputasN);
   controller =
       (gate.creole.SerialAnalyserController)
           Factory.createResource("gate.creole.SerialAnalyserController");
   controller.setCorpus(corpus);
   controller.add(learningApi);
 }
 /**
  * Rename annotation
  *
  * @param outputAS output annotation set
  * @param oldType old annotation name
  * @param newType new annotation name
  */
 private void renameAnnotations(AnnotationSet outputAS, String oldType, String newType) {
   AnnotationSet tmpAnatomyAS = outputAS.get(oldType);
   for (Annotation tmpAnn : tmpAnatomyAS) {
     Long startOffset = tmpAnn.getStartNode().getOffset();
     Long endOffset = tmpAnn.getEndNode().getOffset();
     AnnotationSet existingAS = outputAS.getCovering(newType, startOffset, endOffset);
     // If we've already got an annotation of the same name in the same place, don't add a new one
     // just delete the old one
     if (existingAS.isEmpty()) {
       FeatureMap tmpFm = tmpAnn.getFeatures();
       FeatureMap fm = Factory.newFeatureMap();
       fm.putAll(tmpFm);
       try {
         outputAS.add(startOffset, endOffset, newType, fm);
         outputAS.remove(tmpAnn);
       } catch (InvalidOffsetException ie) {
         // shouldn't happen
       }
     } else {
       outputAS.remove(tmpAnn);
     }
   }
 }
예제 #18
0
  public void execute() throws ExecutionException {
    AnnotationSet outputAS = document.getAnnotations(annotationSetName);

    String text = document.getContent().toString();

    Span[] tokens = tokenizer.getTokens(text);
    try {
      for (Span token : tokens) {
        FeatureMap features = Factory.newFeatureMap();
        features.put(
            ANNIEConstants.TOKEN_STRING_FEATURE_NAME,
            text.substring(token.getStart(), token.getEnd()));

        outputAS.add(
            (long) token.getStart(),
            (long) token.getEnd(),
            ANNIEConstants.TOKEN_ANNOTATION_TYPE,
            features);
      }
    } catch (Exception e) {
      throw new ExecutionException("error running tokenizer", e);
    }
  }
예제 #19
0
  /** This method is called when the HTML parser encounts an empty tag */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    // fire the status listener if the elements processed exceded the rate
    if ((++elements % ELEMENTS_RATE) == 0)
      fireStatusChangedEvent("Processed elements : " + elements);

    // construct a feature map from the attributes list
    // these are empty elements
    FeatureMap fm = Factory.newFeatureMap();

    // take all the attributes an put them into the feature map
    if (0 != a.getAttributeCount()) {

      // Out.println("HAS  attributes = " + a.getAttributeCount ());
      Enumeration<?> enumeration = a.getAttributeNames();
      while (enumeration.hasMoreElements()) {
        Object attribute = enumeration.nextElement();
        fm.put(attribute.toString(), (a.getAttribute(attribute)).toString());
      } // while
    } // if

    // create the start index of the annotation
    Long startIndex = new Long(tmpDocContent.length());

    // initialy the start index is equal with the End index
    CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex);

    // we add the object directly into the colector
    // we don't add it to the stack because this is an empty tag
    colector.add(obj);

    // Just analize the tag t and add some\n chars and spaces to the
    // tmpDocContent.The reason behind is that we need to have a readable form
    // for the final document.
    customizeAppearanceOfDocumentWithSimpleTag(t);
  } // handleSimpleTag
  /**
   * @param inputAS input annotation set
   * @param outputAS output annotation set
   * @param term String matched
   * @param startOffset match start offset
   * @param endOffset match end offset
   */
  private void addLookup(
      AnnotationSet inputAS,
      AnnotationSet outputAS,
      String term,
      String outputASType,
      Long startOffset,
      Long endOffset,
      boolean useNounChunk) {
    if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
      AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
      if (!nounChunkAS.isEmpty()) {
        startOffset = nounChunkAS.firstNode().getOffset();
        endOffset = nounChunkAS.lastNode().getOffset();
      }
    }
    try {
      AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
      if (diseaseAS.isEmpty()) {
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("match", term);
        outputAS.add(startOffset, endOffset, outputASType, fm);
      } else {
        Annotation disease = diseaseAS.iterator().next();
        FeatureMap fm = disease.getFeatures();
        String meta = (String) fm.get("match");
        if (meta != null) {
          meta = meta + " " + term;
        }
        fm.put("match", meta);
      }

    } catch (InvalidOffsetException ie) {
      // shouldn't happen
      gate.util.Err.println(ie);
    }
  }
예제 #21
0
 @Override
 public boolean equals(Object obj) {
   if (this == obj) return true;
   if (obj == null) return false;
   if (getClass() != obj.getClass()) return false;
   SimpleRelation other = (SimpleRelation) obj;
   if (features == null) {
     if (other.features != null) return false;
   } else if (!features.equals(other.features)) return false;
   if (!Arrays.equals(members, other.members)) return false;
   if (type == null) {
     if (other.type != null) return false;
   } else if (!type.equals(other.type)) return false;
   if (userData == null) {
     if (other.userData != null) return false;
   } else if (!userData.equals(other.userData)) return false;
   return true;
 }
예제 #22
0
  /**
   * Populates this Persistence with the data that needs to be stored from the original source
   * object.
   */
  @Override
  public void extractDataFromSource(Object source) throws PersistenceException {
    if (!(source instanceof ProcessingResource)) {
      throw new UnsupportedOperationException(
          getClass().getName()
              + " can only be used for "
              + ProcessingResource.class.getName()
              + " objects!\n"
              + source.getClass().getName()
              + " is not a "
              + ProcessingResource.class.getName());
    }

    super.extractDataFromSource(source);
    Resource res = (Resource) source;

    ResourceData rData = Gate.getCreoleRegister().get(res.getClass().getName());
    if (rData == null)
      throw new PersistenceException("Could not find CREOLE data for " + res.getClass().getName());

    // now get the runtime params
    ParameterList params = rData.getParameterList();
    try {
      // get the values for the init time parameters
      runtimeParams = Factory.newFeatureMap();
      // this is a list of lists
      Iterator<List<Parameter>> parDisjIter = params.getRuntimeParameters().iterator();
      while (parDisjIter.hasNext()) {
        Iterator<Parameter> parIter = parDisjIter.next().iterator();
        while (parIter.hasNext()) {
          Parameter parameter = parIter.next();
          String parName = parameter.getName();
          Object parValue = res.getParameterValue(parName);
          ((FeatureMap) runtimeParams).put(parName, parValue);
        }
      }
      runtimeParams = PersistenceManager.getPersistentRepresentation(runtimeParams);
    } catch (ResourceInstantiationException rie) {
      throw new PersistenceException(rie);
    }
  }
예제 #23
0
 /*
  * (non-Javadoc)
  *
  * @see java.lang.Object#toString()
  */
 @Override
 public String toString() {
   StringBuilder str = new StringBuilder();
   str.append(id).append(": ");
   String typeOut =
       type.replaceAll("\\(", Matcher.quoteReplacement("\\("))
           .replaceAll("\\)", Matcher.quoteReplacement("\\)"));
   str.append(typeOut).append("(");
   for (int i = 0; i < members.length; i++) {
     if (i > 0) str.append(", ");
     str.append(members[i]);
   }
   str.append(")");
   if (features != null) {
     str.append("#").append(features.toString());
   }
   if (userData != null) {
     str.append("#").append(userData.toString());
   }
   return str.toString();
 }
예제 #24
0
  /**
   * Creates the Lookup annotations according to a gazetteer match.
   *
   * @param matchingState the final FSMState that was reached while matching.
   * @param matchedRegionStart the start of the matched text region.
   * @param matchedRegionEnd the end of the matched text region.
   * @param annotationSet the annotation set where the new annotations should be added.
   */
  protected void createLookups(
      FSMState matchingState,
      long matchedRegionStart,
      long matchedRegionEnd,
      AnnotationSet annotationSet) {
    Iterator lookupIter = matchingState.getLookupSet().iterator();
    while (lookupIter.hasNext()) {
      Lookup currentLookup = (Lookup) lookupIter.next();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      if (null != currentLookup.oClass && null != currentLookup.ontology) {
        fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
        fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
      }

      if (null != currentLookup.minorType)
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
      if (null != currentLookup.languages)
        fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
      if (null != currentLookup.features) {
        fm.putAll(currentLookup.features);
      }
      try {
        //        if(currentLookup.annotationType==null || "".equals(currentLookup.annotationType)){
        //          annotationSet.add(new Long(matchedRegionStart),
        //                          new Long(matchedRegionEnd + 1),
        //                          LOOKUP_ANNOTATION_TYPE,
        //                          fm);
        //        }else{
        annotationSet.add(
            new Long(matchedRegionStart),
            new Long(matchedRegionEnd + 1),
            currentLookup.annotationType, // this pojo attribute will have Lookup as a default tag.
            fm);
        // }
      } catch (InvalidOffsetException ioe) {
        throw new GateRuntimeException(ioe.toString());
      }
    } // while(lookupIter.hasNext())
  }
  @Test
  public void testAddFeatureStemmingEnabled() {
    Annotation mockedAnnot1 = Mockito.mock(Annotation.class);
    Annotation mockedAnnot2 = Mockito.mock(Annotation.class);
    FeatureMap mockedMap1 = Mockito.mock(FeatureMap.class);
    FeatureMap mockedMap2 = Mockito.mock(FeatureMap.class);
    Node startNode = Mockito.mock(Node.class);
    Node endNode = Mockito.mock(Node.class);

    String wholeSentence = "First Second Third Fourth.";

    Mockito.when(startNode.getOffset()).thenReturn((long) 0);
    Mockito.when(endNode.getOffset()).thenReturn((long) 11);

    Mockito.when(mockedAnnot1.getFeatures()).thenReturn(mockedMap1);
    Mockito.when(mockedMap1.get("string")).thenReturn("First");
    Mockito.when(mockedMap1.get("stem")).thenReturn("stem1");
    Mockito.when(mockedAnnot1.getStartNode()).thenReturn(startNode);

    Mockito.when(mockedAnnot2.getFeatures()).thenReturn(mockedMap2);
    Mockito.when(mockedMap2.get("string")).thenReturn("Second");
    Mockito.when(mockedMap2.get("stem")).thenReturn("stem2");
    Mockito.when(mockedAnnot2.getEndNode()).thenReturn(endNode);

    Document gateDocument = Mockito.mock(Document.class);
    Mockito.when(gateDocument.getName()).thenReturn("doc1");

    ArrayList<Annotation> featureAnnots = new ArrayList<Annotation>();
    featureAnnots.add(mockedAnnot1);
    featureAnnots.add(mockedAnnot2);

    Mockito.when(options.isEnableStemming()).thenReturn(true);

    String featureString = "First Second";
    String featureStem = "stem1 stem2";
    featureContainer.addFeature(featureAnnots, wholeSentence, gateDocument, "content");

    Assert.assertTrue(featureContainer.getFeatureDictionary().get(featureString) != null);
    Assert.assertTrue(featureContainer.getFeatureStorage().get(featureStem) != null);
  }
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", new Boolean(true));
      params.put("collectRepositioningInfo", new Boolean(true));
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set annotTypesRequired = new HashSet();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null && info != null) {
        Out.prln("OrigContent and reposInfo existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionStart = info.getOriginalPos(insertPositionStart);
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } // if - should generate
      else if (originalContent != null) {
        Out.prln("OrigContent existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = new String("StANNIE_toXML_" + count + ".HTML");
      FileWriter writer = new FileWriter(fileName);
      writer.write(xmlDocument);
      writer.close();
    } // for each doc
  } // main
  // carry out the actual annotations on the given span of text in the
  // document.
  protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) {
    String text = "";
    try {
      text = doc.getContent().getContent(from, to).toString();
    } catch (InvalidOffsetException ex) {
      throw new GateRuntimeException("Unexpected offset exception, offsets are " + from + "/" + to);
    }
    // send the text to the service and get back the response
    // System.out.println("Annotating text: "+text);
    // System.out.println("Starting offset is "+from);

    // NOTE: there is a bug in the TagMe service which causes offset errors
    // if we use the tweet mode and there are certain patterns in the tweet.
    // The approach recommended by Francesco Piccinno is to replace those
    // patterns by spaces.
    if (getIsTweet()) {
      logger.debug("Text before cleaning: >>" + text + "<<");
      // replace
      text = text.replaceAll(patternStringRT3, "    ");
      text = text.replaceAll(patternStringRT2, "   ");
      text = text.replaceAll(patternHashTag, " $1");
      // now replace the remaining patterns by spaces
      StringBuilder sb = new StringBuilder(text);
      Matcher m = patternUrl.matcher(text);
      while (m.find()) {
        int start = m.start();
        int end = m.end();
        sb.replace(start, end, nSpaces(end - start));
      }
      m = patternUser.matcher(text);
      while (m.find()) {
        int start = m.start();
        int end = m.end();
        sb.replace(start, end, nSpaces(end - start));
      }
      text = sb.toString();
      logger.debug("Text after cleaning:  >>" + text + "<<");
    }
    TagMeAnnotation[] tagmeAnnotations = getTagMeAnnotations(text);
    for (TagMeAnnotation tagmeAnn : tagmeAnnotations) {
      if (tagmeAnn.rho >= minrho) {
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("tagMeId", tagmeAnn.id);
        fm.put("title", tagmeAnn.title);
        fm.put("rho", tagmeAnn.rho);
        fm.put("spot", tagmeAnn.spot);
        fm.put("link_probability", tagmeAnn.link_probability);
        if (tagmeAnn.title == null) {
          throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn);
        } else {
          fm.put("inst", "http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title));
        }
        try {
          gate.Utils.addAnn(
              outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(), fm);
        } catch (Exception ex) {
          System.err.println(
              "Got an exception in document " + doc.getName() + ": " + ex.getLocalizedMessage());
          ex.printStackTrace(System.err);
          System.err.println("from=" + from + ", to=" + to + " TagMeAnn=" + tagmeAnn);
        }
      }
    }
  }
  @Override
  public Resource init() throws ResourceInstantiationException {
    gracefulExit = false;

    if (configFileURL == null) {
      gracefulExit = true;
      gate.util.Err.println("No configuration file provided!");
    }

    if (japeURL == null) {
      gracefulExit = true;
      gate.util.Err.println("No JAPE grammar file provided!");
    }

    // create the init params for the JAPE transducer
    FeatureMap params = Factory.newFeatureMap();
    params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, japeURL);
    // Code borrowed from Mark Greenwood's Measurements PR
    if (japeTransducer == null) {
      // if this is the first time we are running init then actually create a
      // new transducer as we don't already have one
      FeatureMap hidden = Factory.newFeatureMap();
      Gate.setHiddenAttribute(hidden, true);
      japeTransducer =
          (Transducer) Factory.createResource("gate.creole.Transducer", params, hidden);
    } else {
      // we are being run through a call to reInit so simply re-init the
      // underlying JAPE transducer
      japeTransducer.setParameterValues(params);
      japeTransducer.reInit();
    }

    ConfigReader config = new ConfigReader(configFileURL);
    gracefulExit = config.config();

    try {
      HashMap<String, String> options = config.getOptions();

      patternMap = new HashMap<String, Pattern>();
      addSuffixPattern("disease_suffix", options);
      addWordPattern("disease_abbrevs", options);
      addWordPattern("disease_sense", options);
      addWordExtraPattern("disease_sense_context", options);
      addPossessiveWordPattern("disease_named_syndrome", options);
      addWordExtraPattern("disease_generic_context", options);
      addWordExtraPattern("disease_anatomy_context", options);
      addSuffixPluralPattern("procedure_suffix", options);
      addWordPluralPattern("procedure_key", options);
      addWordExtraPattern("procedure_anatomy_context", options);
      addWordPluralPattern("symptom_key", options);
      addWordPattern("test_key", options);

      addSuffixPattern("anatomy_suffix_adjective", options);
      addSuffixPattern("anatomy_suffix", options);
      addPrefixPattern("anatomy_prefix", options);
      addWordPattern("anatomy_position", options);
      addWordPluralPattern("anatomy_space_region_junction", options);
      addWordPattern("anatomy_part_adjective", options);
      addWordPattern("anatomy_latin_noun", options);
      addWordPattern("anatomy_muscle", options);
      addWordPluralPattern("anatomy_part", options);
      addWordPluralPattern("anatomy_fluid", options);

    } catch (NullPointerException ne) {
      gracefulExit = true;
      gate.util.Err.println(
          "Missing or unset configuration options. Please check configuration file.");
    }

    return this;
  } // end init()
예제 #29
0
  /**
   * @param uri - namespace uri
   * @param localName - local, unprefixed element name
   * @param qName - fully qualified, prefixed element name
   * @param atts
   * @throws SAXException
   */
  @Override
  public void startElement(String uri, String localName, String qName, Attributes atts)
      throws SAXException {

    // call characterActions
    if (readCharacterStatus) {
      readCharacterStatus = false;
      charactersAction(new String(contentBuffer).toCharArray(), 0, contentBuffer.length());
    }

    // Inform the progress listener to fire only if no of elements processed
    // so far is a multiple of ELEMENTS_RATE
    if ((++elements % ELEMENTS_RATE) == 0) {
      fireStatusChangedEvent("Processed elements : " + elements);
    }

    Integer customObjectId = null;
    // Construct a SimpleFeatureMapImpl from the list of attributes
    FeatureMap fm = Factory.newFeatureMap();

    /**
     * Use localName rather than qName and add the namespace prefix and uri as features if global
     * flag is set
     */
    String elemName = qName;
    boolean hasNSUri = (uri != null && !uri.isEmpty());
    if (deserializeNamespaceInfo && hasNSUri) {
      elemName = localName;
      StringTokenizer strToken = new StringTokenizer(qName, ":");
      if (strToken.countTokens() > 1) {
        String nsPrefix = strToken.nextToken();
        fm.put(namespaceURIFeature, uri);
        fm.put(namespacePrefixFeature, nsPrefix);
      }
    }

    // Get the name and the value of the attributes and add them to a FeaturesMAP
    for (int i = 0; i < atts.getLength(); i++) {
      String attName = atts.getLocalName(i);
      String attValue = atts.getValue(i);
      String attUri = atts.getURI(i);
      if (attUri != null && Gate.URI.equals(attUri)) {
        if ("gateId".equals(attName)) {
          customObjectId = new Integer(attValue);
        } // End if
        if ("annotMaxId".equals(attName)) {
          customObjectsId = new Integer(attValue).intValue();
        } // End if
        if ("matches".equals(attName)) {
          StringTokenizer strTokenizer = new StringTokenizer(attValue, ";");
          List<Integer> list = new ArrayList<Integer>();
          // Take all tokens,create Integers and add them to the list
          while (strTokenizer.hasMoreTokens()) {
            String token = strTokenizer.nextToken();
            list.add(new Integer(token));
          } // End while
          fm.put(attName, list);
        } // End if
      } else {
        fm.put(atts.getQName(i), attValue);
      } // End if
    } // End for

    // create the START index of the annotation
    Long startIndex = new Long(tmpDocContent.length());

    // initialy the Start index is equal with End index
    CustomObject obj = new CustomObject(customObjectId, elemName, fm, startIndex, startIndex);

    // put this object into the stack
    stack.push(obj);
  } // startElement();