Пример #1
0
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) return null;

    Document res = documents.get(index);

    if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    // if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap parameters = Factory.newFeatureMap();
      parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
        Document lr =
            (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
        if (DEBUG) Out.prln("Loaded document :" + lr.getName());
        // change the result to the newly loaded doc
        res = lr;

        // finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex);
      }
    }

    return res;
  }
 @SuppressWarnings("unchecked")
 public Set<String> processDoc(String str) throws Exception {
   Set<String> toReturn = new HashSet<String>();
   Corpus c = null;
   Document aDoc = null;
   try {
     c = Factory.newCorpus("sample");
     aDoc = Factory.newDocument(str);
     c.add(aDoc);
     controller.setCorpus(c);
     controller.execute();
     AnnotationSet aSet = aDoc.getAnnotations("StockSymbols");
     for (Annotation annot : aSet) {
       String symbol = (String) annot.getFeatures().get("sym");
       toReturn.add(symbol);
     }
   } catch (Exception e) {
     throw e;
   } finally {
     if (aDoc != null) {
       Factory.deleteResource(aDoc);
     }
     if (c != null) {
       Factory.deleteResource(c);
     }
   }
   return toReturn;
 }
Пример #3
0
  /** Test the default tokeniser */
  public void testHashGazetteer() throws Exception {
    // get a document
    Document doc =
        Factory.newDocument(new URL(TestDocument.getTestServerName() + "tests/doc0.html"));

    System.out.println(doc.getFeatures().get("gate.SourceURL"));

    // create a default gazetteer
    FeatureMap params = Factory.newFeatureMap();
    HashGazetteer gaz =
        (HashGazetteer) Factory.createResource("com.ontotext.gate.gazetteer.HashGazetteer", params);

    // runtime stuff
    gaz.setDocument(doc);
    gaz.setAnnotationSetName(GAZ_AS);
    gaz.execute();

    assertTrue(
        "the Annotation set resulting of the execution of the OntoText "
            + "Natural Gazetteer is empty.",
        !doc.getAnnotations(GAZ_AS).isEmpty());

    // check whether the annotations are as expected
    assertEquals("wrong number of lookup annotations found", 76, doc.getAnnotations(GAZ_AS).size());
  } // testHashGazetteer();
Пример #4
0
 /**
  * Use a {@link SharedDefaultGazetteer} to duplicate this gazetteer by sharing the internal FSM
  * rather than re-loading the lists.
  */
 @Override
 public Resource duplicate(Factory.DuplicationContext ctx) throws ResourceInstantiationException {
   return Factory.createResource(
       SharedDefaultGazetteer.class.getName(),
       Utils.featureMap(SharedDefaultGazetteer.SDEF_GAZ_BOOTSTRAP_GAZETTEER_PROPERTY_NAME, this),
       Factory.duplicate(this.getFeatures(), ctx),
       this.getName());
 }
Пример #5
0
 /** Clear up the resources used after one test. */
 private void clearOneTest() {
   corpus.clear();
   Factory.deleteResource(corpus);
   Factory.deleteResource(learningApi);
   controller.remove(learningApi);
   controller.cleanup();
   Factory.deleteResource(controller);
 }
Пример #6
0
  public JSONObject persian_sentiment(String text) throws Exception {

    oncreate();

    File PersianGapp = new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp");
    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application

    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(PersianGapp);

    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.  The string parameter to newCorpus() is simply the
    // GATE-internal name to use for the corpus.  It has no particular
    // significance.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    // process the files one by one

    // load the document (using the specified encoding if one was given)

    Document doc = Factory.newDocument(text);

    // put the document in the corpus
    corpus.add(doc);

    // run the application
    application.execute();

    String featureName = "Doc_sentiment";
    FeatureMap features = doc.getFeatures();
    // remove the document from the corpus again
    corpus.clear();

    // doc.getFeatures().
    // Release the document, as it is no longer needed
    Factory.deleteResource(doc);

    LinkedHashMap originalContent = (LinkedHashMap) features.get(featureName);

    String obj = (String) originalContent.get("sentiment");
    // BigDecimal pos =(BigDecimal) originalContent.get("positive");
    // BigDecimal neg =(BigDecimal) originalContent.get("negative");
    // System.out.println(obj);
    // create Json for response to user
    JSONObject obj1 = new JSONObject();
    obj1.put("sentiment", obj);
    /*obj1.put("positive",pos);
    //obj1.put("negative",neg);
    System.out.print("----------");
    System.out.print(obj1);
    System.out.print("----------");*/
    // application.cleanup();
    return obj1;
  }
Пример #7
0
  // generate annotations for ngrams over a larger span e.g all couples inside
  // a span of 5 tokens
  // this allows to match more variants e.g. with adjectives in the middle
  // we do not generate intermediate annotations here
  // do with only bigrams for the moment
  private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);
    try {
      for (int b = 0; b < boxes.size(); b++) {
        List<String> tempAnnotationsStartingHere = new ArrayList<String>();
        Long loStart = null;
        Long hiEnd = null;

        // create a temporary list containing all the annotations
        // at position 0
        List<Annotation> headannots = boxes.get(b);
        for (Annotation newAnn : headannots) {
          // remembering positions
          loStart = newAnn.getStartNode().getOffset();
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();

          String string = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          tempAnnotationsStartingHere.add(string);

          if (this.generateIntermediateAnnotations) {
            FeatureMap fm = Factory.newFeatureMap();
            fm.put(this.outputAnnotationFeature, string);
            outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
          }
        }

        for (int z = 1; z < window && (b + z < boxes.size()); z++) {
          // generate all possible bi-grams
          List<Annotation> current = boxes.get(b + z);
          for (Annotation newAnn : current) {
            // remembering positions
            if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
            else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
              hiEnd = newAnn.getEndNode().getOffset();

            String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);

            // take what is in the buffer
            // and make a new annotation out of that
            for (String s : tempAnnotationsStartingHere) {
              String combination = s + getNgramSeparator() + newString;

              // create an annotation for the combination
              FeatureMap fm = Factory.newFeatureMap();
              fm.put(this.outputAnnotationFeature, combination);
              outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
            }
          }
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Пример #8
0
  /**
   * Generation of a GATE document from a Behemoth one
   *
   * @param key URL of the input doc
   * @param inputDoc
   * @return
   * @throws ResourceInstantiationException
   * @throws InvalidOffsetException
   * @throws IOException
   */
  public gate.Document generateGATEDoc(BehemothDocument inputDoc)
      throws ResourceInstantiationException, InvalidOffsetException, IOException {

    gate.Document gatedocument = null;

    // if no text is available (e.g. Tika has not extracted it)
    // let GATE do the parsing itself from the binary content
    if (inputDoc.getText() == null) {
      try {
        gatedocument = generateGATEDocFromLocalDump(inputDoc);

        // transfer the text from GATE to Behemoth
        String textContent = gatedocument.getContent().toString();
        inputDoc.setText(textContent);

        return gatedocument;
      } catch (Exception e) {
        LOG.error("Can't generate GATE doc from byte dump", e);
      }
    }

    // if the input document does not have any text -> create a doc with an
    // empty text

    String text = inputDoc.getText();
    if (inputDoc.getText() == null) text = "";
    else text = inputDoc.getText();

    gatedocument = Factory.newDocument(text);

    // then the metadata as document features
    FeatureMap docFeatures = gatedocument.getFeatures();
    String docUrl = inputDoc.getUrl();
    if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
    if (inputDoc.getMetadata() != null) {
      Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
      while (iter.hasNext()) {
        Entry<Writable, Writable> entry = iter.next();
        String skey = entry.getKey().toString().trim();
        String svalue = null;
        if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
        docFeatures.put(skey, svalue);
      }
    }

    // finally the annotations as original markups
    // TODO change the name of the annotation set via config
    AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
    for (Annotation annot : inputDoc.getAnnotations()) {
      // add to outputAS as a GATE annotation
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());
      outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
    }
    return gatedocument;
  }
Пример #9
0
  private void generateNGrams(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);

    try {
      // now do the actual n-grams
      for (int b = 0; b < boxes.size(); b++) {
        List<String> tempAnnotationsStartingHere = new ArrayList<String>();
        Long loStart = null;
        Long hiEnd = null;
        for (int z = 0; z < this.ngram.intValue() && (b + z < boxes.size()); z++) {
          // do the combination and dump what we've done at every step
          // e.g generate 1 grams as well as 2-grams
          List<Annotation> current = boxes.get(b + z);
          List<String> temptemp = new ArrayList<String>();
          for (Annotation newAnn : current) {
            // remembering positions
            if (loStart == null) loStart = newAnn.getStartNode().getOffset();
            if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
            else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
              hiEnd = newAnn.getEndNode().getOffset();

            String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
            // TODO : what if there is no such value????
            if (tempAnnotationsStartingHere.size() == 0) {
              // create an annotation for the current annotation
              if (this.generateIntermediateAnnotations) {
                FeatureMap fm = Factory.newFeatureMap();
                fm.put(this.outputAnnotationFeature, newString);
                outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
              }
              // add it to the temp
              temptemp.add(newString);
            } else
              for (String existing : tempAnnotationsStartingHere) {
                String combination = existing + getNgramSeparator() + newString;
                temptemp.add(combination);

                if (this.generateIntermediateAnnotations | z == this.ngram.intValue() - 1) {
                  // create an annotation for the combination
                  FeatureMap fm = Factory.newFeatureMap();
                  fm.put(this.outputAnnotationFeature, combination);
                  outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
                }
              }
          }
          tempAnnotationsStartingHere = temptemp;
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Пример #10
0
 /**
  * Call the given closure passing this resource as a parameter, and ensuring that the resource is
  * deleted when the closure returns. This would typically be used in this kind of construction:
  *
  * <pre>
  * Factory.newDocument(someUrl).withResource {
  *   // do something with the document (it)
  * }
  * </pre>
  *
  * @param self
  * @param closure
  * @return the value returned from the closure
  */
 public static <T> T withResource(Resource self, Closure<T> closure) {
   try {
     return closure.call(self);
   } finally {
     Factory.deleteResource(self);
   }
 }
Пример #11
0
  private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
      throws ExecutionException {
    for (Annotation annot : toTransfer) {
      Mapping m = mappings.get(annot.getType());

      String name = (m == null || m.newName == null ? annot.getType() : m.newName);

      try {
        FeatureMap params = Factory.newFeatureMap();
        params.putAll(annot.getFeatures());
        if (newID) {
          to.add(annot.getStartNode().getOffset(), annot.getEndNode().getOffset(), name, params);
        } else {
          to.add(
              annot.getId(),
              annot.getStartNode().getOffset(),
              annot.getEndNode().getOffset(),
              name,
              params);
        }
      } catch (InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
    }
  }
Пример #12
0
  /** Annotation remove event */
  public void annotationRemoved(AnnotationSetEvent ase) {
    if (!disableListener && ase.getSourceDocument() == this) {
      AnnotationSet as = (AnnotationSet) ase.getSource();
      Annotation annot = ase.getAnnotation();
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());

      boolean defaultAS = as.getName() == null;
      for (String docID : combinedDocumentIds) {
        Document aDoc = compoundDocument.getDocument(docID);

        // find out the details which refer to the deleted annotation
        OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
        if (od == null) continue;

        if (defaultAS) {
          aDoc.getAnnotations().remove(od.getOriginalAnnotation());
        } else {
          aDoc.getAnnotations(as.getName()).remove(od.getOriginalAnnotation());
        }
        removeOffsetDetails(docID, od);
        break;
      }
    }
  }
Пример #13
0
  private static gate.Corpus createCorpus(ArrayList<String> files) throws GateException {

    gate.Corpus corpus = Factory.newCorpus("Transient Gate Corpus");
    for (String file : files) {
      System.out.print("\t " + file);
      try {
        corpus.add(Factory.newDocument(new File(file).toURL()));
        System.out.println(" -- success");
      } catch (gate.creole.ResourceInstantiationException e) {
        System.out.println(" -- failed (" + e.getMessage() + ")");
      } catch (Exception e) {
        System.out.println(" -- " + e.getMessage());
      }
    }
    return corpus;
  }
  public void tokenize() {
    AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
    AnnotationSet defaultAs = gateDocument.getAnnotations("");

    for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) {

      Annotation currentTokenAnnotation = it.next();
      FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures();
      FeatureMap curFeaturesMap = Factory.newFeatureMap();

      if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) {
        curFeaturesMap.put("string", tokenFeaturesMap.get("string"));
        curFeaturesMap.put("root", tokenFeaturesMap.get("lemma"));
        curFeaturesMap.put("category", tokenFeaturesMap.get("POS"));

        // Add the new Token to the Annotation Set

        defaultAs.add(
            currentTokenAnnotation.getStartNode(),
            currentTokenAnnotation.getEndNode(),
            currentTokenAnnotation.getType(),
            curFeaturesMap);
      }
    }
    gateDocument.removeAnnotationSet("Tokenization");
  }
Пример #15
0
 /**
  * You should never create instances of this class directly, you should create new relations via
  * the appropriate methods of {@link RelationSet}. This method is only publicly available to
  * support persistence.
  */
 public SimpleRelation(int id, String type, int[] members) {
   super();
   this.id = id;
   this.type = type;
   this.members = members;
   features = Factory.newFeatureMap();
 }
Пример #16
0
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

      return gatedocument.toXml();

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    return null;
  }
Пример #17
0
  @Override
  public void execute() throws ExecutionException {
    initBeforeExecute();

    AnnotationSet tokensAndDependenciesAS = inputAS;
    TreeIndex index =
        new GateAwareTreeIndex(
            tokensAndDependenciesAS.get(null, Utils.setFromArray(new String[] {"args"})));

    QueryData data =
        new QueryData(index, new GateAnnotationsNodeAttributes(tokensAndDependenciesAS));

    Iterable<QueryMatch> results = queryObject.evaluate(data);

    int queryMatchOrd = 0;

    for (QueryMatch result : results) {
      queryMatchOrd++;
      for (NodeMatch match : result.getMatchingNodes()) {
        String name = match.getQueryNode().getName();
        if (name != null) {
          Annotation matchingAnnot = tokensAndDependenciesAS.get(match.getNodeId());
          FeatureMap fm = Factory.newFeatureMap();
          fm.put("matchingNodeId", match.getNodeId());
          fm.put(
              "queryMatchId",
              String.format("%s_%03d", buildQueryStringHash(getQueryString()), queryMatchOrd));
          outputAS.add(matchingAnnot.getStartNode(), matchingAnnot.getEndNode(), name, fm);
        }
      }
    }
  }
Пример #18
0
  public void execute() throws ExecutionException {

    AnnotationSet outputAS = document.getAnnotations(outputASName);

    List<Annotation> tokens =
        new ArrayList<Annotation>(
            document.getAnnotations(inputASName).get(ANNIEConstants.TOKEN_ANNOTATION_TYPE));
    Collections.sort(tokens, new OffsetComparator());

    String[] strings = new String[tokens.size()];

    for (int i = 0; i < tokens.size(); ++i) {
      strings[i] = (String) tokens.get(i).getFeatures().get("string");
    }

    try {
      TagList tags = tagger.tag(strings);

      Iterator<Tag> it = tags.iterator();
      while (it.hasNext()) {

        Tag tag = it.next();

        outputAS.add(
            tokens.get(tag.getTokenStartIndex()).getStartNode().getOffset(),
            tokens.get(tag.getTokenEndIndex()).getEndNode().getOffset(),
            tag.getTagname(),
            Factory.newFeatureMap());
      }
    } catch (Exception ioe) {
      throw new ExecutionException("Tagger Failed", ioe);
    }
  }
Пример #19
0
 @Override
 public void datastoreClosed(CreoleEvent e) {
   if (!e.getDatastore().equals(this.getDataStore())) return;
   if (this.getDataStore() != null) this.getDataStore().removeDatastoreListener(this);
   // close this corpus, since it cannot stay open when the DS it comes
   // from
   // is closed
   Factory.deleteResource(this);
 }
Пример #20
0
  private gate.Document generateGATEDocFromLocalDump(BehemothDocument inputDoc)
      throws ResourceInstantiationException, IOException {

    // can't get that to work
    // File tempDirectory = new
    // File(this.config.get("hadoop.tmp.dir","/tmp"),this.config.get("user.name",
    // "./tmp"));
    // LOG.info("tempDirectory "+tempDirectory);
    //
    // tempDirectory.mkdirs();
    //
    // File tempInputFile = File.createTempFile("gateInput-",
    // inputDoc.getUrl(),tempDirectory);
    //
    // FileOutputStream fos = new FileOutputStream(tempInputFile);
    // OutputStream bout = new BufferedOutputStream(fos);
    // bout.write(inputDoc.getContent());
    // bout.flush();
    // bout.close();
    //
    // URL url;
    // try {
    // url = tempInputFile.toURI().toURL();
    // } catch (MalformedURLException e) {
    // // delete the input doc
    // tempInputFile.delete();
    // throw e;
    // }

    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, new String(inputDoc.getContent()));
    String ct = inputDoc.getContentType();
    if (ct != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, ct);

    gate.Document gatedocument;
    try {
      gatedocument = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
    } finally {
      // delete the input doc
      // tempInputFile.delete();
    }

    return gatedocument;
  }
Пример #21
0
 /**
  * Call the closure once for each document in this corpus, loading and unloading documents as
  * appropriate in the case of a persistent corpus.
  *
  * @param self the corpus to traverse
  * @param closure the closure to call
  * @return the corpus.
  */
 public static <T> Object each(Corpus self, Closure<T> closure) {
   for (int i = 0; i < self.size(); i++) {
     boolean docWasLoaded = self.isDocumentLoaded(i);
     Document doc = self.get(i);
     closure.call(doc);
     if (!docWasLoaded) {
       self.unloadDocument(doc);
       Factory.deleteResource(doc);
     }
   }
   return self;
 }
Пример #22
0
 /**
  * Call the closure once for each document in this corpus, loading and unloading documents as
  * appropriate in the case of a persistent corpus, and adding the return values of each call to
  * the given collection.
  *
  * @param self the corpus to traverse
  * @param closure the closure to call
  * @return a list of the return values from each closure call.
  */
 public static <T> Collection<T> collect(Corpus self, Collection<T> coll, Closure<T> closure) {
   for (int i = 0; i < self.size(); i++) {
     boolean docWasLoaded = self.isDocumentLoaded(i);
     Document doc = self.get(i);
     coll.add(closure.call(doc));
     if (!docWasLoaded) {
       self.unloadDocument(doc);
       Factory.deleteResource(doc);
     }
   }
   return coll;
 }
Пример #23
0
 /**
  * Loading the configurationg file and corpus for testing. And make settings as in the GATE Gui.
  */
 void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
     throws GateException, IOException {
   LogService.minVerbosityLevel = 0;
   if (LogService.minVerbosityLevel > 0)
     System.out.println("Learning Home : " + learningHome.getAbsolutePath());
   FeatureMap parameters = Factory.newFeatureMap();
   URL configFileURL = new File(configFileName).toURI().toURL();
   parameters.put("configFileURL", configFileURL);
   learningApi =
       (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);
   // Load the corpus
   corpus = Factory.newCorpus("DataSet");
   ExtensionFileFilter fileFilter = new ExtensionFileFilter();
   fileFilter.addExtension("xml");
   File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
   Arrays.sort(
       xmlFiles,
       new Comparator<File>() {
         public int compare(File a, File b) {
           return a.getName().compareTo(b.getName());
         }
       });
   for (File f : xmlFiles) {
     if (!f.isDirectory()) {
       Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
       doc.setName(f.getName());
       corpus.add(doc);
     }
   }
   //    URL tempURL = new File(corpusDirName).toURI().toURL();
   //    corpus.populate(tempURL, fileFilter, "UTF-8", false);
   // Set the inputAS
   learningApi.setInputASName(inputasN);
   learningApi.setOutputASName(outputasN);
   controller =
       (gate.creole.SerialAnalyserController)
           Factory.createResource("gate.creole.SerialAnalyserController");
   controller.setCorpus(corpus);
   controller.add(learningApi);
 }
Пример #24
0
  /** Called by a datastore when a resource has been deleted */
  @Override
  public void resourceDeleted(DatastoreEvent evt) {
    DataStore ds = (DataStore) evt.getSource();
    // 1. check whether this datastore fired the event. If not, return.
    if (!ds.equals(this.dataStore)) return;

    Object docID = evt.getResourceID();
    if (docID == null) return;

    if (DEBUG) Out.prln("Resource deleted called for: " + docID);
    // first check if it is this corpus that's been deleted, it must be
    // unloaded immediately
    if (docID.equals(this.getLRPersistenceId())) {
      Factory.deleteResource(this);
      return;
    } // if

    boolean isDirty = false;
    // the problem here is that I only have the doc persistent ID
    // and nothing else, so I need to determine the index of the doc
    // first
    for (int i = 0; i < docDataList.size(); i++) {
      DocumentData docData = docDataList.get(i);
      // we've found the correct document
      // don't break the loop, because it might appear more than once
      if (docID.equals(docData.getPersistentID())) {
        if (evt.getResource() == null) {
          // instead of calling remove() which tries to load the
          // document
          // remove it from the documents and docDataList
          documentRemoved(docDataList.get(i).persistentID.toString());
          docDataList.remove(i);
          documents.remove(i);
          isDirty = true;
          i--;
          continue;
        }

        remove(i);
        isDirty = true;
      } // if
    } // for loop through the doc data

    if (isDirty)
      try {
        this.dataStore.sync(this);
      } catch (PersistenceException ex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
      } catch (SecurityException sex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
      }
  } // resourceDeleted
Пример #25
0
  public static void initController()
      throws PersistenceException, ResourceInstantiationException, IOException {
    GateUtils.deleteAllPublicGateResources();

    controller =
        (ConditionalSerialAnalyserController)
            PersistenceManager.loadObjectFromFile(
                new File("C:/Users/dedek/Desktop/DATLOWE/gate_apps/all.gapp"));

    controller.add(ie.getPR());

    corpus = Factory.newCorpus("SpcCorp");
    controller.setCorpus(corpus);
  }
Пример #26
0
 private static Document readDocument(String gateDocumentString)
     throws ResourceInstantiationException {
   Document gateDocument =
       (Document)
           Factory.createResource(
               "gate.corpora.DocumentImpl",
               Utils.featureMap(
                   "stringContent",
                   gateDocumentString,
                   "mimeType",
                   "text/xml",
                   "encoding",
                   "UTF-8"));
   return gateDocument;
 }
  public static void main(String[] args) throws Exception {
    // Logger.getLogger(DocumentFeaturesDiff.class).setLevel(Level.ALL);

    GateUtils.initGateKeepLog();
    GateUtils.registerCzsemPlugin();

    ProcessingResource eval =
        new PRSetup.SinglePRSetup(LearningEvaluator.class)
            .putFeature("keyASName", ":-)")
            //				.putFeature("responseASName", "lemma_flex")
            .putFeature("responseASName", "flex")
            .putFeature("keyAnnotationsAreInDocumentFeatures", true)
            .putFeatureList("annotationTypes", "Lookup")
            .putFeatureList("featureNames", "meshID")
            .createPR();

    SerialAnalyserController controller =
        (SerialAnalyserController)
            Factory.createResource(SerialAnalyserController.class.getCanonicalName());

    controller.add(eval);

    Corpus corpus = Factory.newCorpus(null);
    corpus.populate(
        new File("C:\\Users\\dedek\\Desktop\\bmc\\experiment\\analyzed").toURI().toURL(),
        //				new File("C:\\Users\\dedek\\Desktop\\bmca_devel").toURI().toURL(),
        null,
        "utf8",
        false);

    System.err.println("populated");

    controller.setCorpus(corpus);

    controller.execute();
  }
Пример #28
0
  /**
   * This method is called when the HTML parser encounts the beginning of a tag that means that the
   * tag is paired by an end tag and it's not an empty one.
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    // Fire the status listener if the elements processed exceded the rate
    if (0 == (++elements % ELEMENTS_RATE))
      fireStatusChangedEvent("Processed elements : " + elements);

    // Start of STYLE tag
    if (HTML.Tag.STYLE.equals(t)) {
      isInsideStyleTag = true;
    } // if

    // Construct a feature map from the attributes list
    FeatureMap fm = Factory.newFeatureMap();

    // Take all the attributes an put them into the feature map
    if (0 != a.getAttributeCount()) {
      Enumeration<?> enumeration = a.getAttributeNames();
      while (enumeration.hasMoreElements()) {
        Object attribute = enumeration.nextElement();
        fm.put(attribute.toString(), (a.getAttribute(attribute)).toString());
      } // while
    } // if

    // Just analize the tag t and add some\n chars and spaces to the
    // tmpDocContent.The reason behind is that we need to have a readable form
    // for the final document.
    customizeAppearanceOfDocumentWithStartTag(t);

    // If until here the "tmpDocContent" ends with a NON whitespace char,
    // then we add a space char before calculating the START index of this
    // tag.
    // This is done in order not to concatenate the content of two separate tags
    // and obtain a different NEW word.
    int tmpDocContentSize = tmpDocContent.length();
    if (tmpDocContentSize != 0
        && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1)))
      tmpDocContent.append(" ");

    // create the start index of the annotation
    Long startIndex = new Long(tmpDocContent.length());

    // initialy the start index is equal with the End index
    CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex);

    // put it into the stack
    stack.push(obj);
  } // handleStartTag
Пример #29
0
  /** Annotation added event */
  public void annotationAdded(AnnotationSetEvent ase) {

    if (!disableListener && ase.getSourceDocument() == this) {
      AnnotationSet as = (AnnotationSet) ase.getSource();
      Annotation annot = ase.getAnnotation();
      annot.addAnnotationListener(this);

      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());

      boolean defaultAS = as.getName() == null;
      for (String docID : combinedDocumentIds) {
        Document aDoc = compoundDocument.getDocument(docID);
        long stOffset = getOffsetInSrcDocument(docID, annot.getStartNode().getOffset().longValue());
        if (stOffset == -1) continue;
        long enOffset = getOffsetInSrcDocument(docID, annot.getEndNode().getOffset().longValue());
        if (enOffset == -1) continue;
        Annotation originalAnnot = null;
        try {
          Integer id = annot.getId();
          if (defaultAS) {

            aDoc.getAnnotations()
                .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
            originalAnnot = aDoc.getAnnotations().get(id);
          } else {
            aDoc.getAnnotations(as.getName())
                .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
            originalAnnot = aDoc.getAnnotations(as.getName()).get(id);
          }
        } catch (InvalidOffsetException ioe) {
          System.out.println(aDoc.getName() + "=" + stOffset + "=" + enOffset);
          throw new GateRuntimeException(ioe);
        }

        OffsetDetails od = new OffsetDetails();
        od.setOldStartOffset(stOffset);
        od.setOldEndOffset(enOffset);
        od.setNewStartOffset(annot.getStartNode().getOffset().longValue());
        od.setNewEndOffset(annot.getEndNode().getOffset().longValue());
        od.setOriginalAnnotation(originalAnnot);
        od.setNewAnnotation(annot);
        addNewOffsetDetails(docID, od);
        break;
      }
    }
  }
Пример #30
0
  public void setConf(Configuration conf) {
    config = conf;

    if (applicationDescriptorPath == null)
      throw new RuntimeException("GATE application path is null");

    // create one instance of the GATE application
    // need to avoid concurrent access to the application
    try {

      if (inited == false) {
        File gateHome = new File(applicationDescriptorPath.getFile()).getParentFile();
        LOG.info("Setting GATE_HOME as " + gateHome);
        File pluginsHome = new File(gateHome, "plugins");
        // the config files are in the job archive - not in the GATE
        // application
        // zip
        // File siteConfigFile = new File(conf
        // .getResource("site-gate.xml").getFile());
        // File userConfig = new File(conf.getResource("user-gate.xml")
        // .getFile());
        Gate.runInSandbox(true);
        Gate.setGateHome(gateHome);
        Gate.setPluginsHome(pluginsHome);
        // Gate.setSiteConfigFile(siteConfigFile);
        // Gate.setUserConfigFile(userConfig);
        // the builtInCreoleDir files
        // are stored in the same place as the config ones
        // Gate.setBuiltinCreoleDir(conf.getResource("creole.xml"));
        Gate.init();
        inited = true;
      }

      corpus = Factory.newCorpus("DummyCorpus");

      this.GATEapplication =
          (CorpusController) PersistenceManager.loadObjectFromUrl(applicationDescriptorPath);

      // load the annotation and feature filters from the configuration
      this.filters = GATEAnnotationFilters.getFilters(config);

    } catch (Exception e) {
      LOG.error("Encountered error while initialising GATE", e);
      throw new RuntimeException(e);
    }
  }