Example #1
0
  /**
   * Returns the document stored at the given position, loading it from the datastore when it is
   * not yet held in memory.
   *
   * <p>An index at or beyond the size of {@code docDataList} yields {@code null} instead of an
   * exception. When the slot in {@code documents} is {@code null}, the document is created via
   * {@code Factory.createResource} from the persistent ID and class type recorded in
   * {@code docDataList}, and the slot is then filled with the loaded instance.
   *
   * @param index position of the requested document
   * @return the document at {@code index}, or {@code null} if {@code index} is not below the
   *     number of known documents
   * @throws GateRuntimeException if instantiating the document fails
   */
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) {
      return null;
    }

    Document cached = documents.get(index);
    if (DEBUG) {
      Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + cached);
    }
    if (cached != null) {
      return cached;
    }

    // nothing in memory for this slot, so the document has to be fetched from the datastore
    FeatureMap loadParams = Factory.newFeatureMap();
    loadParams.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
    Document loaded;
    try {
      loadParams.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
      loaded =
          (Document) Factory.createResource(docDataList.get(index).getClassType(), loadParams);
      if (DEBUG) {
        Out.prln("Loaded document :" + loaded.getName());
      }
      // remember the instantiated document so later calls return it directly
      documents.set(index, loaded);
    } catch (ResourceInstantiationException ex) {
      Err.prln("Error reading document inside a serialised corpus.");
      throw new GateRuntimeException(ex);
    }
    return loaded;
  }
Example #2
0
 /**
  * Unloads a document from memory, delegating to the index-based {@code unloadDocument}.
  *
  * <p>If {@code findDocument} reports {@code -1} for the document, nothing happens.
  *
  * @param doc the document to be unloaded
  * @param sync should the document be sync'ed (i.e. saved) before unloading.
  */
 public void unloadDocument(Document doc, boolean sync) {
   if (DEBUG) {
     Out.prln("Document to be unloaded :" + doc.getName());
   }
   // locate the document first; an unknown document is silently ignored
   int position = findDocument(doc);
   if (position != -1) {
     if (DEBUG) {
       Out.prln("Index of doc: " + position);
     }
     if (DEBUG) {
       Out.prln("Size of corpus: " + documents.size());
     }
     unloadDocument(position, sync);
   }
 }
  /**
   * Initialise the ANNIE system by loading the saved application state
   * {@code ANNIE_with_defaults.gapp} from the {@code ANNIE} directory under the GATE plugins
   * home, and storing the resulting controller in {@code annieController}.
   *
   * @throws GateException declared by the signature; presumably raised when the saved application
   *     cannot be restored
   * @throws IOException declared by the signature; presumably raised when the file cannot be read
   */
  public void initAnnie() throws GateException, IOException {
    Out.prln("Initialising ANNIE...");

    // <plugins home>/ANNIE/ANNIE_with_defaults.gapp holds the saved application
    File annieGapp =
        new File(new File(Gate.getPluginsHome(), "ANNIE"), "ANNIE_with_defaults.gapp");
    annieController = (CorpusController) PersistenceManager.loadObjectFromFile(annieGapp);

    Out.prln("...ANNIE loaded");
  } // initAnnie()
 /**
  * Stores a document in the datastore {@code ds} and then hands the persistent copy to
  * {@code setDocName}.
  *
  * @param doc_name name passed on to {@code setDocName}
  * @param doc the document to adopt into the datastore
  * @return whatever {@code setDocName} returns for the persistent document
  * @throws PersistenceException declared by the signature (adopt/sync may fail)
  * @throws SecurityException declared by the signature
  */
 public Object saveDocument(String doc_name, Document doc)
     throws PersistenceException, SecurityException {
   // SecurityInfo is ignored for SerialDataStore, hence the null second argument
   Document adopted = (Document) ds.adopt(doc, null);
   ds.sync(adopted);
   if (DEBUG) {
     Out.prln("document saved in datastore...");
   }
   return setDocName(doc_name, adopted);
 }
Example #5
0
 /**
  * Reacts to a resource having been unloaded. Only {@code Document} resources are handled.
  *
  * <p>A document whose datastore is not this corpus' datastore is removed from the corpus. A
  * document from the same datastore stays listed, but the slot that {@code indexOf} reports for
  * it in {@code documents} is set to {@code null}. Only that single reported position is
  * cleared.
  *
  * @param e the event carrying the unloaded resource
  */
 @Override
 public void resourceUnloaded(CreoleEvent e) {
   Resource unloaded = e.getResource();
   if (!(unloaded instanceof Document)) {
     return;
   }
   Document doc = (Document) unloaded;
   if (DEBUG) {
     Out.prln("resource Unloaded called ");
   }

   // a document living in another datastore (or none) is dropped from the corpus altogether
   if (doc.getDataStore() != this.getDataStore()) {
     this.remove(doc);
     return;
   }

   // same datastore: keep the entry and only release the in-memory instance
   int position = indexOf(unloaded);
   if (position >= 0) {
     documents.set(position, null);
     if (DEBUG) {
       Out.prln("corpus: document " + position + " unloaded and set to null");
     }
   }
 }
 /**
  * Stores a corpus in the datastore {@code ds} and then hands the persistent corpus returned by
  * {@code adopt} to {@code setCorpusName}.
  *
  * @param corpus_name name passed on to {@code setCorpusName}
  * @param corpus the corpus to adopt into the datastore
  * @return whatever {@code setCorpusName} returns for the persistent corpus
  * @throws PersistenceException declared by the signature (adopt/sync may fail)
  * @throws SecurityException declared by the signature
  */
 public Object saveCorpus(String corpus_name, Corpus corpus)
     throws PersistenceException, SecurityException {
   // SecurityInfo is ignored for SerialDataStore, hence the null second argument;
   // adopt() hands back a new persistent corpus
   Corpus adopted = (Corpus) ds.adopt(corpus, null);
   ds.sync(adopted);
   if (DEBUG) {
     Out.prln("corpus saved in datastore...");
   }
   return setCorpusName(corpus_name, adopted);
 }
Example #7
0
 /**
  * Releases what this corpus holds and detaches it from the objects it listens to.
  *
  * <p>Every LR that is a CreoleListener (and other Listeners too) must override this method and
  * make sure it removes itself from the objects which it has been listening to. Otherwise, the
  * object will not be released from memory (memory leak!).
  *
  * <p>Here that means dropping the corpus listeners, emptying {@code documents} and
  * {@code docDataList}, and unregistering from the CREOLE register and, if one is set, from the
  * datastore.
  */
 @Override
 public void cleanup() {
   if (DEBUG) {
     Out.prln("serial corpus cleanup called");
   }
   corpusListeners = null;
   if (documents != null) {
     documents.clear();
   }
   docDataList.clear();
   Gate.getCreoleRegister().removeCreoleListener(this);
   if (this.dataStore != null) {
     this.dataStore.removeDatastoreListener(this);
   }
 }
Example #8
0
  /**
   * Removes the given document from the corpus.
   *
   * <p>The entry is dropped from both {@code docDataList} and {@code documents}. The name and
   * persistent ID are read from the corpus' own bookkeeping rather than from the document. A
   * {@code DOCUMENT_REMOVED} corpus event is fired afterwards.
   *
   * @param o the object to remove; anything that is not a {@code Document} is rejected
   * @return {@code false} if {@code o} is not a {@code Document} or {@code findDocument} does not
   *     locate it, {@code true} otherwise
   */
  @Override
  public boolean remove(Object o) {
    if (DEBUG) {
      Out.prln("SerialCorpus:Remove object called");
    }
    if (!(o instanceof Document)) {
      return false;
    }

    // locate the document; an unknown document means there is nothing to do
    int position = findDocument((Document) o);
    if (position == -1) {
      return false;
    }
    if (position >= docDataList.size()) {
      // located, but no matching bookkeeping entry: report success without touching the lists
      return true;
    }

    // capture name and ID before the entry disappears
    String removedName = docDataList.get(position).getDocumentName();
    Object removedId = getDocumentPersistentID(position);
    docDataList.remove(position);
    documents.remove(position);
    if (DEBUG) {
      Out.prln("documents after remove of " + removedName + " are " + documents);
    }
    if (removedId != null) {
      documentRemoved(removedId.toString());
    }
    CorpusEvent removalEvent =
        new CorpusEvent(
            SerialCorpusImpl.this,
            (Document) o,
            position,
            removedId,
            CorpusEvent.DOCUMENT_REMOVED);
    fireDocumentRemoved(removalEvent);
    return true;
  }
Example #9
0
  /**
   * Called by a datastore when a resource has been deleted.
   *
   * <p>Events from a datastore other than this corpus' own, and events without a resource ID, are
   * ignored. If the deleted resource is this corpus itself, the corpus is deleted through
   * {@code Factory.deleteResource}. Otherwise every entry of {@code docDataList} whose persistent
   * ID equals the deleted ID is removed, and the corpus is synced to the datastore if anything
   * was removed.
   *
   * @param evt the deletion event
   * @throws GateRuntimeException if syncing the modified corpus fails
   */
  @Override
  public void resourceDeleted(DatastoreEvent evt) {
    DataStore ds = (DataStore) evt.getSource();
    // 1. check whether this datastore fired the event. If not, return.
    if (!ds.equals(this.dataStore)) return;

    Object docID = evt.getResourceID();
    if (docID == null) return;

    if (DEBUG) Out.prln("Resource deleted called for: " + docID);
    // first check if it is this corpus that's been deleted, it must be
    // unloaded immediately
    if (docID.equals(this.getLRPersistenceId())) {
      Factory.deleteResource(this);
      return;
    } // if

    boolean isDirty = false;
    // only the persistent ID is known here, so the matching positions have to be
    // searched for; the same document might appear more than once, so keep scanning
    for (int i = 0; i < docDataList.size(); i++) {
      DocumentData docData = docDataList.get(i);
      if (!docID.equals(docData.getPersistentID())) {
        continue;
      }

      if (evt.getResource() == null) {
        // instead of calling remove(), which tries to load the document,
        // take it out of documents and docDataList directly
        documentRemoved(docData.persistentID.toString());
        docDataList.remove(i);
        documents.remove(i);
      } else {
        remove(i);
      }
      isDirty = true;
      // either way the entry at i is gone and its successor has moved into slot i;
      // step back so the next iteration examines that successor instead of skipping it
      i--;
    } // for loop through the doc data

    if (isDirty) {
      try {
        this.dataStore.sync(this);
      } catch (PersistenceException ex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
      } catch (SecurityException sex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
      }
    }
  } // resourceDeleted
Example #10
0
 /**
  * Removes the entry at the given position from the corpus.
  *
  * <p>The document itself is only fetched when {@code isDocumentLoaded} reports it as loaded;
  * otherwise the removal proceeds without it and {@code null} is returned. A
  * {@code DOCUMENT_REMOVED} corpus event is fired after the entry has been dropped from
  * {@code docDataList} and {@code documents}.
  *
  * @param index position of the entry to remove
  * @return the removed document if it was loaded, otherwise {@code null}
  */
 @Override
 public Document remove(int index) {
   if (DEBUG) {
     Out.prln("Remove index called");
   }
   // only hand back a document that is already in memory; do not trigger a load
   Document removed = null;
   if (isDocumentLoaded(index)) {
     removed = get(index);
   }
   Object persistentId = docDataList.get(index).persistentID;
   if (persistentId != null) {
     documentRemoved(persistentId.toString());
   }
   docDataList.remove(index);
   documents.remove(index);
   CorpusEvent removalEvent =
       new CorpusEvent(
           SerialCorpusImpl.this, removed, index, persistentId, CorpusEvent.DOCUMENT_REMOVED);
   fireDocumentRemoved(removalEvent);
   return removed;
 }
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   *
   * <p>For every document up to two files are written, relative to the working directory:
   * {@code StANNIE_<n>.HTML}, the original content with {@code <span>} tags around Person and
   * Location annotations (only when the original content was preserved), and
   * {@code StANNIE_toXML_<n>.HTML}, the result of {@code Document.toXml} for those annotations.
   *
   * @param args the URLs of the documents to process
   * @throws GateException declared by the signature; raised by the GATE calls made here
   * @throws IOException if a URL is malformed or an output file cannot be written
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", Boolean.TRUE);
      params.put("collectRepositioningInfo", Boolean.TRUE);
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator<?> iter = corpus.iterator();
    int count = 0;
    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set<String> annotTypesRequired = new HashSet<String>();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null) {
        if (info != null) {
          Out.prln("OrigContent and reposInfo existing. Generate file...");
        } else {
          Out.prln("OrigContent existing. Generate file...");
        }
        writeTextFile(file, buildTaggedContent(originalContent, info, peopleAndPlaces));
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      writeTextFile(new File("StANNIE_toXML_" + count + ".HTML"), xmlDocument);
    } // for each doc
  } // main

  /**
   * Returns a copy of {@code originalContent} with a {@code <span>} element wrapped around the
   * text of each annotation kept by {@code SortedAnnotationList.addSortedExclusive}.
   *
   * @param originalContent the text to insert the tags into
   * @param info maps annotation offsets to positions in {@code originalContent}; when
   *     {@code null} the annotation offsets are used as they are
   * @param annotations the annotations to mark up
   * @return the content with the tags inserted
   */
  private static String buildTaggedContent(
      String originalContent, RepositioningInfo info, Set<Annotation> annotations) {
    final String startTagPart1 = "<span GateID=\"";
    final String startTagPart2 = "\" title=\"";
    final String startTagPart3 = "\" style=\"background:Red;\">";
    final String endTag = "</span>";

    SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
    for (Annotation annot : annotations) {
      sortedAnnotations.addSortedExclusive(annot);
    }

    StringBuilder editableContent = new StringBuilder(originalContent);
    Out.prln("Unsorted annotations count: " + annotations.size());
    Out.prln("Sorted annotations count: " + sortedAnnotations.size());
    // tags are inserted from the last sorted annotation to the first, presumably so that
    // text inserted for one annotation does not shift the positions still to be processed
    for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
      Annotation currAnnot = (Annotation) sortedAnnotations.get(i);
      long insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
      long insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
      if (info != null) {
        insertPositionStart = info.getOriginalPos(insertPositionStart);
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
      }
      // a position of -1 means the annotation is skipped
      if (insertPositionEnd != -1 && insertPositionStart != -1) {
        String startTag =
            startTagPart1
                + currAnnot.getId().toString()
                + startTagPart2
                + currAnnot.getType()
                + startTagPart3;
        // end tag first: inserting the start tag afterwards leaves the end position untouched
        editableContent.insert((int) insertPositionEnd, endTag);
        editableContent.insert((int) insertPositionStart, startTag);
      }
    }
    return editableContent.toString();
  }

  /**
   * Writes {@code content} to {@code file}, closing the writer even when the write fails.
   *
   * @param file the file to create or overwrite
   * @param content the text to write
   * @throws IOException if the file cannot be opened, written or closed
   */
  private static void writeTextFile(File file, String content) throws IOException {
    FileWriter writer = new FileWriter(file);
    try {
      writer.write(content);
    } finally {
      writer.close();
    }
  }
 /**
  * Runs ANNIE by executing {@code annieController}, printing a progress line before and after.
  *
  * @throws GateException declared by the signature; presumably propagated from the controller
  */
 public void execute() throws GateException {
   Out.prln("Running ANNIE...");
   this.annieController.execute();
   Out.prln("...ANNIE complete");
 } // execute()
 /**
  * Deletes a document from the datastore {@code ds}.
  *
  * @param docID the ID of the document to delete, passed to {@code ds.delete}
  * @throws PersistenceException declared by the signature (the delete may fail)
  */
 public void deleteDocument(Object docID) throws PersistenceException {
   // the datastore is told which resource class the ID belongs to
   final String documentClassName = "gate.corpora.DocumentImpl";
   ds.delete(documentClassName, docID);
   if (DEBUG) {
     Out.prln("document deleted from datastore...");
   }
 }
 /**
  * Deletes a corpus from the datastore {@code ds}.
  *
  * @param corpusID the ID of the corpus to delete, passed to {@code ds.delete}
  * @throws PersistenceException declared by the signature (the delete may fail)
  */
 public void deleteCorpus(Object corpusID) throws PersistenceException {
   // the datastore is told which resource class the ID belongs to
   final String corpusClassName = "gate.corpora.SerialCorpusImpl";
   ds.delete(corpusClassName, corpusID);
   if (DEBUG) {
     Out.prln("corpus deleted from datastore...");
   }
 }
Example #15
0
 /**
  * Records the persistent ID of the document at the given position. This method should only be
  * used by the Serial Datastore.
  *
  * <p>An index at or beyond the size of {@code docDataList} is ignored.
  *
  * @param index position of the document whose ID is being set
  * @param persID the persistent ID to record
  */
 public void setDocumentPersistentID(int index, Object persID) {
   if (index < docDataList.size()) {
     docDataList.get(index).setPersistentID(persID);
     if (DEBUG) {
       Out.prln("IDs are now: " + docDataList);
     }
   }
 }