Beispiel #1
0
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) return null;

    Document res = documents.get(index);

    if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    // if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap parameters = Factory.newFeatureMap();
      parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
        Document lr =
            (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
        if (DEBUG) Out.prln("Loaded document :" + lr.getName());
        // change the result to the newly loaded doc
        res = lr;

        // finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex);
      }
    }

    return res;
  }
Beispiel #2
0
 /**
  * Unloads a document from memory
  *
  * @param doc the document to be unloaded
  * @param sync should the document be sync'ed (i.e. saved) before unloading.
  */
 public void unloadDocument(Document doc, boolean sync) {
   if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
   // 1. determine the index of the document; if not there, do nothing
   int index = findDocument(doc);
   if (index == -1) return;
   if (DEBUG) Out.prln("Index of doc: " + index);
   if (DEBUG) Out.prln("Size of corpus: " + documents.size());
   unloadDocument(index, sync);
   // documents.remove(new Integer(index));
 }
  /**
   * Initialise the ANNIE system. This creates a "corpus pipeline" application that can be used to
   * run sets of documents through the extraction system.
   */
  public void initAnnie() throws GateException, IOException {
    Out.prln("Initialising ANNIE...");

    // load the ANNIE application from the saved state in plugins/ANNIE
    File pluginsHome = Gate.getPluginsHome();
    File anniePlugin = new File(pluginsHome, "ANNIE");
    File annieGapp = new File(anniePlugin, "ANNIE_with_defaults.gapp");
    annieController = (CorpusController) PersistenceManager.loadObjectFromFile(annieGapp);

    Out.prln("...ANNIE loaded");
  } // initAnnie()
 public Object saveDocument(String doc_name, Document doc)
     throws PersistenceException, SecurityException {
   // save document in datastore
   // SecurityInfo is ingored for SerialDataStore - just pass null
   Document persistDoc = (Document) ds.adopt(doc, null);
   ds.sync(persistDoc);
   if (DEBUG) Out.prln("document saved in datastore...");
   return setDocName(doc_name, persistDoc);
 }
Beispiel #5
0
 @Override
 public void resourceUnloaded(CreoleEvent e) {
   Resource res = e.getResource();
   if (res instanceof Document) {
     Document doc = (Document) res;
     if (DEBUG) Out.prln("resource Unloaded called ");
     // remove from the corpus too, if a transient one
     if (doc.getDataStore() != this.getDataStore()) {
       this.remove(doc);
     } else {
       // unload all occurences
       int index = indexOf(res);
       if (index < 0) return;
       documents.set(index, null);
       if (DEBUG) Out.prln("corpus: document " + index + " unloaded and set to null");
     } // if
   }
 }
 public Object saveCorpus(String corpus_name, Corpus corpus)
     throws PersistenceException, SecurityException {
   // save corpus in datastore
   // SecurityInfo is ingored for SerialDataStore - just pass null
   // a new persisent corpus is returned
   Corpus persistCorp = (Corpus) ds.adopt(corpus, null);
   ds.sync(persistCorp);
   if (DEBUG) Out.prln("corpus saved in datastore...");
   return setCorpusName(corpus_name, persistCorp);
 }
Beispiel #7
0
 /**
  * Every LR that is a CreoleListener (and other Listeners too) must override this method and make
  * sure it removes itself from the objects which it has been listening to. Otherwise, the object
  * will not be released from memory (memory leak!).
  */
 @Override
 public void cleanup() {
   if (DEBUG) Out.prln("serial corpus cleanup called");
   if (corpusListeners != null) corpusListeners = null;
   if (documents != null) documents.clear();
   docDataList.clear();
   Gate.getCreoleRegister().removeCreoleListener(this);
   if (this.dataStore != null) {
     this.dataStore.removeDatastoreListener(this);
   }
 }
Beispiel #8
0
  @Override
  public boolean remove(Object o) {
    if (DEBUG) Out.prln("SerialCorpus:Remove object called");
    if (!(o instanceof Document)) return false;
    Document doc = (Document) o;

    // see if we can find it first. If not, then judt return
    int index = findDocument(doc);
    if (index == -1) return false;

    if (index < docDataList.size()) { // we found it, so remove it
      // by Andrey Shafirin: this part of code can produce an exception
      // if
      // document wasn't loaded
      String docName = docDataList.get(index).getDocumentName();
      Object docPersistentID = getDocumentPersistentID(index);
      docDataList.remove(index);
      // Document oldDoc = (Document) documents.remove(index);
      documents.remove(index);
      // if (DEBUG) Out.prln("documents after remove of " +
      // oldDoc.getName()
      // + " are " + documents);
      if (DEBUG) Out.prln("documents after remove of " + docName + " are " + documents);
      // documentRemoved(oldDoc.getLRPersistenceId().toString());
      if (docPersistentID != null) documentRemoved(docPersistentID.toString());
      // fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
      // oldDoc,
      // index,
      // CorpusEvent.DOCUMENT_REMOVED));
      fireDocumentRemoved(
          new CorpusEvent(
              SerialCorpusImpl.this,
              (Document) o,
              index,
              docPersistentID,
              CorpusEvent.DOCUMENT_REMOVED));
    }

    return true;
  }
Beispiel #9
0
  /** Called by a datastore when a resource has been deleted */
  @Override
  public void resourceDeleted(DatastoreEvent evt) {
    DataStore ds = (DataStore) evt.getSource();
    // 1. check whether this datastore fired the event. If not, return.
    if (!ds.equals(this.dataStore)) return;

    Object docID = evt.getResourceID();
    if (docID == null) return;

    if (DEBUG) Out.prln("Resource deleted called for: " + docID);
    // first check if it is this corpus that's been deleted, it must be
    // unloaded immediately
    if (docID.equals(this.getLRPersistenceId())) {
      Factory.deleteResource(this);
      return;
    } // if

    boolean isDirty = false;
    // the problem here is that I only have the doc persistent ID
    // and nothing else, so I need to determine the index of the doc
    // first
    for (int i = 0; i < docDataList.size(); i++) {
      DocumentData docData = docDataList.get(i);
      // we've found the correct document
      // don't break the loop, because it might appear more than once
      if (docID.equals(docData.getPersistentID())) {
        if (evt.getResource() == null) {
          // instead of calling remove() which tries to load the
          // document
          // remove it from the documents and docDataList
          documentRemoved(docDataList.get(i).persistentID.toString());
          docDataList.remove(i);
          documents.remove(i);
          isDirty = true;
          i--;
          continue;
        }

        remove(i);
        isDirty = true;
      } // if
    } // for loop through the doc data

    if (isDirty)
      try {
        this.dataStore.sync(this);
      } catch (PersistenceException ex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
      } catch (SecurityException sex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
      }
  } // resourceDeleted
Beispiel #10
0
 @Override
 public Document remove(int index) {
   if (DEBUG) Out.prln("Remove index called");
   // try to get the actual document if it was loaded
   Document res = isDocumentLoaded(index) ? get(index) : null;
   Object docLRID = docDataList.get(index).persistentID;
   if (docLRID != null) documentRemoved(docLRID.toString());
   docDataList.remove(index);
   documents.remove(index);
   fireDocumentRemoved(
       new CorpusEvent(SerialCorpusImpl.this, res, index, docLRID, CorpusEvent.DOCUMENT_REMOVED));
   return res;
 }
 public boolean openDataStore(String dir) {
   try {
     dir = getFullFilePath(dir);
     ds = new SerialDataStore(dir);
     ds.open();
     return true;
   } catch (PersistenceException e) {
     e.printStackTrace();
     Out.println(ds.getStorageUrl());
     System.out.println("failed to open datastore!");
     System.out.println("please check if the folder is empty or exist.");
     return false;
   }
 }
  public Corpus getCorpus(Object corpusID) {
    // load corpus from datastore using its persistent ID
    FeatureMap corpFeatures = Factory.newFeatureMap();
    corpFeatures.put(DataStore.LR_ID_FEATURE_NAME, corpusID);
    corpFeatures.put(DataStore.DATASTORE_FEATURE_NAME, ds);

    // tell the factory to load the Serial Corpus with the specified ID
    // from the specified datastore
    try {
      Corpus persistCorp =
          (Corpus) Factory.createResource("gate.corpora.SerialCorpusImpl", corpFeatures);

      if (DEBUG) Out.println("corpus loaded from datastore...");
      return persistCorp;
    } catch (ResourceInstantiationException e) {
      e.printStackTrace();
    }
    return null;
  }
 /** Run ANNIE */
 public void execute() throws GateException {
   Out.prln("Running ANNIE...");
   annieController.execute();
   Out.prln("...ANNIE complete");
 } // execute()
 public void deleteDocument(Object docID) throws PersistenceException {
   // remove document from datastore
   ds.delete("gate.corpora.DocumentImpl", docID);
   if (DEBUG) Out.prln("document deleted from datastore...");
 }
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", new Boolean(true));
      params.put("collectRepositioningInfo", new Boolean(true));
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set annotTypesRequired = new HashSet();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null && info != null) {
        Out.prln("OrigContent and reposInfo existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionStart = info.getOriginalPos(insertPositionStart);
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } // if - should generate
      else if (originalContent != null) {
        Out.prln("OrigContent existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = new String("StANNIE_toXML_" + count + ".HTML");
      FileWriter writer = new FileWriter(fileName);
      writer.write(xmlDocument);
      writer.close();
    } // for each doc
  } // main
 public void deleteCorpus(Object corpusID) throws PersistenceException {
   // remove corpus from datastore
   ds.delete("gate.corpora.SerialCorpusImpl", corpusID);
   if (DEBUG) Out.prln("corpus deleted from datastore...");
 }
Beispiel #17
0
  /** This method is called when all characters between specific tags have been read completely */
  public void charactersAction(char[] text, int start, int length) throws SAXException {
    // correction of real offset. Didn't affect on other data.
    super.characters(text, start, length);
    // create a string object based on the reported text
    String content = new String(text, start, length);
    StringBuffer contentBuffer = new StringBuffer("");
    int tmpDocContentSize = tmpDocContent.length();
    boolean incrementStartIndex = false;
    boolean addExtraSpace = true;
    if (Gate.getUserConfig().get(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) != null) {
      addExtraSpace =
          Gate.getUserConfig()
              .getBoolean(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)
              .booleanValue();
    }
    // If the first char of the text just read "text[0]" is NOT whitespace AND
    // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
    // concatenation "tmpDocContent + content" will result into a new different
    // word... and we want to avoid that, because the tokenizer, gazetter and
    // Jape work on the raw text and concatenating tokens might be not good.
    if (tmpDocContentSize != 0
        && content.length() != 0
        && !Character.isWhitespace(content.charAt(0))
        && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) {

      // If we are here it means that a concatenation between the last
      // token in the tmpDocContent and the content(which doesn't start
      // with a white space) will be performed. In order to prevent this,
      // we will add a " " space char in order to assure that the 2 tokens
      // stay apart. Howerver we will except from this rule the most known
      // internal entities like &, <, >, etc
      if (( // Testing the length against 1 makes it more likely that
          // an internal entity was called. characters() gets called for
          // each entity separately.
          (content.length() == 1)
              && (content.charAt(0) == '&'
                  || content.charAt(0) == '<'
                  || content.charAt(0) == '>'
                  || content.charAt(0) == '"'
                  || content.charAt(0) == '\''))
          || (tmpDocContent.charAt(tmpDocContentSize - 1) == '&'
              || tmpDocContent.charAt(tmpDocContentSize - 1) == '<'
              || tmpDocContent.charAt(tmpDocContentSize - 1) == '>'
              || tmpDocContent.charAt(tmpDocContentSize - 1) == '"'
              || tmpDocContent.charAt(tmpDocContentSize - 1)
                  == '\'')) { // do nothing. The content will be appended
      } else if (!addExtraSpace) {
      } else {
        // In all other cases append " "
        contentBuffer.append(" ");
        incrementStartIndex = true;
      } // End if
    } // End if

    // put the repositioning information
    if (reposInfo != null) {
      if (!(start == 0 && length == 1 && text.length <= 2)) {
        // normal piece of text
        reposInfo.addPositionInfo(
            getRealOffset(),
            content.length(),
            tmpDocContent.length() + contentBuffer.length(),
            content.length());
        if (DEBUG) {
          Out.println("Info: " + getRealOffset() + ", " + content.length());
          Out.println("Start: " + start + " len" + length);
        } // DEBUG
      } else {
        // unicode char or &xxx; coding
        // Reported from the parser offset is 0
        // The real offset should be found in the ampCodingInfo structure.

        long lastPosition = 0;
        RepositioningInfo.PositionInfo pi;

        if (reposInfo.size() > 0) {
          pi = reposInfo.get(reposInfo.size() - 1);
          lastPosition = pi.getOriginalPosition();
        } // if

        for (int i = 0; i < ampCodingInfo.size(); ++i) {
          pi = ampCodingInfo.get(i);
          if (pi.getOriginalPosition() > lastPosition) {
            // found
            reposInfo.addPositionInfo(
                pi.getOriginalPosition(),
                pi.getOriginalLength(),
                tmpDocContent.length() + contentBuffer.length(),
                content.length());
            break;
          } // if
        } // for
      } // if
    } // if

    // update the document content
    contentBuffer.append(content);
    // calculate the End index for all the elements of the stack
    // the expression is : End index = Current doc length + text length
    Long end = new Long(tmpDocContent.length() + contentBuffer.length());

    CustomObject obj = null;
    // Iterate through stack to modify the End index of the existing elements

    Iterator<CustomObject> anIterator = stack.iterator();
    while (anIterator.hasNext()) {
      // get the object and move to the next one
      obj = anIterator.next();
      if (incrementStartIndex && obj.getStart().equals(obj.getEnd())) {
        obj.setStart(new Long(obj.getStart().longValue() + 1));
      } // End if
      // sets its End index
      obj.setEnd(end);
    } // End while

    tmpDocContent.append(contentBuffer.toString());
  } // characters();
Beispiel #18
0
 /** This method should only be used by the Serial Datastore to set */
 public void setDocumentPersistentID(int index, Object persID) {
   if (index >= docDataList.size()) return;
   docDataList.get(index).setPersistentID(persID);
   if (DEBUG) Out.prln("IDs are now: " + docDataList);
 }