@Override public Document get(int index) { if (index >= docDataList.size()) return null; Document res = documents.get(index); if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res); // if the document is null, then I must get it from the DS if (res == null) { FeatureMap parameters = Factory.newFeatureMap(); parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore); try { parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID()); Document lr = (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters); if (DEBUG) Out.prln("Loaded document :" + lr.getName()); // change the result to the newly loaded doc res = lr; // finally replace the doc with the instantiated version documents.set(index, lr); } catch (ResourceInstantiationException ex) { Err.prln("Error reading document inside a serialised corpus."); throw new GateRuntimeException(ex); } } return res; }
/** * Unloads a document from memory * * @param doc the document to be unloaded * @param sync should the document be sync'ed (i.e. saved) before unloading. */ public void unloadDocument(Document doc, boolean sync) { if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName()); // 1. determine the index of the document; if not there, do nothing int index = findDocument(doc); if (index == -1) return; if (DEBUG) Out.prln("Index of doc: " + index); if (DEBUG) Out.prln("Size of corpus: " + documents.size()); unloadDocument(index, sync); // documents.remove(new Integer(index)); }
/** * Initialise the ANNIE system. This creates a "corpus pipeline" application that can be used to * run sets of documents through the extraction system. */ public void initAnnie() throws GateException, IOException { Out.prln("Initialising ANNIE..."); // load the ANNIE application from the saved state in plugins/ANNIE File pluginsHome = Gate.getPluginsHome(); File anniePlugin = new File(pluginsHome, "ANNIE"); File annieGapp = new File(anniePlugin, "ANNIE_with_defaults.gapp"); annieController = (CorpusController) PersistenceManager.loadObjectFromFile(annieGapp); Out.prln("...ANNIE loaded"); } // initAnnie()
public Object saveDocument(String doc_name, Document doc) throws PersistenceException, SecurityException { // save document in datastore // SecurityInfo is ingored for SerialDataStore - just pass null Document persistDoc = (Document) ds.adopt(doc, null); ds.sync(persistDoc); if (DEBUG) Out.prln("document saved in datastore..."); return setDocName(doc_name, persistDoc); }
@Override public void resourceUnloaded(CreoleEvent e) { Resource res = e.getResource(); if (res instanceof Document) { Document doc = (Document) res; if (DEBUG) Out.prln("resource Unloaded called "); // remove from the corpus too, if a transient one if (doc.getDataStore() != this.getDataStore()) { this.remove(doc); } else { // unload all occurences int index = indexOf(res); if (index < 0) return; documents.set(index, null); if (DEBUG) Out.prln("corpus: document " + index + " unloaded and set to null"); } // if } }
public Object saveCorpus(String corpus_name, Corpus corpus) throws PersistenceException, SecurityException { // save corpus in datastore // SecurityInfo is ingored for SerialDataStore - just pass null // a new persisent corpus is returned Corpus persistCorp = (Corpus) ds.adopt(corpus, null); ds.sync(persistCorp); if (DEBUG) Out.prln("corpus saved in datastore..."); return setCorpusName(corpus_name, persistCorp); }
/**
 * Releases this corpus' references so it can be garbage-collected.
 *
 * <p>Every LR that is a CreoleListener (and other Listeners too) must override this method
 * and make sure it removes itself from the objects which it has been listening to.
 * Otherwise, the object will not be released from memory (memory leak!).
 */
@Override
public void cleanup() {
  if (DEBUG) Out.prln("serial corpus cleanup called");
  // Drop the listener list and cached documents; the docDataList object is kept but
  // emptied.
  corpusListeners = null;
  if (documents != null) documents.clear();
  docDataList.clear();
  // Deregister from the global CREOLE register and from our datastore, if any.
  Gate.getCreoleRegister().removeCreoleListener(this);
  if (this.dataStore != null) this.dataStore.removeDatastoreListener(this);
}
@Override public boolean remove(Object o) { if (DEBUG) Out.prln("SerialCorpus:Remove object called"); if (!(o instanceof Document)) return false; Document doc = (Document) o; // see if we can find it first. If not, then judt return int index = findDocument(doc); if (index == -1) return false; if (index < docDataList.size()) { // we found it, so remove it // by Andrey Shafirin: this part of code can produce an exception // if // document wasn't loaded String docName = docDataList.get(index).getDocumentName(); Object docPersistentID = getDocumentPersistentID(index); docDataList.remove(index); // Document oldDoc = (Document) documents.remove(index); documents.remove(index); // if (DEBUG) Out.prln("documents after remove of " + // oldDoc.getName() // + " are " + documents); if (DEBUG) Out.prln("documents after remove of " + docName + " are " + documents); // documentRemoved(oldDoc.getLRPersistenceId().toString()); if (docPersistentID != null) documentRemoved(docPersistentID.toString()); // fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this, // oldDoc, // index, // CorpusEvent.DOCUMENT_REMOVED)); fireDocumentRemoved( new CorpusEvent( SerialCorpusImpl.this, (Document) o, index, docPersistentID, CorpusEvent.DOCUMENT_REMOVED)); } return true; }
/** Called by a datastore when a resource has been deleted */ @Override public void resourceDeleted(DatastoreEvent evt) { DataStore ds = (DataStore) evt.getSource(); // 1. check whether this datastore fired the event. If not, return. if (!ds.equals(this.dataStore)) return; Object docID = evt.getResourceID(); if (docID == null) return; if (DEBUG) Out.prln("Resource deleted called for: " + docID); // first check if it is this corpus that's been deleted, it must be // unloaded immediately if (docID.equals(this.getLRPersistenceId())) { Factory.deleteResource(this); return; } // if boolean isDirty = false; // the problem here is that I only have the doc persistent ID // and nothing else, so I need to determine the index of the doc // first for (int i = 0; i < docDataList.size(); i++) { DocumentData docData = docDataList.get(i); // we've found the correct document // don't break the loop, because it might appear more than once if (docID.equals(docData.getPersistentID())) { if (evt.getResource() == null) { // instead of calling remove() which tries to load the // document // remove it from the documents and docDataList documentRemoved(docDataList.get(i).persistentID.toString()); docDataList.remove(i); documents.remove(i); isDirty = true; i--; continue; } remove(i); isDirty = true; } // if } // for loop through the doc data if (isDirty) try { this.dataStore.sync(this); } catch (PersistenceException ex) { throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage()); } catch (SecurityException sex) { throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage()); } } // resourceDeleted
@Override public Document remove(int index) { if (DEBUG) Out.prln("Remove index called"); // try to get the actual document if it was loaded Document res = isDocumentLoaded(index) ? get(index) : null; Object docLRID = docDataList.get(index).persistentID; if (docLRID != null) documentRemoved(docLRID.toString()); docDataList.remove(index); documents.remove(index); fireDocumentRemoved( new CorpusEvent(SerialCorpusImpl.this, res, index, docLRID, CorpusEvent.DOCUMENT_REMOVED)); return res; }
/** * Run from the command-line, with a list of URLs as argument. * * <p><B>NOTE:</B><br> * This code will run with all the documents in memory - if you want to unload each from memory * after use, add code to store the corpus in a DataStore. */ public static void main(String args[]) throws GateException, IOException { // initialise the GATE library Out.prln("Initialising GATE..."); Gate.init(); Out.prln("...GATE initialised"); // initialise ANNIE (this may take several minutes) StandAloneAnnie annie = new StandAloneAnnie(); annie.initAnnie(); // create a GATE corpus and add a document for each command-line // argument Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus"); for (int i = 0; i < args.length; i++) { URL u = new URL(args[i]); FeatureMap params = Factory.newFeatureMap(); params.put("sourceUrl", u); params.put("preserveOriginalContent", new Boolean(true)); params.put("collectRepositioningInfo", new Boolean(true)); Out.prln("Creating doc for " + u); Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params); corpus.add(doc); } // for each of args // tell the pipeline about the corpus and run it annie.setCorpus(corpus); annie.execute(); // for each document, get an XML document with the // person and location names added Iterator iter = corpus.iterator(); int count = 0; String startTagPart_1 = "<span GateID=\""; String startTagPart_2 = "\" title=\""; String startTagPart_3 = "\" style=\"background:Red;\">"; String endTag = "</span>"; while (iter.hasNext()) { Document doc = (Document) iter.next(); AnnotationSet defaultAnnotSet = doc.getAnnotations(); Set annotTypesRequired = new HashSet(); annotTypesRequired.add("Person"); annotTypesRequired.add("Location"); Set<Annotation> peopleAndPlaces = new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired)); FeatureMap features = doc.getFeatures(); String originalContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); RepositioningInfo info = 
(RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); ++count; File file = new File("StANNIE_" + count + ".HTML"); Out.prln("File name: '" + file.getAbsolutePath() + "'"); if (originalContent != null && info != null) { Out.prln("OrigContent and reposInfo existing. Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while (it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: " + peopleAndPlaces.size()); Out.prln("Sorted annotations count: " + sortedAnnotations.size()); for (int i = sortedAnnotations.size() - 1; i >= 0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionStart = info.getOriginalPos(insertPositionStart); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); insertPositionEnd = info.getOriginalPos(insertPositionEnd, true); if (insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int) insertPositionEnd, endTag); editableContent.insert((int) insertPositionStart, startTagPart_3); editableContent.insert((int) insertPositionStart, currAnnot.getType()); editableContent.insert((int) insertPositionStart, startTagPart_2); editableContent.insert((int) insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int) insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } // if - should generate else if (originalContent != null) { Out.prln("OrigContent existing. 
Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while (it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: " + peopleAndPlaces.size()); Out.prln("Sorted annotations count: " + sortedAnnotations.size()); for (int i = sortedAnnotations.size() - 1; i >= 0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); if (insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int) insertPositionEnd, endTag); editableContent.insert((int) insertPositionStart, startTagPart_3); editableContent.insert((int) insertPositionStart, currAnnot.getType()); editableContent.insert((int) insertPositionStart, startTagPart_2); editableContent.insert((int) insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int) insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } else { Out.prln("Content : " + originalContent); Out.prln("Repositioning: " + info); } String xmlDocument = doc.toXml(peopleAndPlaces, false); String fileName = new String("StANNIE_toXML_" + count + ".HTML"); FileWriter writer = new FileWriter(fileName); writer.write(xmlDocument); writer.close(); } // for each doc } // main
/**
 * Runs the loaded ANNIE pipeline over the corpus previously supplied via
 * {@code setCorpus}.
 *
 * @throws GateException if pipeline execution fails
 */
public void execute() throws GateException {
  Out.prln("Running ANNIE...");
  annieController.execute();
  Out.prln("...ANNIE complete");
} // execute()
public void deleteDocument(Object docID) throws PersistenceException { // remove document from datastore ds.delete("gate.corpora.DocumentImpl", docID); if (DEBUG) Out.prln("document deleted from datastore..."); }
public void deleteCorpus(Object corpusID) throws PersistenceException { // remove corpus from datastore ds.delete("gate.corpora.SerialCorpusImpl", corpusID); if (DEBUG) Out.prln("corpus deleted from datastore..."); }
/**
 * Sets the persistent ID of the document at the given index.
 *
 * <p>This method should only be used by the Serial Datastore.
 *
 * @param index position of the document whose ID is set; out-of-range indices are ignored
 * @param persID the new persistent ID
 */
public void setDocumentPersistentID(int index, Object persID) {
  if (index >= docDataList.size()) return;
  docDataList.get(index).setPersistentID(persID);
  if (DEBUG) Out.prln("IDs are now: " + docDataList);
}