/**
 * Returns the document at the given index, lazily loading it from the
 * datastore the first time it is requested.
 *
 * @param index position of the document in this corpus
 * @return the (possibly freshly loaded) document, or null if index is past
 *     the end of the known document list
 * @throws GateRuntimeException if the document cannot be instantiated from
 *     the datastore
 */
@Override
public Document get(int index) {
  if (index >= docDataList.size()) return null;
  Document res = documents.get(index);
  if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);
  // if the document is null, then I must get it from the DS
  if (res == null) {
    FeatureMap parameters = Factory.newFeatureMap();
    parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
    try {
      // the persistent ID plus the stored class type are enough for the
      // Factory to re-create the document from the datastore
      parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
      Document lr =
          (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
      if (DEBUG) Out.prln("Loaded document :" + lr.getName());
      // change the result to the newly loaded doc
      res = lr;
      // finally replace the doc with the instantiated version
      documents.set(index, lr);
    } catch (ResourceInstantiationException ex) {
      Err.prln("Error reading document inside a serialised corpus.");
      throw new GateRuntimeException(ex);
    }
  }
  return res;
}
/**
 * Runs the controller over the given text and collects the values of the
 * "sym" feature from every "StockSymbols" annotation found.
 *
 * @param str the document content to process
 * @return the set of stock symbols found (possibly empty)
 * @throws Exception if corpus/document creation or controller execution fails
 */
@SuppressWarnings("unchecked")
public Set<String> processDoc(String str) throws Exception {
  Set<String> toReturn = new HashSet<String>();
  Corpus c = null;
  Document aDoc = null;
  // FIX: removed the pointless `catch (Exception e) { throw e; }` -
  // the finally block alone gives the same cleanup guarantee
  try {
    c = Factory.newCorpus("sample");
    aDoc = Factory.newDocument(str);
    c.add(aDoc);
    controller.setCorpus(c);
    controller.execute();
    AnnotationSet aSet = aDoc.getAnnotations("StockSymbols");
    for (Annotation annot : aSet) {
      String symbol = (String) annot.getFeatures().get("sym");
      toReturn.add(symbol);
    }
  } finally {
    // always release the GATE resources, even when execution fails
    if (aDoc != null) {
      Factory.deleteResource(aDoc);
    }
    if (c != null) {
      Factory.deleteResource(c);
    }
  }
  return toReturn;
}
/** Test the default tokeniser */ public void testHashGazetteer() throws Exception { // get a document Document doc = Factory.newDocument(new URL(TestDocument.getTestServerName() + "tests/doc0.html")); System.out.println(doc.getFeatures().get("gate.SourceURL")); // create a default gazetteer FeatureMap params = Factory.newFeatureMap(); HashGazetteer gaz = (HashGazetteer) Factory.createResource("com.ontotext.gate.gazetteer.HashGazetteer", params); // runtime stuff gaz.setDocument(doc); gaz.setAnnotationSetName(GAZ_AS); gaz.execute(); assertTrue( "the Annotation set resulting of the execution of the OntoText " + "Natural Gazetteer is empty.", !doc.getAnnotations(GAZ_AS).isEmpty()); // check whether the annotations are as expected assertEquals("wrong number of lookup annotations found", 76, doc.getAnnotations(GAZ_AS).size()); } // testHashGazetteer();
/**
 * Duplicates this gazetteer by creating a {@link SharedDefaultGazetteer} that
 * reuses the already-built internal FSM instead of re-loading the lists.
 *
 * @param ctx the current duplication context
 * @return the shared-FSM duplicate of this gazetteer
 * @throws ResourceInstantiationException if the duplicate cannot be created
 */
@Override
public Resource duplicate(Factory.DuplicationContext ctx) throws ResourceInstantiationException {
  // bootstrap the new gazetteer with a reference to this one so it can share the FSM
  FeatureMap initParams =
      Utils.featureMap(SharedDefaultGazetteer.SDEF_GAZ_BOOTSTRAP_GAZETTEER_PROPERTY_NAME, this);
  FeatureMap duplicatedFeatures = Factory.duplicate(this.getFeatures(), ctx);
  return Factory.createResource(
      SharedDefaultGazetteer.class.getName(), initParams, duplicatedFeatures, this.getName());
}
/** Clear up the resources used after one test. */
private void clearOneTest() {
  // empty the corpus first so no documents remain loaded when it is deleted
  corpus.clear();
  Factory.deleteResource(corpus);
  Factory.deleteResource(learningApi);
  // detach the PR from the controller before disposing of the controller itself
  controller.remove(learningApi);
  controller.cleanup();
  Factory.deleteResource(controller);
}
public JSONObject persian_sentiment(String text) throws Exception { oncreate(); File PersianGapp = new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp"); // initialise GATE - this must be done before calling any GATE APIs Gate.init(); // load the saved application CorpusController application = (CorpusController) PersistenceManager.loadObjectFromFile(PersianGapp); // Create a Corpus to use. We recycle the same Corpus object for each // iteration. The string parameter to newCorpus() is simply the // GATE-internal name to use for the corpus. It has no particular // significance. Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus"); application.setCorpus(corpus); // process the files one by one // load the document (using the specified encoding if one was given) Document doc = Factory.newDocument(text); // put the document in the corpus corpus.add(doc); // run the application application.execute(); String featureName = "Doc_sentiment"; FeatureMap features = doc.getFeatures(); // remove the document from the corpus again corpus.clear(); // doc.getFeatures(). // Release the document, as it is no longer needed Factory.deleteResource(doc); LinkedHashMap originalContent = (LinkedHashMap) features.get(featureName); String obj = (String) originalContent.get("sentiment"); // BigDecimal pos =(BigDecimal) originalContent.get("positive"); // BigDecimal neg =(BigDecimal) originalContent.get("negative"); // System.out.println(obj); // create Json for response to user JSONObject obj1 = new JSONObject(); obj1.put("sentiment", obj); /*obj1.put("positive",pos); //obj1.put("negative",neg); System.out.print("----------"); System.out.print(obj1); System.out.print("----------");*/ // application.cleanup(); return obj1; }
// generate annotations for ngrams over a larger span e.g all couples inside
// a span of 5 tokens
// this allows to match more variants e.g. with adjectives in the middle
// we do not generate intermediate annotations here
// do with only bigrams for the moment
private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
    throws ExecutionException {
  List<List> boxes = generateBoxes(list, outputAS);
  try {
    for (int b = 0; b < boxes.size(); b++) {
      List<String> tempAnnotationsStartingHere = new ArrayList<String>();
      // loStart/hiEnd track the span covered by the generated annotations;
      // NOTE(review): loStart is overwritten by every head annotation, so the
      // last head's start offset wins - confirm this is intended
      Long loStart = null;
      Long hiEnd = null;
      // create a temporary list containing all the annotations
      // at position 0
      List<Annotation> headannots = boxes.get(b);
      for (Annotation newAnn : headannots) {
        // remembering positions
        loStart = newAnn.getStartNode().getOffset();
        if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
        else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
          hiEnd = newAnn.getEndNode().getOffset();
        String string = (String) newAnn.getFeatures().get(inputAnnotationFeature);
        tempAnnotationsStartingHere.add(string);
        if (this.generateIntermediateAnnotations) {
          FeatureMap fm = Factory.newFeatureMap();
          fm.put(this.outputAnnotationFeature, string);
          outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
        }
      }
      // pair every head annotation with every annotation in the next
      // window-1 boxes (bigrams over a window)
      for (int z = 1; z < window && (b + z < boxes.size()); z++) {
        // generate all possible bi-grams
        List<Annotation> current = boxes.get(b + z);
        for (Annotation newAnn : current) {
          // remembering positions
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();
          String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          // take what is in the buffer
          // and make a new annotation out of that
          for (String s : tempAnnotationsStartingHere) {
            String combination = s + getNgramSeparator() + newString;
            // create an annotation for the combination
            FeatureMap fm = Factory.newFeatureMap();
            fm.put(this.outputAnnotationFeature, combination);
            outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
          }
        }
      }
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
/** * Generation of a GATE document from a Behemoth one * * @param key URL of the input doc * @param inputDoc * @return * @throws ResourceInstantiationException * @throws InvalidOffsetException * @throws IOException */ public gate.Document generateGATEDoc(BehemothDocument inputDoc) throws ResourceInstantiationException, InvalidOffsetException, IOException { gate.Document gatedocument = null; // if no text is available (e.g. Tika has not extracted it) // let GATE do the parsing itself from the binary content if (inputDoc.getText() == null) { try { gatedocument = generateGATEDocFromLocalDump(inputDoc); // transfer the text from GATE to Behemoth String textContent = gatedocument.getContent().toString(); inputDoc.setText(textContent); return gatedocument; } catch (Exception e) { LOG.error("Can't generate GATE doc from byte dump", e); } } // if the input document does not have any text -> create a doc with an // empty text String text = inputDoc.getText(); if (inputDoc.getText() == null) text = ""; else text = inputDoc.getText(); gatedocument = Factory.newDocument(text); // then the metadata as document features FeatureMap docFeatures = gatedocument.getFeatures(); String docUrl = inputDoc.getUrl(); if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl); if (inputDoc.getMetadata() != null) { Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator(); while (iter.hasNext()) { Entry<Writable, Writable> entry = iter.next(); String skey = entry.getKey().toString().trim(); String svalue = null; if (entry.getValue() != null) svalue = entry.getValue().toString().trim(); docFeatures.put(skey, svalue); } } // finally the annotations as original markups // TODO change the name of the annotation set via config AnnotationSet outputAS = gatedocument.getAnnotations("Original markups"); for (Annotation annot : inputDoc.getAnnotations()) { // add to outputAS as a GATE annotation FeatureMap features = Factory.newFeatureMap(); 
features.putAll(annot.getFeatures()); outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features); } return gatedocument; }
private void generateNGrams(List<Annotation> list, AnnotationSet outputAS) throws ExecutionException { List<List> boxes = generateBoxes(list, outputAS); try { // now do the actual n-grams for (int b = 0; b < boxes.size(); b++) { List<String> tempAnnotationsStartingHere = new ArrayList<String>(); Long loStart = null; Long hiEnd = null; for (int z = 0; z < this.ngram.intValue() && (b + z < boxes.size()); z++) { // do the combination and dump what we've done at every step // e.g generate 1 grams as well as 2-grams List<Annotation> current = boxes.get(b + z); List<String> temptemp = new ArrayList<String>(); for (Annotation newAnn : current) { // remembering positions if (loStart == null) loStart = newAnn.getStartNode().getOffset(); if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset(); else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue()) hiEnd = newAnn.getEndNode().getOffset(); String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature); // TODO : what if there is no such value???? if (tempAnnotationsStartingHere.size() == 0) { // create an annotation for the current annotation if (this.generateIntermediateAnnotations) { FeatureMap fm = Factory.newFeatureMap(); fm.put(this.outputAnnotationFeature, newString); outputAS.add(loStart, hiEnd, outputAnnotationType, fm); } // add it to the temp temptemp.add(newString); } else for (String existing : tempAnnotationsStartingHere) { String combination = existing + getNgramSeparator() + newString; temptemp.add(combination); if (this.generateIntermediateAnnotations | z == this.ngram.intValue() - 1) { // create an annotation for the combination FeatureMap fm = Factory.newFeatureMap(); fm.put(this.outputAnnotationFeature, combination); outputAS.add(loStart, hiEnd, outputAnnotationType, fm); } } } tempAnnotationsStartingHere = temptemp; } } } catch (Exception e) { throw new ExecutionException(e); } }
/**
 * Call the given closure passing this resource as a parameter, and ensuring that the resource is
 * deleted when the closure returns. This would typically be used in this kind of construction:
 *
 * <pre>
 * Factory.newDocument(someUrl).withResource {
 *   // do something with the document (it)
 * }
 * </pre>
 *
 * @param self the resource handed to the closure and deleted afterwards
 * @param closure the closure to invoke
 * @return the value returned from the closure
 */
public static <T> T withResource(Resource self, Closure<T> closure) {
  try {
    T result = closure.call(self);
    return result;
  } finally {
    // delete the resource no matter how the closure exits
    Factory.deleteResource(self);
  }
}
/**
 * Copies the given annotations into the target annotation set, renaming each
 * type according to the configured mappings when one applies.
 *
 * @param toTransfer the annotations to copy
 * @param to the destination annotation set
 * @param newID if true the target set assigns fresh IDs; otherwise the
 *     original annotation IDs are kept
 * @throws ExecutionException if an annotation's offsets are invalid for the target
 */
private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
    throws ExecutionException {
  for (Annotation source : toTransfer) {
    // work out the (possibly renamed) output type for this annotation
    Mapping mapping = mappings.get(source.getType());
    String targetType = source.getType();
    if (mapping != null && mapping.newName != null) {
      targetType = mapping.newName;
    }
    FeatureMap copiedFeatures = Factory.newFeatureMap();
    copiedFeatures.putAll(source.getFeatures());
    Long start = source.getStartNode().getOffset();
    Long end = source.getEndNode().getOffset();
    try {
      if (newID) {
        to.add(start, end, targetType, copiedFeatures);
      } else {
        to.add(source.getId(), start, end, targetType, copiedFeatures);
      }
    } catch (InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
  }
}
/** Annotation remove event */
public void annotationRemoved(AnnotationSetEvent ase) {
  // ignore events we triggered ourselves or that belong to another document
  if (!disableListener && ase.getSourceDocument() == this) {
    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    boolean defaultAS = as.getName() == null;
    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);
      // find out the details which refer to the deleted annotation
      OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
      if (od == null) continue;
      // mirror the removal in the member document's corresponding set
      if (defaultAS) {
        aDoc.getAnnotations().remove(od.getOriginalAnnotation());
      } else {
        aDoc.getAnnotations(as.getName()).remove(od.getOriginalAnnotation());
      }
      removeOffsetDetails(docID, od);
      // the annotation maps to exactly one member document - stop searching
      break;
    }
  }
}
/**
 * Builds a transient corpus from the given file paths, skipping (and logging)
 * any file that cannot be loaded as a GATE document.
 *
 * @param files paths of the documents to load
 * @return the populated corpus
 * @throws GateException if the corpus itself cannot be created
 */
private static gate.Corpus createCorpus(ArrayList<String> files) throws GateException {
  gate.Corpus corpus = Factory.newCorpus("Transient Gate Corpus");
  for (String file : files) {
    System.out.print("\t " + file);
    try {
      // FIX: File.toURL() is deprecated and does not escape special
      // characters; go through toURI() first
      corpus.add(Factory.newDocument(new File(file).toURI().toURL()));
      System.out.println(" -- success");
    } catch (gate.creole.ResourceInstantiationException e) {
      System.out.println(" -- failed (" + e.getMessage() + ")");
    } catch (Exception e) {
      System.out.println(" -- " + e.getMessage());
    }
  }
  return corpus;
}
public void tokenize() { AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization"); AnnotationSet defaultAs = gateDocument.getAnnotations(""); for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) { Annotation currentTokenAnnotation = it.next(); FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures(); FeatureMap curFeaturesMap = Factory.newFeatureMap(); if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) { curFeaturesMap.put("string", tokenFeaturesMap.get("string")); curFeaturesMap.put("root", tokenFeaturesMap.get("lemma")); curFeaturesMap.put("category", tokenFeaturesMap.get("POS")); // Add the new Token to the Annotation Set defaultAs.add( currentTokenAnnotation.getStartNode(), currentTokenAnnotation.getEndNode(), currentTokenAnnotation.getType(), curFeaturesMap); } } gateDocument.removeAnnotationSet("Tokenization"); }
/**
 * You should never create instances of this class directly, you should create new relations via
 * the appropriate methods of {@link RelationSet}. This method is only publicly available to
 * support persistence.
 *
 * @param id the persistent ID of the relation
 * @param type the relation type name
 * @param members the IDs of the annotations taking part in this relation
 */
public SimpleRelation(int id, String type, int[] members) {
  super();
  this.id = id;
  this.type = type;
  this.members = members;
  // a relation starts with an empty feature map
  features = Factory.newFeatureMap();
}
/**
 * Processes a Behemoth document with the GATE application and returns the
 * resulting GATE document serialised as XML, or null if processing failed.
 * Synchronized because the single corpus/application pair is shared state.
 *
 * @param inputDoc the Behemoth document to process
 * @param reporter optional progress reporter (may be null)
 * @return the processed document as GATE XML, or null on failure
 */
public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
  if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
  // process the text passed as value with the application
  // a) create a GATE document based on the text value
  gate.Document gatedocument = null;
  try {
    gatedocument = generateGATEDoc(inputDoc);
    // add it to the current corpus
    corpus.add(gatedocument);
    // get the application and assign the corpus to it
    this.GATEapplication.setCorpus(corpus);
    // process it with GATE
    this.GATEapplication.execute();
    // transfer the annotations from the GATE document
    // to the Behemoth one using the filters
    if (reporter != null) reporter.incrCounter("GATE", "Document", 1);
    return gatedocument.toXml();
  } catch (Exception e) {
    // failure is logged and counted; the method then falls through to null
    LOG.error(inputDoc.getUrl().toString(), e);
    if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
  } finally {
    // remove the document from the corpus again
    corpus.clear();
    // and from memory
    if (gatedocument != null) Factory.deleteResource(gatedocument);
  }
  return null;
}
/**
 * Evaluates the configured query over the token/dependency annotations of the
 * input set and, for every named node of every match, adds an annotation
 * spanning the matching token to the output set.
 *
 * @throws ExecutionException propagated from initBeforeExecute()
 */
@Override
public void execute() throws ExecutionException {
  initBeforeExecute();
  AnnotationSet tokensAndDependenciesAS = inputAS;
  // index the annotations that carry an "args" feature (the dependencies)
  TreeIndex index =
      new GateAwareTreeIndex(
          tokensAndDependenciesAS.get(null, Utils.setFromArray(new String[] {"args"})));
  QueryData data =
      new QueryData(index, new GateAnnotationsNodeAttributes(tokensAndDependenciesAS));
  Iterable<QueryMatch> results = queryObject.evaluate(data);
  // 1-based ordinal of each match, used to build a unique queryMatchId
  int queryMatchOrd = 0;
  for (QueryMatch result : results) {
    queryMatchOrd++;
    for (NodeMatch match : result.getMatchingNodes()) {
      String name = match.getQueryNode().getName();
      // only named query nodes produce output annotations
      if (name != null) {
        Annotation matchingAnnot = tokensAndDependenciesAS.get(match.getNodeId());
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("matchingNodeId", match.getNodeId());
        fm.put(
            "queryMatchId",
            String.format("%s_%03d", buildQueryStringHash(getQueryString()), queryMatchOrd));
        outputAS.add(matchingAnnot.getStartNode(), matchingAnnot.getEndNode(), name, fm);
      }
    }
  }
}
/**
 * Runs the external tagger over the document's Token annotations (in document
 * order) and adds one annotation per returned tag to the output set.
 *
 * @throws ExecutionException if the tagger fails
 */
public void execute() throws ExecutionException {
  AnnotationSet outputAS = document.getAnnotations(outputASName);
  List<Annotation> tokens =
      new ArrayList<Annotation>(
          document.getAnnotations(inputASName).get(ANNIEConstants.TOKEN_ANNOTATION_TYPE));
  Collections.sort(tokens, new OffsetComparator());
  // collect the surface string of each token, in document order
  String[] strings = new String[tokens.size()];
  int idx = 0;
  for (Annotation token : tokens) {
    strings[idx++] = (String) token.getFeatures().get("string");
  }
  try {
    TagList tags = tagger.tag(strings);
    for (Iterator<Tag> it = tags.iterator(); it.hasNext(); ) {
      Tag tag = it.next();
      // map the tagger's token indices back to document offsets
      Long start = tokens.get(tag.getTokenStartIndex()).getStartNode().getOffset();
      Long end = tokens.get(tag.getTokenEndIndex()).getEndNode().getOffset();
      outputAS.add(start, end, tag.getTagname(), Factory.newFeatureMap());
    }
  } catch (Exception ioe) {
    throw new ExecutionException("Tagger Failed", ioe);
  }
}
/**
 * Reacts to a datastore being closed: if it is the datastore this corpus came
 * from, detach the listener and delete the corpus.
 *
 * @param e the CREOLE event identifying the closed datastore
 */
@Override
public void datastoreClosed(CreoleEvent e) {
  // only react to events from our own datastore
  if (!e.getDatastore().equals(this.getDataStore())) return;
  // stop listening before disposing of this corpus
  if (this.getDataStore() != null) this.getDataStore().removeDatastoreListener(this);
  // close this corpus, since it cannot stay open when the DS it comes
  // from
  // is closed
  Factory.deleteResource(this);
}
/**
 * Builds a GATE document directly from the binary content of the Behemoth
 * document, letting GATE's own format parsers do the text extraction.
 *
 * @param inputDoc the Behemoth document whose raw bytes are parsed
 * @return the newly created GATE document
 * @throws ResourceInstantiationException if GATE cannot create the document
 * @throws IOException declared for callers; the current body does not throw it
 */
private gate.Document generateGATEDocFromLocalDump(BehemothDocument inputDoc)
    throws ResourceInstantiationException, IOException {
  // can't get that to work
  // File tempDirectory = new
  // File(this.config.get("hadoop.tmp.dir","/tmp"),this.config.get("user.name",
  // "./tmp"));
  // LOG.info("tempDirectory "+tempDirectory);
  //
  // tempDirectory.mkdirs();
  //
  // File tempInputFile = File.createTempFile("gateInput-",
  // inputDoc.getUrl(),tempDirectory);
  //
  // FileOutputStream fos = new FileOutputStream(tempInputFile);
  // OutputStream bout = new BufferedOutputStream(fos);
  // bout.write(inputDoc.getContent());
  // bout.flush();
  // bout.close();
  //
  // URL url;
  // try {
  // url = tempInputFile.toURI().toURL();
  // } catch (MalformedURLException e) {
  // // delete the input doc
  // tempInputFile.delete();
  // throw e;
  // }
  FeatureMap params = Factory.newFeatureMap();
  // NOTE(review): new String(byte[]) uses the platform default charset -
  // consider an explicit charset; verify what encodings are expected here
  params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, new String(inputDoc.getContent()));
  String ct = inputDoc.getContentType();
  if (ct != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, ct);
  gate.Document gatedocument;
  try {
    gatedocument = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
  } finally {
    // delete the input doc
    // tempInputFile.delete();
  }
  return gatedocument;
}
/**
 * Call the closure once for each document in this corpus, loading and unloading documents as
 * appropriate in the case of a persistent corpus.
 *
 * @param self the corpus to traverse
 * @param closure the closure to call
 * @return the corpus.
 */
public static <T> Object each(Corpus self, Closure<T> closure) {
  for (int index = 0; index < self.size(); index++) {
    boolean alreadyLoaded = self.isDocumentLoaded(index);
    Document document = self.get(index);
    closure.call(document);
    // if we triggered the load ourselves, put the corpus back as we found it
    if (!alreadyLoaded) {
      self.unloadDocument(document);
      Factory.deleteResource(document);
    }
  }
  return self;
}
/**
 * Call the closure once for each document in this corpus, loading and unloading documents as
 * appropriate in the case of a persistent corpus, and adding the return values of each call to
 * the given collection.
 *
 * @param self the corpus to traverse
 * @param coll the collection the closure results are added to
 * @param closure the closure to call
 * @return a list of the return values from each closure call.
 */
public static <T> Collection<T> collect(Corpus self, Collection<T> coll, Closure<T> closure) {
  for (int index = 0; index < self.size(); index++) {
    boolean alreadyLoaded = self.isDocumentLoaded(index);
    Document document = self.get(index);
    coll.add(closure.call(document));
    // if we triggered the load ourselves, put the corpus back as we found it
    if (!alreadyLoaded) {
      self.unloadDocument(document);
      Factory.deleteResource(document);
    }
  }
  return coll;
}
/**
 * Loads the configuration file and corpus for testing, and applies the same
 * settings as in the GATE GUI.
 *
 * @param configFileName path to the learning configuration file
 * @param corpusDirName directory containing the .xml corpus documents
 * @param inputasN name of the input annotation set
 * @param outputasN name of the output annotation set
 * @throws GateException if a resource cannot be created
 * @throws IOException if a file or URL cannot be read
 */
void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
    throws GateException, IOException {
  LogService.minVerbosityLevel = 0;
  if (LogService.minVerbosityLevel > 0)
    System.out.println("Learning Home : " + learningHome.getAbsolutePath());
  FeatureMap parameters = Factory.newFeatureMap();
  URL configFileURL = new File(configFileName).toURI().toURL();
  parameters.put("configFileURL", configFileURL);
  learningApi =
      (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);
  // Load the corpus
  corpus = Factory.newCorpus("DataSet");
  ExtensionFileFilter fileFilter = new ExtensionFileFilter();
  fileFilter.addExtension("xml");
  File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
  // sort by file name so the corpus order is deterministic across platforms
  Arrays.sort(
      xmlFiles,
      new Comparator<File>() {
        public int compare(File a, File b) {
          return a.getName().compareTo(b.getName());
        }
      });
  for (File f : xmlFiles) {
    if (!f.isDirectory()) {
      Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
      doc.setName(f.getName());
      corpus.add(doc);
    }
  }
  // URL tempURL = new File(corpusDirName).toURI().toURL();
  // corpus.populate(tempURL, fileFilter, "UTF-8", false);
  // Set the inputAS
  learningApi.setInputASName(inputasN);
  learningApi.setOutputASName(outputasN);
  controller =
      (gate.creole.SerialAnalyserController)
          Factory.createResource("gate.creole.SerialAnalyserController");
  controller.setCorpus(corpus);
  controller.add(learningApi);
}
/** Called by a datastore when a resource has been deleted */
@Override
public void resourceDeleted(DatastoreEvent evt) {
  DataStore ds = (DataStore) evt.getSource();
  // 1. check whether this datastore fired the event. If not, return.
  if (!ds.equals(this.dataStore)) return;
  Object docID = evt.getResourceID();
  if (docID == null) return;
  if (DEBUG) Out.prln("Resource deleted called for: " + docID);
  // first check if it is this corpus that's been deleted, it must be
  // unloaded immediately
  if (docID.equals(this.getLRPersistenceId())) {
    Factory.deleteResource(this);
    return;
  } // if
  // tracks whether anything was removed, so we only sync when needed
  boolean isDirty = false;
  // the problem here is that I only have the doc persistent ID
  // and nothing else, so I need to determine the index of the doc
  // first
  for (int i = 0; i < docDataList.size(); i++) {
    DocumentData docData = docDataList.get(i);
    // we've found the correct document
    // don't break the loop, because it might appear more than once
    if (docID.equals(docData.getPersistentID())) {
      if (evt.getResource() == null) {
        // instead of calling remove() which tries to load the
        // document
        // remove it from the documents and docDataList
        documentRemoved(docDataList.get(i).persistentID.toString());
        docDataList.remove(i);
        documents.remove(i);
        isDirty = true;
        // step back one slot: the lists shifted left after removal
        i--;
        continue;
      }
      remove(i);
      isDirty = true;
    } // if
  } // for loop through the doc data
  if (isDirty)
    try {
      // persist the updated corpus state back to the datastore
      this.dataStore.sync(this);
    } catch (PersistenceException ex) {
      throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
    } catch (SecurityException sex) {
      throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
    }
} // resourceDeleted
/**
 * Loads the serialised controller from disk, attaches the IE processing
 * resource and a fresh corpus.
 *
 * <p>NOTE(review): the .gapp path is hard-coded to a developer machine -
 * consider making it configurable.
 *
 * @throws PersistenceException if the saved application cannot be restored
 * @throws ResourceInstantiationException if the corpus cannot be created
 * @throws IOException if the application file cannot be read
 */
public static void initController()
    throws PersistenceException, ResourceInstantiationException, IOException {
  // start from a clean GATE resource pool
  GateUtils.deleteAllPublicGateResources();
  controller =
      (ConditionalSerialAnalyserController)
          PersistenceManager.loadObjectFromFile(
              new File("C:/Users/dedek/Desktop/DATLOWE/gate_apps/all.gapp"));
  controller.add(ie.getPR());
  corpus = Factory.newCorpus("SpcCorp");
  controller.setCorpus(corpus);
}
/**
 * Parses the given GATE XML string into a Document.
 *
 * @param gateDocumentString the document content as GATE XML
 * @return the newly created document
 * @throws ResourceInstantiationException if the document cannot be created
 */
private static Document readDocument(String gateDocumentString)
    throws ResourceInstantiationException {
  // build the init-params up front, then create the document in one call
  FeatureMap params =
      Utils.featureMap(
          "stringContent", gateDocumentString,
          "mimeType", "text/xml",
          "encoding", "UTF-8");
  return (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
}
/**
 * Entry point: runs a LearningEvaluator PR over an analysed corpus, comparing
 * "Lookup" annotations by their "meshID" feature against key annotations
 * stored in document features.
 *
 * <p>NOTE(review): the corpus path is hard-coded to a developer machine.
 *
 * @param args unused
 * @throws Exception on any initialisation or execution failure
 */
public static void main(String[] args) throws Exception {
  // Logger.getLogger(DocumentFeaturesDiff.class).setLevel(Level.ALL);
  GateUtils.initGateKeepLog();
  GateUtils.registerCzsemPlugin();
  // configure the evaluator PR
  ProcessingResource eval =
      new PRSetup.SinglePRSetup(LearningEvaluator.class)
          .putFeature("keyASName", ":-)")
          // .putFeature("responseASName", "lemma_flex")
          .putFeature("responseASName", "flex")
          .putFeature("keyAnnotationsAreInDocumentFeatures", true)
          .putFeatureList("annotationTypes", "Lookup")
          .putFeatureList("featureNames", "meshID")
          .createPR();
  SerialAnalyserController controller =
      (SerialAnalyserController)
          Factory.createResource(SerialAnalyserController.class.getCanonicalName());
  controller.add(eval);
  // load the pre-analysed documents into a corpus
  Corpus corpus = Factory.newCorpus(null);
  corpus.populate(
      new File("C:\\Users\\dedek\\Desktop\\bmc\\experiment\\analyzed").toURI().toURL(),
      // new File("C:\\Users\\dedek\\Desktop\\bmca_devel").toURI().toURL(),
      null,
      "utf8",
      false);
  System.err.println("populated");
  controller.setCorpus(corpus);
  controller.execute();
}
/** * This method is called when the HTML parser encounts the beginning of a tag that means that the * tag is paired by an end tag and it's not an empty one. */ @Override public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { // Fire the status listener if the elements processed exceded the rate if (0 == (++elements % ELEMENTS_RATE)) fireStatusChangedEvent("Processed elements : " + elements); // Start of STYLE tag if (HTML.Tag.STYLE.equals(t)) { isInsideStyleTag = true; } // if // Construct a feature map from the attributes list FeatureMap fm = Factory.newFeatureMap(); // Take all the attributes an put them into the feature map if (0 != a.getAttributeCount()) { Enumeration<?> enumeration = a.getAttributeNames(); while (enumeration.hasMoreElements()) { Object attribute = enumeration.nextElement(); fm.put(attribute.toString(), (a.getAttribute(attribute)).toString()); } // while } // if // Just analize the tag t and add some\n chars and spaces to the // tmpDocContent.The reason behind is that we need to have a readable form // for the final document. customizeAppearanceOfDocumentWithStartTag(t); // If until here the "tmpDocContent" ends with a NON whitespace char, // then we add a space char before calculating the START index of this // tag. // This is done in order not to concatenate the content of two separate tags // and obtain a different NEW word. int tmpDocContentSize = tmpDocContent.length(); if (tmpDocContentSize != 0 && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) tmpDocContent.append(" "); // create the start index of the annotation Long startIndex = new Long(tmpDocContent.length()); // initialy the start index is equal with the End index CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex); // put it into the stack stack.push(obj); } // handleStartTag
/** Annotation added event */
public void annotationAdded(AnnotationSetEvent ase) {
  // ignore events we triggered ourselves or that belong to another document
  if (!disableListener && ase.getSourceDocument() == this) {
    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    annot.addAnnotationListener(this);
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    boolean defaultAS = as.getName() == null;
    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);
      // map the compound-document offsets back into the member document;
      // -1 means the annotation does not fall inside this member
      long stOffset = getOffsetInSrcDocument(docID, annot.getStartNode().getOffset().longValue());
      if (stOffset == -1) continue;
      long enOffset = getOffsetInSrcDocument(docID, annot.getEndNode().getOffset().longValue());
      if (enOffset == -1) continue;
      Annotation originalAnnot = null;
      try {
        // mirror the annotation (with the same ID) in the member document
        Integer id = annot.getId();
        if (defaultAS) {
          aDoc.getAnnotations()
              .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
          originalAnnot = aDoc.getAnnotations().get(id);
        } else {
          aDoc.getAnnotations(as.getName())
              .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
          originalAnnot = aDoc.getAnnotations(as.getName()).get(id);
        }
      } catch (InvalidOffsetException ioe) {
        System.out.println(aDoc.getName() + "=" + stOffset + "=" + enOffset);
        throw new GateRuntimeException(ioe);
      }
      // record the offset mapping so later edits/removals can be mirrored
      OffsetDetails od = new OffsetDetails();
      od.setOldStartOffset(stOffset);
      od.setOldEndOffset(enOffset);
      od.setNewStartOffset(annot.getStartNode().getOffset().longValue());
      od.setNewEndOffset(annot.getEndNode().getOffset().longValue());
      od.setOriginalAnnotation(originalAnnot);
      od.setNewAnnotation(annot);
      addNewOffsetDetails(docID, od);
      // the annotation falls inside exactly one member document - stop here
      break;
    }
  }
}
/**
 * Stores the Hadoop configuration and lazily initialises GATE (once per JVM),
 * then loads the corpus, the GATE application and the annotation filters.
 *
 * @param conf the Hadoop job configuration
 * @throws RuntimeException if the application path is null or GATE fails to initialise
 */
public void setConf(Configuration conf) {
  config = conf;
  if (applicationDescriptorPath == null)
    throw new RuntimeException("GATE application path is null");
  // create one instance of the GATE application
  // need to avoid concurrent access to the application
  try {
    // GATE itself is only initialised on the first call ('inited' flag)
    if (inited == false) {
      File gateHome = new File(applicationDescriptorPath.getFile()).getParentFile();
      LOG.info("Setting GATE_HOME as " + gateHome);
      File pluginsHome = new File(gateHome, "plugins");
      // the config files are in the job archive - not in the GATE
      // application
      // zip
      // File siteConfigFile = new File(conf
      // .getResource("site-gate.xml").getFile());
      // File userConfig = new File(conf.getResource("user-gate.xml")
      // .getFile());
      Gate.runInSandbox(true);
      Gate.setGateHome(gateHome);
      Gate.setPluginsHome(pluginsHome);
      // Gate.setSiteConfigFile(siteConfigFile);
      // Gate.setUserConfigFile(userConfig);
      // the builtInCreoleDir files
      // are stored in the same place as the config ones
      // Gate.setBuiltinCreoleDir(conf.getResource("creole.xml"));
      Gate.init();
      inited = true;
    }
    corpus = Factory.newCorpus("DummyCorpus");
    this.GATEapplication =
        (CorpusController) PersistenceManager.loadObjectFromUrl(applicationDescriptorPath);
    // load the annotation and feature filters from the configuration
    this.filters = GATEAnnotationFilters.getFilters(config);
  } catch (Exception e) {
    LOG.error("Encountered error while initialising GATE", e);
    throw new RuntimeException(e);
  }
}