/** Test the HashGazetteer. */
public void testHashGazetteer() throws Exception {
  // get a document
  Document doc = Factory.newDocument(
      new URL(TestDocument.getTestServerName() + "tests/doc0.html"));
  System.out.println(doc.getFeatures().get("gate.SourceURL"));

  // create a default gazetteer
  FeatureMap params = Factory.newFeatureMap();
  HashGazetteer gaz = (HashGazetteer) Factory.createResource(
      "com.ontotext.gate.gazetteer.HashGazetteer", params);

  // runtime stuff
  gaz.setDocument(doc);
  gaz.setAnnotationSetName(GAZ_AS);
  gaz.execute();

  assertTrue(
      "the annotation set resulting from the execution of the OntoText "
          + "Natural Gazetteer is empty.",
      !doc.getAnnotations(GAZ_AS).isEmpty());

  // check whether the annotations are as expected
  assertEquals("wrong number of lookup annotations found",
      76, doc.getAnnotations(GAZ_AS).size());
} // testHashGazetteer()
@Override
public Document get(int index) {
  if (index >= docDataList.size()) return null;

  Document res = documents.get(index);

  if (DEBUG)
    Out.prln("SerialCorpusImpl: get(): index " + index + " result: " + res);

  // if the document is null, then I must get it from the DS
  if (res == null) {
    FeatureMap parameters = Factory.newFeatureMap();
    parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
    try {
      parameters.put(DataStore.LR_ID_FEATURE_NAME,
          docDataList.get(index).getPersistentID());
      Document lr = (Document) Factory.createResource(
          docDataList.get(index).getClassType(), parameters);
      if (DEBUG) Out.prln("Loaded document: " + lr.getName());
      // change the result to the newly loaded doc
      res = lr;
      // finally replace the doc with the instantiated version
      documents.set(index, lr);
    } catch (ResourceInstantiationException ex) {
      Err.prln("Error reading document inside a serialised corpus.");
      throw new GateRuntimeException(ex);
    }
  }
  return res;
}
/**
 * Use a {@link SharedDefaultGazetteer} to duplicate this gazetteer by sharing
 * the internal FSM rather than re-loading the lists.
 */
@Override
public Resource duplicate(Factory.DuplicationContext ctx)
    throws ResourceInstantiationException {
  return Factory.createResource(
      SharedDefaultGazetteer.class.getName(),
      Utils.featureMap(
          SharedDefaultGazetteer.SDEF_GAZ_BOOTSTRAP_GAZETTEER_PROPERTY_NAME,
          this),
      Factory.duplicate(this.getFeatures(), ctx),
      this.getName());
}
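// Usage sketch (an assumption, not part of the original source): GATE routes
// resource copying through Factory.duplicate(Resource), which detects
// implementations of CustomDuplication and delegates to the duplicate(ctx)
// override above, so callers never invoke it directly. The method and variable
// names below are illustrative.
public static Gazetteer duplicateGazetteer(Gazetteer original)
    throws ResourceInstantiationException {
  // Factory.duplicate ends up calling the custom duplicate(ctx) above, so the
  // copy shares the original's FSM instead of re-parsing the gazetteer lists.
  return (Gazetteer) Factory.duplicate(original);
}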
/**
 * Load the configuration file and corpus for testing, and apply the same
 * settings as in the GATE GUI.
 */
void loadSettings(String configFileName, String corpusDirName,
    String inputasN, String outputasN) throws GateException, IOException {
  LogService.minVerbosityLevel = 0;
  // verbosity is forced to 0 above, so this debug line is effectively disabled
  if (LogService.minVerbosityLevel > 0)
    System.out.println("Learning Home : " + learningHome.getAbsolutePath());

  FeatureMap parameters = Factory.newFeatureMap();
  URL configFileURL = new File(configFileName).toURI().toURL();
  parameters.put("configFileURL", configFileURL);
  learningApi = (LearningAPIMain)
      Factory.createResource("gate.learning.LearningAPIMain", parameters);

  // Load the corpus
  corpus = Factory.newCorpus("DataSet");
  ExtensionFileFilter fileFilter = new ExtensionFileFilter();
  fileFilter.addExtension("xml");
  File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
  Arrays.sort(xmlFiles, new Comparator<File>() {
    public int compare(File a, File b) {
      return a.getName().compareTo(b.getName());
    }
  });
  for (File f : xmlFiles) {
    if (!f.isDirectory()) {
      Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
      doc.setName(f.getName());
      corpus.add(doc);
    }
  }
  // URL tempURL = new File(corpusDirName).toURI().toURL();
  // corpus.populate(tempURL, fileFilter, "UTF-8", false);

  // Set the input and output annotation set names
  learningApi.setInputASName(inputasN);
  learningApi.setOutputASName(outputasN);

  controller = (gate.creole.SerialAnalyserController)
      Factory.createResource("gate.creole.SerialAnalyserController");
  controller.setCorpus(corpus);
  controller.add(learningApi);
}
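// Usage sketch (an assumption, not part of the original source): once
// loadSettings has wired the corpus and the learning PR into the controller, a
// test typically just runs the pipeline. The file paths and annotation set
// names below are illustrative.
void runPipeline() throws GateException, IOException {
  loadSettings("learning-config.xml", "corpus-dir", "Key", "ML");
  controller.execute(); // runs LearningAPIMain over every document in the corpus
}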
private static Document readDocument(String gateDocumentString)
    throws ResourceInstantiationException {
  Document gateDocument = (Document) Factory.createResource(
      "gate.corpora.DocumentImpl",
      Utils.featureMap(
          "stringContent", gateDocumentString,
          "mimeType", "text/xml",
          "encoding", "UTF-8"));
  return gateDocument;
}
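// Usage sketch (an assumption, not part of the original source): documents
// created through the Factory are registered with GATE's creole register, so a
// caller should release them once processing is done. The sample content
// string and method name are illustrative.
static void processGateXml(String xml) throws ResourceInstantiationException {
  Document doc = readDocument(xml);
  try {
    // ... inspect doc.getAnnotations(), doc.getFeatures(), etc. ...
  } finally {
    Factory.deleteResource(doc); // unregister and free the document
  }
}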
private gate.Document generateGATEDocFromLocalDump(BehemothDocument inputDoc)
    throws ResourceInstantiationException, IOException {
  // can't get that to work
  // File tempDirectory = new File(this.config.get("hadoop.tmp.dir", "/tmp"),
  //     this.config.get("user.name", "./tmp"));
  // LOG.info("tempDirectory " + tempDirectory);
  //
  // tempDirectory.mkdirs();
  //
  // File tempInputFile = File.createTempFile("gateInput-",
  //     inputDoc.getUrl(), tempDirectory);
  //
  // FileOutputStream fos = new FileOutputStream(tempInputFile);
  // OutputStream bout = new BufferedOutputStream(fos);
  // bout.write(inputDoc.getContent());
  // bout.flush();
  // bout.close();
  //
  // URL url;
  // try {
  //   url = tempInputFile.toURI().toURL();
  // } catch (MalformedURLException e) {
  //   // delete the input doc
  //   tempInputFile.delete();
  //   throw e;
  // }

  FeatureMap params = Factory.newFeatureMap();
  params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
      new String(inputDoc.getContent()));
  String ct = inputDoc.getContentType();
  if (ct != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, ct);

  gate.Document gatedocument;
  try {
    gatedocument = (Document) Factory.createResource(
        "gate.corpora.DocumentImpl", params);
  } finally {
    // delete the input doc
    // tempInputFile.delete();
  }
  return gatedocument;
}
public static void main(String[] args) throws Exception {
  // Logger.getLogger(DocumentFeaturesDiff.class).setLevel(Level.ALL);
  GateUtils.initGateKeepLog();
  GateUtils.registerCzsemPlugin();

  ProcessingResource eval = new PRSetup.SinglePRSetup(LearningEvaluator.class)
      .putFeature("keyASName", ":-)")
      // .putFeature("responseASName", "lemma_flex")
      .putFeature("responseASName", "flex")
      .putFeature("keyAnnotationsAreInDocumentFeatures", true)
      .putFeatureList("annotationTypes", "Lookup")
      .putFeatureList("featureNames", "meshID")
      .createPR();

  SerialAnalyserController controller = (SerialAnalyserController)
      Factory.createResource(SerialAnalyserController.class.getCanonicalName());
  controller.add(eval);

  Corpus corpus = Factory.newCorpus(null);
  corpus.populate(
      new File("C:\\Users\\dedek\\Desktop\\bmc\\experiment\\analyzed").toURI().toURL(),
      // new File("C:\\Users\\dedek\\Desktop\\bmca_devel").toURI().toURL(),
      null, "utf8", false);
  System.err.println("populated");

  controller.setCorpus(corpus);
  controller.execute();
}
@Override
public Resource init() throws ResourceInstantiationException {
  gracefulExit = false;

  if (configFileURL == null) {
    gracefulExit = true;
    gate.util.Err.println("No configuration file provided!");
  }

  if (japeURL == null) {
    gracefulExit = true;
    gate.util.Err.println("No JAPE grammar file provided!");
  }

  // create the init params for the JAPE transducer
  FeatureMap params = Factory.newFeatureMap();
  params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, japeURL);

  // Code borrowed from Mark Greenwood's Measurements PR
  if (japeTransducer == null) {
    // if this is the first time we are running init then actually create a
    // new transducer as we don't already have one
    FeatureMap hidden = Factory.newFeatureMap();
    Gate.setHiddenAttribute(hidden, true);
    japeTransducer = (Transducer)
        Factory.createResource("gate.creole.Transducer", params, hidden);
  } else {
    // we are being run through a call to reInit so simply re-init the
    // underlying JAPE transducer
    japeTransducer.setParameterValues(params);
    japeTransducer.reInit();
  }

  ConfigReader config = new ConfigReader(configFileURL);
  gracefulExit = config.config();

  try {
    HashMap<String, String> options = config.getOptions();

    patternMap = new HashMap<String, Pattern>();
    addSuffixPattern("disease_suffix", options);
    addWordPattern("disease_abbrevs", options);
    addWordPattern("disease_sense", options);
    addWordExtraPattern("disease_sense_context", options);
    addPossessiveWordPattern("disease_named_syndrome", options);
    addWordExtraPattern("disease_generic_context", options);
    addWordExtraPattern("disease_anatomy_context", options);
    addSuffixPluralPattern("procedure_suffix", options);
    addWordPluralPattern("procedure_key", options);
    addWordExtraPattern("procedure_anatomy_context", options);
    addWordPluralPattern("symptom_key", options);
    addWordPattern("test_key", options);
    addSuffixPattern("anatomy_suffix_adjective", options);
    addSuffixPattern("anatomy_suffix", options);
    addPrefixPattern("anatomy_prefix", options);
    addWordPattern("anatomy_position", options);
    addWordPluralPattern("anatomy_space_region_junction", options);
    addWordPattern("anatomy_part_adjective", options);
    addWordPattern("anatomy_latin_noun", options);
    addWordPattern("anatomy_muscle", options);
    addWordPluralPattern("anatomy_part", options);
    addWordPluralPattern("anatomy_fluid", options);
  } catch (NullPointerException ne) {
    gracefulExit = true;
    gate.util.Err.println(
        "Missing or unset configuration options. Please check configuration file.");
  }

  return this;
} // end init()
/**
 * Run from the command-line, with a list of URLs as argument.
 *
 * <p><B>NOTE:</B><br>
 * This code will run with all the documents in memory - if you want to unload
 * each from memory after use, add code to store the corpus in a DataStore.
 */
public static void main(String args[]) throws GateException, IOException {
  // initialise the GATE library
  Out.prln("Initialising GATE...");
  Gate.init();
  Out.prln("...GATE initialised");

  // initialise ANNIE (this may take several minutes)
  StandAloneAnnie annie = new StandAloneAnnie();
  annie.initAnnie();

  // create a GATE corpus and add a document for each command-line argument
  Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
  for (int i = 0; i < args.length; i++) {
    URL u = new URL(args[i]);
    FeatureMap params = Factory.newFeatureMap();
    params.put("sourceUrl", u);
    params.put("preserveOriginalContent", Boolean.TRUE);
    params.put("collectRepositioningInfo", Boolean.TRUE);
    Out.prln("Creating doc for " + u);
    Document doc = (Document)
        Factory.createResource("gate.corpora.DocumentImpl", params);
    corpus.add(doc);
  } // for each of args

  // tell the pipeline about the corpus and run it
  annie.setCorpus(corpus);
  annie.execute();

  // for each document, get an XML document with the person and location names added
  Iterator<Document> iter = corpus.iterator();
  int count = 0;
  String startTagPart_1 = "<span GateID=\"";
  String startTagPart_2 = "\" title=\"";
  String startTagPart_3 = "\" style=\"background:Red;\">";
  String endTag = "</span>";

  while (iter.hasNext()) {
    Document doc = iter.next();
    AnnotationSet defaultAnnotSet = doc.getAnnotations();
    Set<String> annotTypesRequired = new HashSet<String>();
    annotTypesRequired.add("Person");
    annotTypesRequired.add("Location");
    Set<Annotation> peopleAndPlaces =
        new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

    FeatureMap features = doc.getFeatures();
    String originalContent = (String)
        features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    RepositioningInfo info = (RepositioningInfo)
        features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

    ++count;
    File file = new File("StANNIE_" + count + ".HTML");
    Out.prln("File name: '" + file.getAbsolutePath() + "'");

    if (originalContent != null && info != null) {
      Out.prln("OrigContent and reposInfo existing. Generate file...");

      Iterator<Annotation> it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
      while (it.hasNext()) {
        currAnnot = it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while

      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;
      // insert annotation tags backward
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionStart = info.getOriginalPos(insertPositionStart);
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for

      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } // if - should generate
    else if (originalContent != null) {
      Out.prln("OrigContent existing. Generate file...");

      Iterator<Annotation> it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
      while (it.hasNext()) {
        currAnnot = it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while

      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;
      // insert annotation tags backward
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for

      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } else {
      Out.prln("Content : " + originalContent);
      Out.prln("Repositioning: " + info);
    }

    String xmlDocument = doc.toXml(peopleAndPlaces, false);
    String fileName = "StANNIE_toXML_" + count + ".HTML";
    FileWriter writer = new FileWriter(fileName);
    writer.write(xmlDocument);
    writer.close();
  } // for each doc
} // main