/** Returns a list of annotations to be added to the Behemoth document from the GATE one * */ private List<com.digitalpebble.behemoth.Annotation> convertGATEAnnotationsToBehemoth( AnnotationSet GATEAnnotionSet, com.digitalpebble.behemoth.BehemothDocument behemoth) { List<com.digitalpebble.behemoth.Annotation> beheannotations = new ArrayList<com.digitalpebble.behemoth.Annotation>(); AnnotationSet resultAS = GATEAnnotionSet.get(filters.getTypes()); // sort the GATE annotations List<gate.Annotation> annotationList = new ArrayList<gate.Annotation>(resultAS); Collections.sort(annotationList, new OffsetComparator()); Iterator<gate.Annotation> inputASIter = annotationList.iterator(); while (inputASIter.hasNext()) { gate.Annotation source = inputASIter.next(); com.digitalpebble.behemoth.Annotation target = new com.digitalpebble.behemoth.Annotation(); target.setType(source.getType()); target.setStart(source.getStartNode().getOffset().longValue()); target.setEnd(source.getEndNode().getOffset().longValue()); // now do the features // is the type listed? Set<String> expectedfeatnames = filters.getFeatureFilters().get(source.getType()); if (expectedfeatnames != null) { Iterator featurenames = source.getFeatures().keySet().iterator(); while (featurenames.hasNext()) { // cast the feature name to a string which will be right in // 99% of cases String featurename = featurenames.next().toString(); // if this feature name is not wanted just ignore it if (expectedfeatnames.contains(featurename) == false) continue; // we know that we want to keep this feature // let's see what the best way of representing the value // would be // TODO later => find a better way of mapping when not a // string Object originalvalue = source.getFeatures().get(featurename); if (originalvalue == null) originalvalue = "null"; target.getFeatures().put(featurename, originalvalue.toString()); } } beheannotations.add(target); } return beheannotations; }
public void setConf(Configuration conf) { config = conf; if (applicationDescriptorPath == null) throw new RuntimeException("GATE application path is null"); // create one instance of the GATE application // need to avoid concurrent access to the application try { if (inited == false) { File gateHome = new File(applicationDescriptorPath.getFile()).getParentFile(); LOG.info("Setting GATE_HOME as " + gateHome); File pluginsHome = new File(gateHome, "plugins"); // the config files are in the job archive - not in the GATE // application // zip // File siteConfigFile = new File(conf // .getResource("site-gate.xml").getFile()); // File userConfig = new File(conf.getResource("user-gate.xml") // .getFile()); Gate.runInSandbox(true); Gate.setGateHome(gateHome); Gate.setPluginsHome(pluginsHome); // Gate.setSiteConfigFile(siteConfigFile); // Gate.setUserConfigFile(userConfig); // the builtInCreoleDir files // are stored in the same place as the config ones // Gate.setBuiltinCreoleDir(conf.getResource("creole.xml")); Gate.init(); inited = true; } corpus = Factory.newCorpus("DummyCorpus"); this.GATEapplication = (CorpusController) PersistenceManager.loadObjectFromUrl(applicationDescriptorPath); // load the annotation and feature filters from the configuration this.filters = GATEAnnotationFilters.getFilters(config); } catch (Exception e) { LOG.error("Encountered error while initialising GATE", e); throw new RuntimeException(e); } }
// Process an input document with GATE and a Reporter public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) { if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString()); boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false); // process the text passed as value with the application // a) create a GATE document based on the text value gate.Document gatedocument = null; try { gatedocument = generateGATEDoc(inputDoc); // add it to the current corpus corpus.add(gatedocument); // get the application and assign the corpus to it this.GATEapplication.setCorpus(corpus); // process it with GATE this.GATEapplication.execute(); AnnotationSet annots = null; if ("".equals(filters.getAnnotationSetName())) annots = gatedocument.getAnnotations(); else annots = gatedocument.getAnnotations(filters.getAnnotationSetName()); // enrich the input doc with the annotations from // the GATE application // transfer the annotations from the GATE document // to the Behemoth one using the filters List<com.digitalpebble.behemoth.Annotation> beheannotations = convertGATEAnnotationsToBehemoth(annots, inputDoc); // sort the annotations before adding them? Collections.sort(beheannotations); // clear the existing behemoth annotations if (clearBehemothAnnotations) { inputDoc.getAnnotations().clear(); } inputDoc.getAnnotations().addAll(beheannotations); // add counters about num of annotations added if (reporter != null) for (com.digitalpebble.behemoth.Annotation annot : beheannotations) { reporter.incrCounter("GATE", annot.getType(), 1); } // Add the document features from GATE to Behemoth Set<String> docFeatFilter = this.filters.getDocFeaturesFilter(); MapWritable beheMD = inputDoc.getMetadata(true); if (docFeatFilter.size() > 0) { for (String docFeatName : docFeatFilter) { Object featValue = gatedocument.getFeatures().get(docFeatName); if (featValue != null) { beheMD.put(new Text(docFeatName), new Text(featValue.toString())); } } } if (reporter != null) reporter.incrCounter("GATE", "Document", 1); } catch (Exception e) { LOG.error(inputDoc.getUrl().toString(), e); if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1); } finally { // remove the document from the corpus again corpus.clear(); // and from memory if (gatedocument != null) Factory.deleteResource(gatedocument); } // currently returns only the input document return new BehemothDocument[] {inputDoc}; }