public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null)
        reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {
        gatedocument = generateGATEDoc(inputDoc);

        // add it to the current corpus
        corpus.add(gatedocument);

        // get the application and assign the corpus to it
        this.GATEapplication.setCorpus(corpus);

        // process it with GATE
        this.GATEapplication.execute();

        // transfer the annotations from the GATE document
        // to the Behemoth one using the filters
        if (reporter != null)
            reporter.incrCounter("GATE", "Document", 1);

        return gatedocument.toXml();
    } catch (Exception e) {
        LOG.error(inputDoc.getUrl().toString(), e);
        if (reporter != null)
            reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
        // remove the document from the corpus again
        corpus.clear();
        // and from memory
        if (gatedocument != null)
            Factory.deleteResource(gatedocument);
    }
    return null;
}
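// A minimal sketch (not the original Behemoth code) of how the fields used by
// processNative() could be initialised: "initGATE", "gappFile", "corpus" and
// "GATEapplication" are assumed names, and generateGATEDoc() is expected to wrap the
// Behemoth text in a gate.Document. The saved application is loaded once and reused
// for every document processed.
public void initGATE(java.io.File gappFile) throws Exception {
    // GATE must be initialised before any other GATE API call
    Gate.init();
    // load the saved pipeline (.gapp / .xgapp) produced with GATE Developer
    this.GATEapplication = (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);
    // a single reusable corpus; processNative() adds and clears one document per call
    this.corpus = Factory.newCorpus("Behemoth corpus");
}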
public static void main(String[] args) throws Exception {
    String inputFile = inputDirectory + "inputBulgarianSpecial-1.xml";
    String gateString = FileUtils.readFileToString(new File(inputFile), "UTF-8");

    log.error("Start Pipeline");
    Pipeline pl = new Pipeline();
    pl.initGate();
    Document doc = readDocument(gateString);

    /*
    log.error("Add Sentence Detection and Tokenization");
    GateOperations go = new GateOperations();
    go.addExtraInfo(doc);
    writeFile(doc.toXml(), "inputEnglish-Proc-1.xml");
    */

    log.error("Add the Layout Annotation Sets");
    Layout layout = new Layout();
    layout.addLayout(doc);
    writeFile(doc.toXml(), "inputBulgarianSpecial-Final-1.xml");
}
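// Possible sketches of the readDocument() and writeFile() helpers called above; their
// bodies are not shown in this code, so the parameter choices here (mime type,
// encoding) are assumptions.
private static Document readDocument(String gateString) throws Exception {
    // build a markup-aware GATE document directly from the GATE XML string
    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, gateString);
    params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, "text/xml");
    return (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
}

private static void writeFile(String content, String fileName) throws IOException {
    // write the serialised document, UTF-8 encoded
    FileUtils.writeStringToFile(new File(fileName), content, "UTF-8");
}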
/**
 * The main entry point. First we parse the command line options (see the usage() method for
 * details), then we take all remaining command line parameters to be file names to process.
 * Each file is loaded, processed using the application and the results written to an output
 * file (inputFile.out.xml) in the output directory.
 */
public static void main(String[] args) throws Exception {
    parseCommandLine(args);

    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);

    // Create a Corpus to use. We recycle the same Corpus object for each
    // iteration. The string parameter to newCorpus() is simply the
    // GATE-internal name to use for the corpus. It has no particular
    // significance.
    ArrayList<String> files = getFilesFromDir(inputDir);
    gate.Corpus corpus = createCorpus(files);
    // Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    System.out.println("Processing " + files.size() + " files");

    // process the files one by one
    for (int i = 0; i < files.size(); i++) {
        // load the document (using the specified encoding if one was given)
        File docFile = new File(files.get(i));
        System.out.print("Processing document " + docFile + " (" + i + ") ...");
        Document doc = Factory.newDocument(docFile.toURL(), encoding);

        // put the document in the corpus
        corpus.add(doc);

        // run the application
        application.execute();

        // remove the document from the corpus again
        corpus.clear();

        String docXMLString = null;
        // if we want to just write out specific annotation types, we must
        // extract the annotations into a Set
        if (annotTypesToWrite != null) {
            // Create a temporary Set to hold the annotations we wish to write out
            Set annotationsToWrite = new HashSet();

            // in this example we only extract annotations from the "Output" AnnotationSet
            AnnotationSet defaultAnnots = doc.getAnnotations("Output");
            Iterator annotTypesIt = annotTypesToWrite.iterator();
            while (annotTypesIt.hasNext()) {
                // extract all the annotations of each requested type and add them to
                // the temporary set
                AnnotationSet annotsOfThisType = defaultAnnots.get((String) annotTypesIt.next());
                if (annotsOfThisType != null) {
                    annotationsToWrite.addAll(annotsOfThisType);
                }
            }

            // create the XML string using these annotations
            docXMLString = doc.toXml(annotationsToWrite, true);
        }
        // otherwise, just write out the whole document as GateXML
        else {
            docXMLString = doc.toXml();
        }

        // Release the document, as it is no longer needed
        Factory.deleteResource(doc);

        // output the XML to <outputDir>/<inputFile>.out.xml
        System.out.println("Writing file " + docFile.getName());
        String outputFileName = docFile.getName() + ".out.xml";
        // File outputFile = new File(docFile.getParentFile(), outputFileName);
        File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName);

        // Write output files using the same encoding as the original
        FileOutputStream fos = new FileOutputStream(outputFile);
        BufferedOutputStream bos = new BufferedOutputStream(fos);
        OutputStreamWriter out;
        if (encoding == null) {
            out = new OutputStreamWriter(bos);
        } else {
            out = new OutputStreamWriter(bos, encoding);
        }

        out.write(docXMLString);
        out.close();
        System.out.println("done");
    } // for each file

    System.out.println("All done");
} // void main(String[] args)
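// Hypothetical sketches of the getFilesFromDir() and createCorpus() helpers referenced
// above (their real bodies are not shown here). getFilesFromDir is assumed to return
// the paths of all regular files in the input directory, and createCorpus is assumed
// to be a thin wrapper around Factory.newCorpus, since the main loop creates and adds
// each document to the corpus itself.
private static ArrayList<String> getFilesFromDir(String dir) {
    ArrayList<String> paths = new ArrayList<String>();
    File[] entries = new File(dir).listFiles();
    if (entries != null) {
        for (File entry : entries) {
            if (entry.isFile()) {
                paths.add(entry.getAbsolutePath());
            }
        }
    }
    return paths;
}

private static gate.Corpus createCorpus(ArrayList<String> files) throws Exception {
    // the file list is not needed here because main() adds documents one by one
    return Factory.newCorpus("BatchProcessApp Corpus");
}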
/**
 * Run from the command-line, with a list of URLs as argument.
 *
 * <p><B>NOTE:</B><br>
 * This code will run with all the documents in memory - if you want to unload each from memory
 * after use, add code to store the corpus in a DataStore.
 */
public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
        URL u = new URL(args[i]);
        FeatureMap params = Factory.newFeatureMap();
        params.put("sourceUrl", u);
        params.put("preserveOriginalContent", new Boolean(true));
        params.put("collectRepositioningInfo", new Boolean(true));
        Out.prln("Creating doc for " + u);
        Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
        corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
        Document doc = (Document) iter.next();
        AnnotationSet defaultAnnotSet = doc.getAnnotations();
        Set annotTypesRequired = new HashSet();
        annotTypesRequired.add("Person");
        annotTypesRequired.add("Location");
        Set<Annotation> peopleAndPlaces =
            new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

        FeatureMap features = doc.getFeatures();
        String originalContent =
            (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
        RepositioningInfo info =
            (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

        ++count;
        File file = new File("StANNIE_" + count + ".HTML");
        Out.prln("File name: '" + file.getAbsolutePath() + "'");

        if (originalContent != null && info != null) {
            Out.prln("OrigContent and reposInfo existing. Generate file...");

            Iterator it = peopleAndPlaces.iterator();
            Annotation currAnnot;
            SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
            while (it.hasNext()) {
                currAnnot = (Annotation) it.next();
                sortedAnnotations.addSortedExclusive(currAnnot);
            } // while

            StringBuffer editableContent = new StringBuffer(originalContent);
            long insertPositionEnd;
            long insertPositionStart;

            // insert annotation tags backward
            Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
            Out.prln("Sorted annotations count: " + sortedAnnotations.size());
            for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
                currAnnot = (Annotation) sortedAnnotations.get(i);
                insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
                insertPositionStart = info.getOriginalPos(insertPositionStart);
                insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
                insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
                if (insertPositionEnd != -1 && insertPositionStart != -1) {
                    editableContent.insert((int) insertPositionEnd, endTag);
                    editableContent.insert((int) insertPositionStart, startTagPart_3);
                    editableContent.insert((int) insertPositionStart, currAnnot.getType());
                    editableContent.insert((int) insertPositionStart, startTagPart_2);
                    editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
                    editableContent.insert((int) insertPositionStart, startTagPart_1);
                } // if
            } // for

            FileWriter writer = new FileWriter(file);
            writer.write(editableContent.toString());
            writer.close();
        } // if - should generate
        else if (originalContent != null) {
            Out.prln("OrigContent existing. Generate file...");

            Iterator it = peopleAndPlaces.iterator();
            Annotation currAnnot;
            SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
            while (it.hasNext()) {
                currAnnot = (Annotation) it.next();
                sortedAnnotations.addSortedExclusive(currAnnot);
            } // while

            StringBuffer editableContent = new StringBuffer(originalContent);
            long insertPositionEnd;
            long insertPositionStart;

            // insert annotation tags backward
            Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
            Out.prln("Sorted annotations count: " + sortedAnnotations.size());
            for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
                currAnnot = (Annotation) sortedAnnotations.get(i);
                insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
                insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
                if (insertPositionEnd != -1 && insertPositionStart != -1) {
                    editableContent.insert((int) insertPositionEnd, endTag);
                    editableContent.insert((int) insertPositionStart, startTagPart_3);
                    editableContent.insert((int) insertPositionStart, currAnnot.getType());
                    editableContent.insert((int) insertPositionStart, startTagPart_2);
                    editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
                    editableContent.insert((int) insertPositionStart, startTagPart_1);
                } // if
            } // for

            FileWriter writer = new FileWriter(file);
            writer.write(editableContent.toString());
            writer.close();
        } else {
            Out.prln("Content : " + originalContent);
            Out.prln("Repositioning: " + info);
        }

        String xmlDocument = doc.toXml(peopleAndPlaces, false);
        String fileName = new String("StANNIE_toXML_" + count + ".HTML");
        FileWriter writer = new FileWriter(fileName);
        writer.write(xmlDocument);
        writer.close();
    } // for each doc
} // main
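// A sketch of the SortedAnnotationList helper used above (its definition is not shown
// here): judging from its usage it keeps annotations ordered by start offset and
// rejects any annotation that overlaps one already stored, so the tag-insertion loop
// can walk it back to front without producing crossing spans. Details are assumptions.
public static class SortedAnnotationList extends Vector<Annotation> {
    public boolean addSortedExclusive(Annotation annot) {
        // skip annotations that overlap something we already keep
        for (int i = 0; i < size(); ++i) {
            if (annot.overlaps(get(i))) {
                return false;
            }
        }
        long annotStart = annot.getStartNode().getOffset().longValue();
        // insert before the first element that starts later, keeping offset order
        for (int i = 0; i < size(); ++i) {
            long currStart = get(i).getStartNode().getOffset().longValue();
            if (annotStart < currStart) {
                insertElementAt(annot, i);
                return true;
            }
        }
        // starts after everything currently stored
        add(annot);
        return true;
    }
}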