/**
 * Runs the configured GATE controller over the given text and collects the stock symbols it
 * annotated.
 *
 * <p>A temporary corpus and document are created for the call and are released again whether or
 * not processing succeeds.
 *
 * @param str the text handed to {@code Factory.newDocument}
 * @return the value of the {@code "sym"} feature of every annotation in the {@code
 *     "StockSymbols"} annotation set; an annotation without that feature contributes {@code
 *     null}
 * @throws Exception if a GATE resource cannot be created or the controller fails
 */
@SuppressWarnings("unchecked")
public Set<String> processDoc(String str) throws Exception {
  Set<String> toReturn = new HashSet<String>();
  Corpus c = null;
  Document aDoc = null;
  try {
    c = Factory.newCorpus("sample");
    aDoc = Factory.newDocument(str);
    c.add(aDoc);
    controller.setCorpus(c);
    controller.execute();
    AnnotationSet aSet = aDoc.getAnnotations("StockSymbols");
    for (Annotation annot : aSet) {
      toReturn.add((String) annot.getFeatures().get("sym"));
    }
  } finally {
    // Exceptions simply propagate; the nested finally guarantees that the corpus is released
    // even when releasing the document fails.
    try {
      if (aDoc != null) {
        Factory.deleteResource(aDoc);
      }
    } finally {
      if (c != null) {
        Factory.deleteResource(c);
      }
    }
  }
  return toReturn;
}
/**
 * Clear up the resources used after one test.
 *
 * <p>Empties and deletes the corpus, deletes the learning PR and detaches it from the
 * controller, then cleans up and deletes the controller itself.
 */
private void clearOneTest() {
  // Empty the corpus before releasing it.
  corpus.clear();
  Factory.deleteResource(corpus);
  // NOTE(review): the learning PR is deleted before it is removed from the controller;
  // confirm this ordering is intentional.
  Factory.deleteResource(learningApi);
  controller.remove(learningApi);
  controller.cleanup();
  Factory.deleteResource(controller);
}
/**
 * Invokes the closure with this resource as its argument and deletes the resource once the
 * closure has finished, whether it returned normally or threw. Typical use:
 *
 * <pre>
 * Factory.newDocument(someUrl).withResource {
 *   // do something with the document (it)
 * }
 * </pre>
 *
 * @param self the resource passed to the closure and deleted afterwards
 * @param closure the code to run against the resource
 * @return the value returned from the closure
 */
public static <T> T withResource(Resource self, Closure<T> closure) {
  T outcome;
  try {
    outcome = closure.call(self);
  } finally {
    Factory.deleteResource(self);
  }
  return outcome;
}
public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) { if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString()); // process the text passed as value with the application // a) create a GATE document based on the text value gate.Document gatedocument = null; try { gatedocument = generateGATEDoc(inputDoc); // add it to the current corpus corpus.add(gatedocument); // get the application and assign the corpus to it this.GATEapplication.setCorpus(corpus); // process it with GATE this.GATEapplication.execute(); // transfer the annotations from the GATE document // to the Behemoth one using the filters if (reporter != null) reporter.incrCounter("GATE", "Document", 1); return gatedocument.toXml(); } catch (Exception e) { LOG.error(inputDoc.getUrl().toString(), e); if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1); } finally { // remove the document from the corpus again corpus.clear(); // and from memory if (gatedocument != null) Factory.deleteResource(gatedocument); } return null; }
public JSONObject persian_sentiment(String text) throws Exception { oncreate(); File PersianGapp = new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp"); // initialise GATE - this must be done before calling any GATE APIs Gate.init(); // load the saved application CorpusController application = (CorpusController) PersistenceManager.loadObjectFromFile(PersianGapp); // Create a Corpus to use. We recycle the same Corpus object for each // iteration. The string parameter to newCorpus() is simply the // GATE-internal name to use for the corpus. It has no particular // significance. Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus"); application.setCorpus(corpus); // process the files one by one // load the document (using the specified encoding if one was given) Document doc = Factory.newDocument(text); // put the document in the corpus corpus.add(doc); // run the application application.execute(); String featureName = "Doc_sentiment"; FeatureMap features = doc.getFeatures(); // remove the document from the corpus again corpus.clear(); // doc.getFeatures(). // Release the document, as it is no longer needed Factory.deleteResource(doc); LinkedHashMap originalContent = (LinkedHashMap) features.get(featureName); String obj = (String) originalContent.get("sentiment"); // BigDecimal pos =(BigDecimal) originalContent.get("positive"); // BigDecimal neg =(BigDecimal) originalContent.get("negative"); // System.out.println(obj); // create Json for response to user JSONObject obj1 = new JSONObject(); obj1.put("sentiment", obj); /*obj1.put("positive",pos); //obj1.put("negative",neg); System.out.print("----------"); System.out.print(obj1); System.out.print("----------");*/ // application.cleanup(); return obj1; }
@Override public void datastoreClosed(CreoleEvent e) { if (!e.getDatastore().equals(this.getDataStore())) return; if (this.getDataStore() != null) this.getDataStore().removeDatastoreListener(this); // close this corpus, since it cannot stay open when the DS it comes // from // is closed Factory.deleteResource(this); }
/**
 * Call the closure once for each document in this corpus, loading and unloading documents as
 * appropriate in the case of a persistent corpus, and adding the return value of each call to
 * the given collection.
 *
 * <p>A document that was not loaded before its turn is unloaded and deleted again afterwards,
 * even when the closure throws.
 *
 * @param self the corpus to traverse
 * @param coll the collection that receives the closure's return values
 * @param closure the closure to call
 * @return {@code coll}, with one value added per document
 */
public static <T> Collection<T> collect(Corpus self, Collection<T> coll, Closure<T> closure) {
  for (int i = 0; i < self.size(); i++) {
    boolean docWasLoaded = self.isDocumentLoaded(i);
    Document doc = self.get(i);
    try {
      coll.add(closure.call(doc));
    } finally {
      // Only release documents this method caused to be loaded.
      if (!docWasLoaded) {
        self.unloadDocument(doc);
        Factory.deleteResource(doc);
      }
    }
  }
  return coll;
}
/**
 * Call the closure once for each document in this corpus, loading and unloading documents as
 * appropriate in the case of a persistent corpus.
 *
 * <p>A document that was not loaded before its turn is unloaded and deleted again afterwards,
 * even when the closure throws.
 *
 * @param self the corpus to traverse
 * @param closure the closure to call
 * @return the corpus.
 */
public static <T> Object each(Corpus self, Closure<T> closure) {
  for (int i = 0; i < self.size(); i++) {
    boolean docWasLoaded = self.isDocumentLoaded(i);
    Document doc = self.get(i);
    try {
      closure.call(doc);
    } finally {
      // Only release documents this method caused to be loaded.
      if (!docWasLoaded) {
        self.unloadDocument(doc);
        Factory.deleteResource(doc);
      }
    }
  }
  return self;
}
/** Called by a datastore when a resource has been deleted */ @Override public void resourceDeleted(DatastoreEvent evt) { DataStore ds = (DataStore) evt.getSource(); // 1. check whether this datastore fired the event. If not, return. if (!ds.equals(this.dataStore)) return; Object docID = evt.getResourceID(); if (docID == null) return; if (DEBUG) Out.prln("Resource deleted called for: " + docID); // first check if it is this corpus that's been deleted, it must be // unloaded immediately if (docID.equals(this.getLRPersistenceId())) { Factory.deleteResource(this); return; } // if boolean isDirty = false; // the problem here is that I only have the doc persistent ID // and nothing else, so I need to determine the index of the doc // first for (int i = 0; i < docDataList.size(); i++) { DocumentData docData = docDataList.get(i); // we've found the correct document // don't break the loop, because it might appear more than once if (docID.equals(docData.getPersistentID())) { if (evt.getResource() == null) { // instead of calling remove() which tries to load the // document // remove it from the documents and docDataList documentRemoved(docDataList.get(i).persistentID.toString()); docDataList.remove(i); documents.remove(i); isDirty = true; i--; continue; } remove(i); isDirty = true; } // if } // for loop through the doc data if (isDirty) try { this.dataStore.sync(this); } catch (PersistenceException ex) { throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage()); } catch (SecurityException sex) { throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage()); } } // resourceDeleted
public void actionPerformed(ActionEvent e) { int[] rowsTable = docTable.getSelectedRows(); int[] rowsCorpus = new int[rowsTable.length]; for (int i = 0; i < rowsTable.length; i++) rowsCorpus[i] = docTable.rowViewToModel(rowsTable[i]); Arrays.sort(rowsCorpus); // starting from the largest one, move each element down for (int i = rowsCorpus.length - 1; i >= 0; i--) { if (rowsCorpus[i] < corpus.size() - 1) { // swap the doc with the one before // serial corpus does not load the document on remove, so we need // to load the document explicitly boolean wasLoaded = corpus.isDocumentLoaded(rowsCorpus[i]); Document doc = (Document) corpus.get(rowsCorpus[i]); corpus.remove(rowsCorpus[i]); rowsCorpus[i]++; corpus.add(rowsCorpus[i], doc); if (!wasLoaded) { corpus.unloadDocument(doc); Factory.deleteResource(doc); } } } // restore selection // the remove / add events will cause the table to be updated // we need to only restore the selection after that happened final int[] selectedRowsCorpus = new int[rowsCorpus.length]; System.arraycopy(rowsCorpus, 0, selectedRowsCorpus, 0, rowsCorpus.length); SwingUtilities.invokeLater( new Runnable() { public void run() { docTable.clearSelection(); for (int i = 0; i < selectedRowsCorpus.length; i++) { int rowTable = docTable.rowModelToView(selectedRowsCorpus[i]); docTable.getSelectionModel().addSelectionInterval(rowTable, rowTable); } } }); }
/** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { /* Parse command line arguments */ Getopt g = new Getopt("gateExtractor", args, "i:r:te"); g.setOpterr(false); String inputPath = ""; String outputPath = ""; boolean train = false; boolean eval = false; boolean run = false; int c; String arg; while ((c = g.getopt()) != -1) { switch (c) { case 'i': arg = g.getOptarg(); if (arg == null || arg.isEmpty()) { usage("Please provide an input path"); } inputPath = arg; break; case 'r': run = true; arg = g.getOptarg(); if (arg == null || arg.isEmpty()) { usage("Please provide an output path"); } outputPath = arg; break; case 't': train = true; break; case 'e': eval = true; break; case '?': default: usage(null); } } if (args.length == 0 || (!run && !train && !eval)) { usage("Nothing to do."); } if (inputPath == null || inputPath.isEmpty()) { usage("Please provide an input path"); } if (run && (outputPath == null || outputPath.isEmpty())) { usage("Please provide an output directory!"); } if (train && eval) { usage("Only one mode allowed at a time"); } if (train && run) { usage("Only one mode allowed at a time"); } if (eval && run) { usage("Only one mode allowed at a time"); } /* Initialize GATE */ String location = new File(Main.class.getProtectionDomain().getCodeSource().getLocation().getPath()) .getParent(); String resourcesFolder = location + "/resources"; Gate.setGateHome(new File(resourcesFolder)); /* Create ml-config.xml with threads */ createConfig(resourcesFolder + File.separator); Gate.init(); /* Load Corpus */ log.info("Loading Corpus ... 
"); Corpus corpus = Factory.newCorpus("Training Corpus"); File directory = new File(inputPath); URL url = directory.toURI().toURL(); corpus.populate(url, null, null, true); log.info("Done loading Corpus!"); Pipeline pipeline = null; /* Do Tagging */ pipeline = new Tagger(); pipeline.run(corpus, resourcesFolder); /* Train */ if (train) { pipeline = new Trainer(); pipeline.run(corpus, resourcesFolder); } /* Apply learned rules */ if (run) { pipeline = new Extractor(); pipeline.run(corpus, resourcesFolder); ExecutorService executorService = Executors.newFixedThreadPool(20); for (int i = 0; i < corpus.size(); i++) { executorService.execute(new OutputGenerator(outputPath, corpus.get(i))); } executorService.shutdown(); executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS); } /* Evaluate results */ if (eval) { pipeline = new Evaluator(); pipeline.run(corpus, resourcesFolder); } /* Clean up */ Factory.deleteResource(corpus); outputFile_mlConfigThreads.delete(); }
/**
 * Releases the JAPE transducer held by this resource, if there is one.
 */
@Override
public void cleanup() {
  // The transducer may never have been created; there is nothing to release in that case.
  if (japeTransducer != null) {
    Factory.deleteResource(japeTransducer);
  }
}
/** * The main entry point. First we parse the command line options (see usage() method for details), * then we take all remaining command line parameters to be file names to process. Each file is * loaded, processed using the application and the results written to the output file * (inputFile.out.xml). */ public static void main(String[] args) throws Exception { parseCommandLine(args); // initialise GATE - this must be done before calling any GATE APIs Gate.init(); // load the saved application CorpusController application = (CorpusController) PersistenceManager.loadObjectFromFile(gappFile); // Create a Corpus to use. We recycle the same Corpus object for each // iteration. The string parameter to newCorpus() is simply the // GATE-internal name to use for the corpus. It has no particular // significance. ArrayList<String> files = getFilesFromDir(inputDir); gate.Corpus corpus = createCorpus(files); // Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus"); application.setCorpus(corpus); System.out.println("Processing " + files.size() + " files"); // process the files one by one for (int i = 0; i < files.size(); i++) { // load the document (using the specified encoding if one was given) File docFile = new File(files.get(i)); System.out.print("Processing document " + docFile + " (" + i + ") ..."); Document doc = Factory.newDocument(docFile.toURL(), encoding); // put the document in the corpus corpus.add(doc); // run the application application.execute(); // remove the document from the corpus again corpus.clear(); String docXMLString = null; // if we want to just write out specific annotation types, we must // extract the annotations into a Set if (annotTypesToWrite != null) { // Create a temporary Set to hold the annotations we wish to write out Set annotationsToWrite = new HashSet(); // we only extract annotations from the default (unnamed) AnnotationSet // in this example AnnotationSet defaultAnnots = doc.getAnnotations("Output"); Iterator annotTypesIt = 
annotTypesToWrite.iterator(); while (annotTypesIt.hasNext()) { // extract all the annotations of each requested type and add them to // the temporary set AnnotationSet annotsOfThisType = defaultAnnots.get((String) annotTypesIt.next()); if (annotsOfThisType != null) { annotationsToWrite.addAll(annotsOfThisType); } } // create the XML string using these annotations docXMLString = doc.toXml(annotationsToWrite, true); } // otherwise, just write out the whole document as GateXML else { docXMLString = doc.toXml(); } // Release the document, as it is no longer needed Factory.deleteResource(doc); // output the XML to <inputFile>.out.xml System.out.println("Writing file " + docFile.getName()); String outputFileName = docFile.getName() + ".out.xml"; // File outputFile = new File(docFile.getParentFile(), outputFileName); File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName); // Write output files using the same encoding as the original FileOutputStream fos = new FileOutputStream(outputFile); BufferedOutputStream bos = new BufferedOutputStream(fos); OutputStreamWriter out; if (encoding == null) { out = new OutputStreamWriter(bos); } else { out = new OutputStreamWriter(bos, encoding); } out.write(docXMLString); out.close(); System.out.println("done"); } // for each file System.out.println("All done"); } // void main(String[] args)
/** Deletes the GATE application held by this object, if one is set. */
public void close() {
  if (GATEapplication == null) {
    return;
  }
  Factory.deleteResource(GATEapplication);
}
// Process an input document with GATE and a Reporter public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) { if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString()); boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false); // process the text passed as value with the application // a) create a GATE document based on the text value gate.Document gatedocument = null; try { gatedocument = generateGATEDoc(inputDoc); // add it to the current corpus corpus.add(gatedocument); // get the application and assign the corpus to it this.GATEapplication.setCorpus(corpus); // process it with GATE this.GATEapplication.execute(); AnnotationSet annots = null; if ("".equals(filters.getAnnotationSetName())) annots = gatedocument.getAnnotations(); else annots = gatedocument.getAnnotations(filters.getAnnotationSetName()); // enrich the input doc with the annotations from // the GATE application // transfer the annotations from the GATE document // to the Behemoth one using the filters List<com.digitalpebble.behemoth.Annotation> beheannotations = convertGATEAnnotationsToBehemoth(annots, inputDoc); // sort the annotations before adding them? 
Collections.sort(beheannotations); // clear the existing behemoth annotations if (clearBehemothAnnotations) { inputDoc.getAnnotations().clear(); } inputDoc.getAnnotations().addAll(beheannotations); // add counters about num of annotations added if (reporter != null) for (com.digitalpebble.behemoth.Annotation annot : beheannotations) { reporter.incrCounter("GATE", annot.getType(), 1); } // Add the document features from GATE to Behemoth Set<String> docFeatFilter = this.filters.getDocFeaturesFilter(); MapWritable beheMD = inputDoc.getMetadata(true); if (docFeatFilter.size() > 0) { for (String docFeatName : docFeatFilter) { Object featValue = gatedocument.getFeatures().get(docFeatName); if (featValue != null) { beheMD.put(new Text(docFeatName), new Text(featValue.toString())); } } } if (reporter != null) reporter.incrCounter("GATE", "Document", 1); } catch (Exception e) { LOG.error(inputDoc.getUrl().toString(), e); if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1); } finally { // remove the document from the corpus again corpus.clear(); // and from memory if (gatedocument != null) Factory.deleteResource(gatedocument); } // currently returns only the input document return new BehemothDocument[] {inputDoc}; }
private RetObj ProcessRecords() throws Exception { // Create a Corpus to use. We recycle the same Corpus object for each // iteration. Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus"); this.application.setCorpus(corpus); // object for returned data List<String> processedlines = new ArrayList<String>(); List<String> processedText = new ArrayList<String>(); for (int record_num = 0; record_num < this.recs.size(); ++record_num) { /*if( record_num % Math.ceil(((double) this.recs.size())/10.0) == 0) System.out.println("Thread " + this.threadID + ": "+ ((int) ((double)record_num)/((double) this.recs.size())*100.0 ) +"% complete."); */ // first, split title from body and get embedded age in title.. String title_age = "-1"; String sep = "..THIS IS MY SEPARATION STRING.."; String title = ""; String body = this.recs.get(record_num); Boolean trimmed = false; int age_end = body.indexOf(",> "); if (age_end >= 0 && age_end < body.length()) { int age_start = body.lastIndexOf("-", age_end); if (age_start >= 0 && age_start < age_end) { title_age = body.substring(age_start + 1, age_end).trim(); if (!isInteger(title_age)) title_age = "-1"; else { title = body.substring(0, age_start); body = body.substring(age_end + 2, body.length()); body = title + sep + body; trimmed = true; } } if (!trimmed) { title = body.substring(0, age_end); body = body.substring(age_end + 2, body.length()); body = title + sep + body; trimmed = true; } } // -------------------- org.jsoup.nodes.Document htmldoc = Jsoup.parseBodyFragment(body.replaceAll("COMMA_GOES_HERE", ",")); Elements links = htmldoc.select("a[href]"); Elements media = htmldoc.select("[src]"); Elements imports = htmldoc.select("link[href]"); processedText.add(htmldoc.text().replace(sep, " ")); Document doc = Factory.newDocument(htmldoc.text()); // put the document in the corpus corpus.add(doc); // run the application this.application.execute(); // remove the document from the corpus again corpus.clear(); // extract annotations String 
line = ""; AnnotationSet Annots = doc.getAnnotations(""); Integer FirstPersonCount = 0, ThirdPersonCount = 0; AnnotationSet FirstPerson = Annots.get("FirstPerson"); if (FirstPerson != null) FirstPersonCount = FirstPerson.size(); AnnotationSet ThirdPerson = Annots.get("ThirdPerson"); if (ThirdPerson != null) ThirdPersonCount = ThirdPerson.size(); line += FirstPersonCount.toString() + "," + ThirdPersonCount.toString() + ","; AnnotationSet Names = Annots.get("Name"); if (Names == null || Names.size() < 1) line += ","; else { Iterator<Annotation> Iter = Names.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("name"); if (Feat != null) line += Feat.toString(); if (Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet Age = Annots.get("Age"); if (Age == null || Age.size() < 1) line += title_age + ","; else { Iterator<Annotation> Iter = Age.inDocumentOrder().iterator(); line += title_age + ";"; while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("age"); if (Feat != null) line += Feat.toString(); if (Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet Cost = Annots.get("Cost"); if (Cost == null || Cost.size() < 1) line += ","; else { Iterator<Annotation> Iter = Cost.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("value"); if (Feat != null) line += Feat.toString(); else line += "none"; line += "/"; Feat = Ann.getFeatures().get("target_value"); if (Feat != null) line += Feat.toString(); else line += "none"; line += "/"; Feat = Ann.getFeatures().get("target_type"); if (Feat != null) line += Feat.toString(); else line += "none"; if (Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet height = Annots.get("height"); if (height == null || height.size() < 1) line += ",,"; else { String ft = ""; String inch = ""; Iterator<Annotation> Iter = height.inDocumentOrder().iterator(); 
while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("feet"); if (Feat != null) ft += Feat.toString(); else ft += "none"; Feat = Ann.getFeatures().get("inches"); if (Feat != null) inch += Feat.toString(); else inch += "none"; if (Iter.hasNext()) { ft += ";"; inch += ";"; } } line += ft + "," + inch + ","; } AnnotationSet weight = Annots.get("weight"); if (weight == null || weight.size() < 1) line += ","; else { Iterator<Annotation> Iter = weight.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("pounds"); if (Feat != null) line += Feat.toString(); if (Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet measurement = Annots.get("measurement"); if (measurement == null || measurement.size() < 1) line += ",,,,"; else { String cup = ""; String chest = ""; String waist = ""; String hip = ""; Iterator<Annotation> Iter = measurement.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("cup"); if (Feat != null) cup += Feat.toString(); else cup += "none"; Feat = Ann.getFeatures().get("chest"); if (Feat != null) chest += Feat.toString(); else chest += "none"; Feat = Ann.getFeatures().get("waist"); if (Feat != null) waist += Feat.toString(); else waist += "none"; Feat = Ann.getFeatures().get("hip"); if (Feat != null) hip += Feat.toString(); else hip += "none"; if (Iter.hasNext()) { cup += ";"; chest += ";"; waist += ";"; hip += ";"; } } line += cup + "," + chest + "," + waist + "," + hip + ","; } AnnotationSet Ethnicity = Annots.get("Ethnicity"); if (Ethnicity == null || Ethnicity.size() < 1) line += ","; else { Iterator<Annotation> Iter = Ethnicity.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("ethnicity"); if (Feat != null) line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " "); if 
(Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet SkinColor = Annots.get("SkinColor"); if (SkinColor == null || SkinColor.size() < 1) line += ","; else { Iterator<Annotation> Iter = SkinColor.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("color"); if (Feat != null) line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " "); if (Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet EyeColor = Annots.get("EyeColor"); if (EyeColor == null || EyeColor.size() < 1) line += ","; else { Iterator<Annotation> Iter = EyeColor.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("color"); if (Feat != null) line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " "); if (Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet HairColor = Annots.get("HairColor"); if (HairColor == null || HairColor.size() < 1) line += ","; else { Iterator<Annotation> Iter = HairColor.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("color"); if (Feat != null) line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " "); if (Iter.hasNext()) line += ";"; } line += ","; } AnnotationSet Restriction = Annots.get("Restriction"); if (Restriction == null || Restriction.size() < 1) line += ",,,"; else { String type = ""; String ethnicity = ""; String age = ""; Iterator<Annotation> Iter = Restriction.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("type"); if (Feat != null) type += Feat.toString(); else type += "none"; Feat = Ann.getFeatures().get("ethnicity"); if (Feat != null) ethnicity += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " "); else ethnicity += "none"; Feat = Ann.getFeatures().get("age"); if (Feat != null) 
age += Feat.toString(); else age += "none"; if (Iter.hasNext()) { type += ";"; ethnicity += ";"; age += ";"; } } line += type + "," + ethnicity + "," + age + ","; } AnnotationSet Phone = Annots.get("PhoneNumber"); if (Phone == null || Phone.size() < 1) line += ",,,"; else { String value = ""; String state = ""; String city = ""; Iterator<Annotation> Iter = Phone.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("value"); if (Feat != null) value += Feat.toString(); else value += "none"; Feat = Ann.getFeatures().get("state"); if (Feat != null) state += Feat.toString(); else state += "none"; Feat = Ann.getFeatures().get("area"); if (Feat != null) city += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " "); else city += "none"; if (Iter.hasNext()) { value += ";"; state += ";"; city += ";"; } } line += value + "," + state + "," + city + ","; } String Emails = ""; AnnotationSet Email = Annots.get("Email"); if (Email == null || Email.size() < 1) Emails = ""; else { Iterator<Annotation> Iter = Email.inDocumentOrder().iterator(); while (Iter.hasNext()) { Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("email"); if (Feat != null) Emails += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";"; } } if (links != null) { for (Element l : links) { String href = l.attr("abs:href"); if (href == null) continue; if (href.length() > 7 && href.substring(0, 7).toLowerCase().equals("mailto:")) { Emails += href.substring(7, href.length()).replaceAll(",", " ").replaceAll(";", " ") + ";"; } } } if (Emails.length() > 0 && Emails.substring(Emails.length() - 1, Emails.length()).equals(";")) Emails = Emails.substring(0, Emails.length() - 1); line += Emails + ","; String Urls = ""; AnnotationSet Url = Annots.get("Url"); if (Url == null || Url.size() < 1) Urls = ""; else { Iterator<Annotation> Iter = Url.inDocumentOrder().iterator(); while (Iter.hasNext()) { 
Annotation Ann = Iter.next(); Object Feat = Ann.getFeatures().get("url"); if (Feat != null) Urls += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";"; } } if (links != null) { for (Element l : links) { String href = l.attr("abs:href"); if (href == null) continue; if (href.length() <= 7 || !href.substring(0, 7).toLowerCase().equals("mailto:")) { Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";"; } } } if (imports != null) { for (Element l : imports) { String href = l.attr("abs:href"); if (href == null) continue; Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";"; } } if (Urls.length() > 0 && Urls.substring(Urls.length() - 1, Urls.length()).equals(";")) Urls = Urls.substring(0, Urls.length() - 1); line += Urls + ","; String Medias = ""; if (media != null) { for (Element l : media) { String src = l.attr("abs:src"); if (src == null) continue; Medias += src.replaceAll(",", " ").replaceAll(";", " ") + ";"; } } if (Medias.length() > 0 && Medias.substring(Medias.length() - 1, Medias.length()).equals(";")) Medias = Medias.substring(0, Medias.length() - 1); line += Medias; processedlines.add(line); // Release the document, as it is no longer needed Factory.deleteResource(doc); } Factory.deleteResource(corpus); RetObj out = new RetObj(processedlines, processedText); return out; }
public static void main(String[] args) throws Exception { GateUtils.initGate(); ie = new SpcInteractionExport( "SpcInteractionExport/SpcInteractionExport_" + ProjectSetup.makeTimeStamp() + ".csv"); /**/ initController(); /**/ CsvReader in = new CsvReader("C:/Users/dedek/Desktop/DATLOWE/LP_SPC.csv", ';', Charset.forName("cp1250")); in.readHeaders(); StopRequestDetector srd = new StopRequestDetector(); srd.startDetector(); int num = 0; while (!srd.stop_requested) { if (!in.readRecord()) break; String spcCode = in.get(0); ie.setSpcCode(spcCode); String doc = in.get(3); String pdf = in.get(4); String fileStr = pdf; File file = null; if (!fileStr.isEmpty()) { file = new File("C:/Users/dedek/Desktop/DATLOWE/SPC_all/" + fileStr); if (!file.exists()) fileStr = ""; } if (fileStr.isEmpty()) { fileStr = doc; } if (!fileStr.isEmpty()) { file = new File("C:/Users/dedek/Desktop/DATLOWE/SPC_all/" + fileStr); if (!file.exists()) fileStr = ""; } System.err.format( "%4d %s record: %s %s %s\n", ++num, ProjectSetup.makeTimeStamp(), spcCode, in.get(1), fileStr); if (fileStr.isEmpty()) { System.err.format("WARNING ommiting record: %s %s\n", in.get(0), in.get(1)); continue; } Document gateDoc = Factory.newDocument(file.toURI().toURL(), "utf8"); corpus.add(gateDoc); try { controller.execute(); } catch (ExecutionException e) { System.err.println("----------------------- EXECUTION INTERUPTED -------------------"); e.printStackTrace(); initController(); System.err.println("----------------------- EXECUTION RESTARTED -------------------"); } corpus.clear(); // GateUtils.saveGateDocumentToXML(gateDoc, // "C:/Users/dedek/Desktop/DATLOWE/SPC_ananlyzed/"+spcCode+".xml"); Factory.deleteResource(gateDoc); } in.close(); ie.close(); GateUtils.deleteAllPublicGateResources(); }