/** * check if the module is running (was started and not yet complete/stopped) give a complete * answer, i.e. it's already consumed all files but it might have background threads running */ public boolean isModuleRunning(final IngestModuleAbstract module) { if (module.getType() == IngestModuleAbstract.ModuleType.AbstractFile) { IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler(); if (fileScheduler.hasModuleEnqueued((IngestModuleAbstractFile) module)) { // has work enqueued, so running return true; } else { // not in the queue, but could still have bkg work running return module.hasBackgroundJobsRunning(); } } else { // data source module synchronized (this) { if (dataSourceIngesters.isEmpty()) { return false; } IngestDataSourceThread imt = null; for (IngestDataSourceThread ii : dataSourceIngesters) { if (ii.getModule().equals(module)) { imt = ii; break; } } if (imt == null) { return false; } if (imt.isDone() == false) { return true; } else { return false; } } } }
@Override protected Object doInBackground() throws Exception { logger.log(Level.INFO, "Starting background ingest file processor"); logger.log(Level.INFO, PlatformUtil.getAllMemUsageInfo()); stats.start(); // notify main thread modules started for (IngestModuleAbstractFile s : abstractFileModules) { IngestManager.fireModuleEvent(IngestModuleEvent.STARTED.toString(), s.getName()); } final String displayName = "File Ingest"; progress = ProgressHandleFactory.createHandle( displayName, new Cancellable() { @Override public boolean cancel() { logger.log(Level.INFO, "Filed ingest cancelled by user."); if (progress != null) { progress.setDisplayName(displayName + " (Cancelling...)"); } return IngestAbstractFileProcessor.this.cancel(true); } }); final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler(); // initialize the progress bar progress.start(); progress.switchToIndeterminate(); // set initial totals and processed (to be updated as we process or new files are scheduled) int totalEnqueuedFiles = fileScheduler.getFilesEnqueuedEst(); progress.switchToDeterminate(totalEnqueuedFiles); int processedFiles = 0; // process AbstractFiles queue while (fileScheduler.hasNext()) { final ProcessTask fileTask = fileScheduler.next(); final PipelineContext<IngestModuleAbstractFile> filepipelineContext = fileTask.context; final ScheduledTask<IngestModuleAbstractFile> fileIngestTask = filepipelineContext.getScheduledTask(); final AbstractFile fileToProcess = fileTask.file; // clear return values from modules for last file synchronized (abstractFileModulesRetValues) { abstractFileModulesRetValues.clear(); } // logger.log(Level.INFO, "IngestManager: Processing: {0}", fileToProcess.getName()); for (IngestModuleAbstractFile module : fileIngestTask.getModules()) { // process the file with every file module if (isCancelled()) { logger.log(Level.INFO, "Terminating file ingest due to cancellation."); return null; } progress.progress( fileToProcess.getName() + " (" + module.getName() + ")", processedFiles); try { stats.logFileModuleStartProcess(module); IngestModuleAbstractFile.ProcessResult result = module.process(filepipelineContext, fileToProcess); stats.logFileModuleEndProcess(module); // store the result for subsequent modules for this file synchronized (abstractFileModulesRetValues) { abstractFileModulesRetValues.put(module.getName(), result); } } catch (Exception e) { logger.log( Level.SEVERE, "Error: unexpected exception from module: " + module.getName(), e); stats.addError(module); } catch (OutOfMemoryError e) { logger.log(Level.SEVERE, "Error: out of memory from module: " + module.getName(), e); stats.addError(module); } } // end for every module // free the internal file resource after done with every module fileToProcess.close(); int newTotalEnqueuedFiles = fileScheduler.getFilesEnqueuedEst(); if (newTotalEnqueuedFiles > totalEnqueuedFiles) { // update if new enqueued totalEnqueuedFiles = newTotalEnqueuedFiles + 1; // + processedFiles + 1; // processedFiles = 0; // reset progress.switchToIndeterminate(); progress.switchToDeterminate(totalEnqueuedFiles); } if (processedFiles < totalEnqueuedFiles) { // fix for now to handle the same datasource Content enqueued // twice ++processedFiles; } // --totalEnqueuedFiles; } // end of for every AbstractFile logger.log(Level.INFO, "IngestManager: Finished processing files"); return null; }
/** * Starts the needed worker threads. * * <p>if AbstractFile module is still running, do nothing and allow it to consume queue otherwise * start /restart AbstractFile worker * * <p>data source ingest workers run per (module,content). Checks if one for the same * (module,content) is already running otherwise start/restart the worker */ private synchronized void startAll() { final IngestScheduler.DataSourceScheduler dataSourceScheduler = scheduler.getDataSourceScheduler(); final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler(); logger.log(Level.INFO, "DataSource queue: " + dataSourceScheduler.toString()); logger.log(Level.INFO, "File queue: " + fileScheduler.toString()); if (!ingestMonitor.isRunning()) { ingestMonitor.start(); } // image ingesters // cycle through each data source content in the queue while (dataSourceScheduler.hasNext()) { // dequeue // get next data source content and set of modules final ScheduledTask<IngestModuleDataSource> dataSourceTask = dataSourceScheduler.next(); // check if each module for this data source content is already running for (IngestModuleDataSource taskModule : dataSourceTask.getModules()) { boolean alreadyRunning = false; for (IngestDataSourceThread worker : dataSourceIngesters) { // ignore threads that are on different data sources if (!worker.getContent().equals(dataSourceTask.getContent())) { continue; // check next worker } // same data source, check module (by name, not id, since different instances) if (worker.getModule().getName().equals(taskModule.getName())) { alreadyRunning = true; logger.log( Level.INFO, "Data Source Ingester <" + dataSourceTask.getContent() + ", " + taskModule.getName() + "> is already running"); break; } } // checked all workers if (alreadyRunning == false) { logger.log( Level.INFO, "Starting new data source Ingester <" + dataSourceTask.getContent() + ", " + taskModule.getName() + ">"); // data source modules are now initialized per instance IngestModuleInit moduleInit = new IngestModuleInit(); PipelineContext<IngestModuleDataSource> dataSourcepipelineContext = new PipelineContext<IngestModuleDataSource>(dataSourceTask, getProcessUnallocSpace()); final IngestDataSourceThread newDataSourceWorker = new IngestDataSourceThread( this, dataSourcepipelineContext, dataSourceTask.getContent(), taskModule, moduleInit); dataSourceIngesters.add(newDataSourceWorker); // wrap the module in a worker, that will run init, process and complete on the module newDataSourceWorker.execute(); IngestManager.fireModuleEvent(IngestModuleEvent.STARTED.toString(), taskModule.getName()); } } } // AbstractFile ingester boolean startAbstractFileIngester = false; if (fileScheduler.hasNext()) { if (abstractFileIngester == null) { startAbstractFileIngester = true; logger.log(Level.INFO, "Starting initial AbstractFile ingester"); } // if worker had completed, restart it in case data is still enqueued else if (abstractFileIngester.isDone()) { startAbstractFileIngester = true; logger.log(Level.INFO, "Restarting AbstractFile ingester"); } } else { logger.log(Level.INFO, "no new AbstractFile enqueued, no ingester needed"); } if (startAbstractFileIngester) { stats = new IngestManagerStats(); abstractFileIngester = new IngestAbstractFileProcessor(); // init all fs modules, everytime new worker starts /* @@@ I don't understand why we do an init on each module. Should do only modules * that we are going to be using in the pipeline */ for (IngestModuleAbstractFile s : abstractFileModules) { IngestModuleInit moduleInit = new IngestModuleInit(); try { s.init(moduleInit); } catch (Exception e) { logger.log(Level.SEVERE, "File ingest module failed init(): " + s.getName()); } } abstractFileIngester.execute(); } }
private void queueAll(List<IngestModuleAbstract> modules, final List<Content> inputs) { int processed = 0; for (Content input : inputs) { final String inputName = input.getName(); final List<IngestModuleDataSource> dataSourceMods = new ArrayList<IngestModuleDataSource>(); final List<IngestModuleAbstractFile> fileMods = new ArrayList<IngestModuleAbstractFile>(); for (IngestModuleAbstract module : modules) { if (isCancelled()) { logger.log(Level.INFO, "Terminating ingest queueing due to cancellation."); return; } final String moduleName = module.getName(); progress.progress(moduleName + " " + inputName, processed); switch (module.getType()) { case DataSource: final IngestModuleDataSource newModuleInstance = (IngestModuleDataSource) moduleLoader.getNewIngestModuleInstance(module); if (newModuleInstance != null) { dataSourceMods.add(newModuleInstance); } else { logger.log( Level.INFO, "Error loading module and adding input " + inputName + " with module " + module.getName()); } break; case AbstractFile: // enqueue the same singleton AbstractFile module logger.log( Level.INFO, "Adding input " + inputName + " for AbstractFileModule " + module.getName()); fileMods.add((IngestModuleAbstractFile) module); break; default: logger.log(Level.SEVERE, "Unexpected module type: " + module.getType().name()); } } // for modules // queue to schedulers // queue to datasource-level ingest pipeline(s) final boolean processUnalloc = getProcessUnallocSpace(); final ScheduledTask<IngestModuleDataSource> dataSourceTask = new ScheduledTask<IngestModuleDataSource>(input, dataSourceMods); final PipelineContext<IngestModuleDataSource> dataSourcePipelineContext = new PipelineContext<IngestModuleDataSource>(dataSourceTask, processUnalloc); logger.log(Level.INFO, "Queing data source ingest task: " + dataSourceTask); progress.progress("DataSource Ingest" + " " + inputName, processed); final IngestScheduler.DataSourceScheduler dataSourceScheduler = scheduler.getDataSourceScheduler(); dataSourceScheduler.schedule(dataSourcePipelineContext); progress.progress("DataSource Ingest" + " " + inputName, ++processed); // queue to file-level ingest pipeline final ScheduledTask<IngestModuleAbstractFile> fTask = new ScheduledTask(input, fileMods); final PipelineContext<IngestModuleAbstractFile> filepipelineContext = new PipelineContext<IngestModuleAbstractFile>(fTask, processUnalloc); logger.log(Level.INFO, "Queing file ingest task: " + fTask); progress.progress("File Ingest" + " " + inputName, processed); final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler(); fileScheduler.schedule(filepipelineContext); progress.progress("File Ingest" + " " + inputName, ++processed); } // for data sources // logger.log(Level.INFO, AbstractFileQueue.printQueue()); }