Ejemplo n.º 1
0
  /**
   * Reports whether the given module is still running, i.e. it was started and has not yet
   * completed or been stopped.
   *
   * <p>A file-level module counts as running while the file scheduler still has it enqueued, or,
   * once it is no longer enqueued, while the module itself reports background jobs running.
   *
   * <p>Any other module is looked up among the data source workers while holding this object's
   * monitor: it counts as running only if a worker for an equal module exists and that worker is
   * not done.
   *
   * @param module the module to check
   * @return {@code true} if the module is considered running, {@code false} otherwise
   */
  public boolean isModuleRunning(final IngestModuleAbstract module) {
    if (module.getType() != IngestModuleAbstract.ModuleType.AbstractFile) {
      // data source module: find the first worker wrapping this module
      synchronized (this) {
        for (IngestDataSourceThread worker : dataSourceIngesters) {
          if (worker.getModule().equals(module)) {
            return !worker.isDone();
          }
        }
        // no worker was ever created for this module (or there are no workers at all)
        return false;
      }
    }

    // file module: enqueued work means running; otherwise it may still have background work
    final IngestScheduler.FileScheduler fileQueue = scheduler.getFileScheduler();
    return fileQueue.hasModuleEnqueued((IngestModuleAbstractFile) module)
        || module.hasBackgroundJobsRunning();
  }
Ejemplo n.º 2
0
    /**
     * Drains the file scheduler in the background: for every scheduled file, runs each module of
     * its task against the file, records per-module results and statistics, and keeps the
     * progress handle up to date.
     *
     * <p>Before processing starts, a module STARTED event is fired for every file module. The
     * cancellation flag is checked before each module invocation; when set, processing stops
     * immediately. Exceptions and out-of-memory errors thrown by a module are logged and counted
     * as errors for that module, and processing continues with the next module.
     *
     * <p>The file's internal resource is always released once the modules for that file are
     * finished, including when the loop is left early because of cancellation or an unexpected
     * throwable.
     *
     * @return always {@code null}
     * @throws Exception if an unexpected failure escapes the processing loop
     */
    @Override
    protected Object doInBackground() throws Exception {

      logger.log(Level.INFO, "Starting background ingest file processor");
      logger.log(Level.INFO, PlatformUtil.getAllMemUsageInfo());

      stats.start();

      // notify main thread modules started
      for (IngestModuleAbstractFile s : abstractFileModules) {
        IngestManager.fireModuleEvent(IngestModuleEvent.STARTED.toString(), s.getName());
      }

      final String displayName = "File Ingest";
      progress =
          ProgressHandleFactory.createHandle(
              displayName,
              new Cancellable() {
                @Override
                public boolean cancel() {
                  logger.log(Level.INFO, "Filed ingest cancelled by user.");
                  if (progress != null) {
                    progress.setDisplayName(displayName + " (Cancelling...)");
                  }
                  return IngestAbstractFileProcessor.this.cancel(true);
                }
              });

      final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler();

      // initialize the progress bar
      progress.start();
      progress.switchToIndeterminate();
      // set initial totals and processed (to be updated as we process or new files are scheduled)
      int totalEnqueuedFiles = fileScheduler.getFilesEnqueuedEst();
      progress.switchToDeterminate(totalEnqueuedFiles);
      int processedFiles = 0;

      // process AbstractFiles queue
      while (fileScheduler.hasNext()) {
        final ProcessTask fileTask = fileScheduler.next();
        final PipelineContext<IngestModuleAbstractFile> filepipelineContext = fileTask.context;
        final ScheduledTask<IngestModuleAbstractFile> fileIngestTask =
            filepipelineContext.getScheduledTask();
        final AbstractFile fileToProcess = fileTask.file;

        // clear return values from modules for last file
        synchronized (abstractFileModulesRetValues) {
          abstractFileModulesRetValues.clear();
        }

        try {
          for (IngestModuleAbstractFile module : fileIngestTask.getModules()) {
            // process the file with every file module
            if (isCancelled()) {
              logger.log(Level.INFO, "Terminating file ingest due to cancellation.");
              return null;
            }
            progress.progress(
                fileToProcess.getName() + " (" + module.getName() + ")", processedFiles);

            try {
              stats.logFileModuleStartProcess(module);
              IngestModuleAbstractFile.ProcessResult result =
                  module.process(filepipelineContext, fileToProcess);
              stats.logFileModuleEndProcess(module);

              // store the result for subsequent modules for this file
              synchronized (abstractFileModulesRetValues) {
                abstractFileModulesRetValues.put(module.getName(), result);
              }

            } catch (Exception e) {
              logger.log(
                  Level.SEVERE, "Error: unexpected exception from module: " + module.getName(), e);
              stats.addError(module);
            } catch (OutOfMemoryError e) {
              logger.log(Level.SEVERE, "Error: out of memory from module: " + module.getName(), e);
              stats.addError(module);
            }
          } // end for every module
        } finally {
          // free the internal file resource after done with every module; in a finally block so
          // that an early return on cancellation (or an escaping throwable) does not leak it
          fileToProcess.close();
        }

        int newTotalEnqueuedFiles = fileScheduler.getFilesEnqueuedEst();
        if (newTotalEnqueuedFiles > totalEnqueuedFiles) {
          // more files were enqueued while processing: restart the progress bar with the new
          // total. NOTE(review): the +1 presumably accounts for the file just handled - confirm
          totalEnqueuedFiles = newTotalEnqueuedFiles + 1;
          progress.switchToIndeterminate();
          progress.switchToDeterminate(totalEnqueuedFiles);
        }
        // fix for now to handle the same datasource Content enqueued twice: never let the
        // processed count run past the current total
        if (processedFiles < totalEnqueuedFiles) {
          ++processedFiles;
        }

      } // end of for every AbstractFile
      logger.log(Level.INFO, "IngestManager: Finished processing files");
      return null;
    }
Ejemplo n.º 3
0
  /**
   * Starts the needed worker threads.
   *
   * <p>The ingest monitor is started first if it is not already running.
   *
   * <p>Data source ingest workers run per (module, content). Every task is dequeued from the data
   * source scheduler and, for each of its modules, a new worker is created, registered and
   * executed unless a registered worker for the same content and the same module name already
   * exists. A module STARTED event is fired for every worker that is started.
   *
   * <p>The AbstractFile worker is started when the file scheduler has work and there is either no
   * worker yet or the previous one is done; if a worker is still running it is left alone to
   * consume the queue. Starting the worker replaces the statistics object and calls
   * {@code init()} on every file module. A module whose {@code init()} throws is logged, together
   * with the exception, and the worker is still started.
   */
  private synchronized void startAll() {
    final IngestScheduler.DataSourceScheduler dataSourceScheduler =
        scheduler.getDataSourceScheduler();
    final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler();

    logger.log(Level.INFO, "DataSource queue: " + dataSourceScheduler.toString());
    logger.log(Level.INFO, "File queue: " + fileScheduler.toString());

    if (!ingestMonitor.isRunning()) {
      ingestMonitor.start();
    }

    // image ingesters
    // cycle through each data source content in the queue
    while (dataSourceScheduler.hasNext()) {
      // dequeue
      // get next data source content and set of modules
      final ScheduledTask<IngestModuleDataSource> dataSourceTask = dataSourceScheduler.next();

      // check if each module for this data source content is already running
      for (IngestModuleDataSource taskModule : dataSourceTask.getModules()) {
        boolean alreadyRunning = false;
        for (IngestDataSourceThread worker : dataSourceIngesters) {
          // ignore threads that are on different data sources
          if (!worker.getContent().equals(dataSourceTask.getContent())) {
            continue; // check next worker
          }
          // same data source, check module (by name, not id, since different instances)
          if (worker.getModule().getName().equals(taskModule.getName())) {
            alreadyRunning = true;
            logger.log(
                Level.INFO,
                "Data Source Ingester <"
                    + dataSourceTask.getContent()
                    + ", "
                    + taskModule.getName()
                    + "> is already running");
            break;
          }
        }
        // checked all workers
        if (!alreadyRunning) {
          logger.log(
              Level.INFO,
              "Starting new data source Ingester <"
                  + dataSourceTask.getContent()
                  + ", "
                  + taskModule.getName()
                  + ">");
          // data source modules are now initialized per instance

          IngestModuleInit moduleInit = new IngestModuleInit();

          PipelineContext<IngestModuleDataSource> dataSourcepipelineContext =
              new PipelineContext<IngestModuleDataSource>(dataSourceTask, getProcessUnallocSpace());
          final IngestDataSourceThread newDataSourceWorker =
              new IngestDataSourceThread(
                  this,
                  dataSourcepipelineContext,
                  dataSourceTask.getContent(),
                  taskModule,
                  moduleInit);

          dataSourceIngesters.add(newDataSourceWorker);

          // wrap the module in a worker, that will run init, process and complete on the module
          newDataSourceWorker.execute();
          IngestManager.fireModuleEvent(IngestModuleEvent.STARTED.toString(), taskModule.getName());
        }
      }
    }

    // AbstractFile ingester
    boolean startAbstractFileIngester = false;
    if (fileScheduler.hasNext()) {
      if (abstractFileIngester == null) {
        startAbstractFileIngester = true;
        logger.log(Level.INFO, "Starting initial AbstractFile ingester");
      }
      // if worker had completed, restart it in case data is still enqueued
      else if (abstractFileIngester.isDone()) {
        startAbstractFileIngester = true;
        logger.log(Level.INFO, "Restarting AbstractFile ingester");
      }
    } else {
      logger.log(Level.INFO, "no new AbstractFile enqueued, no ingester needed");
    }

    if (startAbstractFileIngester) {
      stats = new IngestManagerStats();
      abstractFileIngester = new IngestAbstractFileProcessor();
      // init all fs modules, every time a new worker starts
      // TODO: every file module is initialized here; ideally only the modules that will actually
      // be used in the pipeline should be
      for (IngestModuleAbstractFile s : abstractFileModules) {
        IngestModuleInit moduleInit = new IngestModuleInit();
        try {
          s.init(moduleInit);
        } catch (Exception e) {
          // pass the exception to the logger so the cause and stack trace are not lost
          logger.log(Level.SEVERE, "File ingest module failed init(): " + s.getName(), e);
        }
      }
      abstractFileIngester.execute();
    }
  }
Ejemplo n.º 4
0
    /**
     * Schedules every input with the given modules on both the data source scheduler and the
     * file scheduler, reporting progress along the way.
     *
     * <p>For each input the modules are split by type: for a data source module a new instance
     * is requested from the module loader (a module for which no instance could be obtained is
     * logged and left out), while a file module is enqueued as the same instance that was passed
     * in. One data source task and one file task are then scheduled for the input, and the
     * progress counter is advanced once for each of them.
     *
     * <p>Cancellation is checked before each module is handled; when cancelled, the method
     * returns immediately without scheduling the current input or any remaining ones. Inputs
     * that were already scheduled stay scheduled.
     *
     * @param modules the modules to run against each input
     * @param inputs the contents to enqueue
     */
    private void queueAll(List<IngestModuleAbstract> modules, final List<Content> inputs) {

      int processed = 0;
      for (Content input : inputs) {
        final String inputName = input.getName();

        final List<IngestModuleDataSource> dataSourceMods = new ArrayList<IngestModuleDataSource>();
        final List<IngestModuleAbstractFile> fileMods = new ArrayList<IngestModuleAbstractFile>();

        for (IngestModuleAbstract module : modules) {
          if (isCancelled()) {
            logger.log(Level.INFO, "Terminating ingest queueing due to cancellation.");
            return;
          }

          final String moduleName = module.getName();
          progress.progress(moduleName + " " + inputName, processed);

          switch (module.getType()) {
            case DataSource:
              // data source modules get a new instance per input
              final IngestModuleDataSource newModuleInstance =
                  (IngestModuleDataSource) moduleLoader.getNewIngestModuleInstance(module);
              if (newModuleInstance != null) {
                dataSourceMods.add(newModuleInstance);
              } else {
                logger.log(
                    Level.INFO,
                    "Error loading module and adding input "
                        + inputName
                        + " with module "
                        + moduleName);
              }
              break;

            case AbstractFile:
              // enqueue the same singleton AbstractFile module
              logger.log(
                  Level.INFO,
                  "Adding input " + inputName + " for AbstractFileModule " + moduleName);

              fileMods.add((IngestModuleAbstractFile) module);
              break;

            default:
              logger.log(Level.SEVERE, "Unexpected module type: " + module.getType().name());
              break;
          }
        } // for modules

        // queue to schedulers

        // queue to datasource-level ingest pipeline(s)
        final boolean processUnalloc = getProcessUnallocSpace();
        final ScheduledTask<IngestModuleDataSource> dataSourceTask =
            new ScheduledTask<IngestModuleDataSource>(input, dataSourceMods);
        final PipelineContext<IngestModuleDataSource> dataSourcePipelineContext =
            new PipelineContext<IngestModuleDataSource>(dataSourceTask, processUnalloc);
        logger.log(Level.INFO, "Queing data source ingest task: " + dataSourceTask);
        progress.progress("DataSource Ingest" + " " + inputName, processed);
        final IngestScheduler.DataSourceScheduler dataSourceScheduler =
            scheduler.getDataSourceScheduler();
        dataSourceScheduler.schedule(dataSourcePipelineContext);
        progress.progress("DataSource Ingest" + " " + inputName, ++processed);

        // queue to file-level ingest pipeline; parameterized (not raw) so the compiler checks
        // the module list type, as with the data source task above
        final ScheduledTask<IngestModuleAbstractFile> fTask =
            new ScheduledTask<IngestModuleAbstractFile>(input, fileMods);
        final PipelineContext<IngestModuleAbstractFile> filepipelineContext =
            new PipelineContext<IngestModuleAbstractFile>(fTask, processUnalloc);
        logger.log(Level.INFO, "Queing file ingest task: " + fTask);
        progress.progress("File Ingest" + " " + inputName, processed);
        final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler();
        fileScheduler.schedule(filepipelineContext);
        progress.progress("File Ingest" + " " + inputName, ++processed);
      } // for data sources
    }