Example #1
0
  /**
   * Starts (or restarts) the worker threads needed to consume the scheduler queues.
   *
   * <p>Data source ingest: every task currently queued in the data source scheduler is dequeued.
   * For each module of the task, a new {@code IngestDataSourceThread} is created, registered in
   * {@code dataSourceIngesters} and executed, unless a worker for the same content and the same
   * module name is already registered there. A {@code STARTED} module event is fired for each
   * worker that is started.
   *
   * <p>File ingest: if the file scheduler has queued work and there is either no AbstractFile
   * worker yet or the previous one is done, the statistics are reset, every module in {@code
   * abstractFileModules} is initialized and a new {@code IngestAbstractFileProcessor} is executed.
   * If the current worker is not done, nothing is started and it is left to consume the queue.
   *
   * <p>The ingest monitor is started first if it is not already running.
   */
  private synchronized void startAll() {
    final IngestScheduler.DataSourceScheduler dataSourceScheduler =
        scheduler.getDataSourceScheduler();
    final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler();

    logger.log(Level.INFO, "DataSource queue: " + dataSourceScheduler.toString());
    logger.log(Level.INFO, "File queue: " + fileScheduler.toString());

    if (!ingestMonitor.isRunning()) {
      ingestMonitor.start();
    }

    // data source ingesters: drain the queue, one task (content + set of modules) at a time
    while (dataSourceScheduler.hasNext()) {
      final ScheduledTask<IngestModuleDataSource> dataSourceTask = dataSourceScheduler.next();

      // check if each module for this data source content is already running
      for (IngestModuleDataSource taskModule : dataSourceTask.getModules()) {
        boolean alreadyRunning = false;
        for (IngestDataSourceThread worker : dataSourceIngesters) {
          // ignore threads that are on different data sources
          if (!worker.getContent().equals(dataSourceTask.getContent())) {
            continue; // check next worker
          }
          // same data source, check module (by name, not id, since different instances)
          if (worker.getModule().getName().equals(taskModule.getName())) {
            alreadyRunning = true;
            logger.log(
                Level.INFO,
                "Data Source Ingester <"
                    + dataSourceTask.getContent()
                    + ", "
                    + taskModule.getName()
                    + "> is already running");
            break;
          }
        }

        // checked all workers; start a new one only if no match was found
        if (!alreadyRunning) {
          logger.log(
              Level.INFO,
              "Starting new data source Ingester <"
                  + dataSourceTask.getContent()
                  + ", "
                  + taskModule.getName()
                  + ">");

          // data source modules are initialized per instance, so each worker gets its own init
          IngestModuleInit moduleInit = new IngestModuleInit();

          PipelineContext<IngestModuleDataSource> dataSourcepipelineContext =
              new PipelineContext<IngestModuleDataSource>(dataSourceTask, getProcessUnallocSpace());
          final IngestDataSourceThread newDataSourceWorker =
              new IngestDataSourceThread(
                  this,
                  dataSourcepipelineContext,
                  dataSourceTask.getContent(),
                  taskModule,
                  moduleInit);

          // register before executing so later iterations see this worker as running
          dataSourceIngesters.add(newDataSourceWorker);

          // the worker wraps the module and will run init, process and complete on it
          newDataSourceWorker.execute();
          IngestManager.fireModuleEvent(IngestModuleEvent.STARTED.toString(), taskModule.getName());
        }
      }
    }

    // AbstractFile ingester: a single worker, started only when there is queued work
    boolean startAbstractFileIngester = false;
    if (fileScheduler.hasNext()) {
      if (abstractFileIngester == null) {
        startAbstractFileIngester = true;
        logger.log(Level.INFO, "Starting initial AbstractFile ingester");
      }
      // if worker had completed, restart it in case data is still enqueued
      else if (abstractFileIngester.isDone()) {
        startAbstractFileIngester = true;
        logger.log(Level.INFO, "Restarting AbstractFile ingester");
      }
      // otherwise the existing worker is not done; leave it to consume the queue
    } else {
      logger.log(Level.INFO, "no new AbstractFile enqueued, no ingester needed");
    }

    if (startAbstractFileIngester) {
      stats = new IngestManagerStats();
      abstractFileIngester = new IngestAbstractFileProcessor();
      // init all fs modules, every time a new worker starts
      /* @@@ I don't understand why we do an init on each module.  Should do only modules
       * that we are going to be using in the pipeline
       */
      for (IngestModuleAbstractFile s : abstractFileModules) {
        IngestModuleInit moduleInit = new IngestModuleInit();
        try {
          s.init(moduleInit);
        } catch (Exception e) {
          // Best effort: keep initializing the remaining modules, but hand the exception to the
          // logger so the cause and stack trace of the failure are not lost.
          // NOTE(review): a module whose init() failed is not removed here, so it may still be
          // used by the pipeline - confirm this is intended.
          logger.log(Level.SEVERE, "File ingest module failed init(): " + s.getName(), e);
        }
      }
      abstractFileIngester.execute();
    }
  }
Example #2
0
    /**
     * Queues every input to both the data-source-level and the file-level ingest schedulers.
     *
     * <p>For each input the given modules are split by type: a new instance is requested from the
     * module loader for every data source module, while AbstractFile modules are enqueued as the
     * same instance that was passed in. One data source task and one file task are then scheduled
     * per input, and progress is advanced by one unit for each of the two.
     *
     * <p>Cancellation is checked before each module is handled. On cancellation the method returns
     * immediately: the current input is not scheduled, while inputs already scheduled by earlier
     * iterations stay queued.
     *
     * @param modules ingest modules to run on each input
     * @param inputs contents to enqueue
     */
    private void queueAll(List<IngestModuleAbstract> modules, final List<Content> inputs) {

      int processed = 0;
      for (Content input : inputs) {
        final String inputName = input.getName();

        final List<IngestModuleDataSource> dataSourceMods = new ArrayList<IngestModuleDataSource>();
        final List<IngestModuleAbstractFile> fileMods = new ArrayList<IngestModuleAbstractFile>();

        for (IngestModuleAbstract module : modules) {
          if (isCancelled()) {
            logger.log(Level.INFO, "Terminating ingest queueing due to cancellation.");
            return;
          }

          final String moduleName = module.getName();
          progress.progress(moduleName + " " + inputName, processed);

          switch (module.getType()) {
            case DataSource:
              // data source modules run per (module, content), so request a fresh instance
              final IngestModuleDataSource newModuleInstance =
                  (IngestModuleDataSource) moduleLoader.getNewIngestModuleInstance(module);
              if (newModuleInstance != null) {
                dataSourceMods.add(newModuleInstance);
              } else {
                // the module is skipped for this input; log above INFO so the failure stands out
                logger.log(
                    Level.WARNING,
                    "Error loading module and adding input "
                        + inputName
                        + " with module "
                        + moduleName);
              }
              break;

            case AbstractFile:
              // enqueue the same singleton AbstractFile module
              logger.log(
                  Level.INFO,
                  "Adding input " + inputName + " for AbstractFileModule " + moduleName);

              fileMods.add((IngestModuleAbstractFile) module);
              break;

            default:
              logger.log(Level.SEVERE, "Unexpected module type: " + module.getType().name());
          }
        } // for modules

        // queue to schedulers

        // queue to datasource-level ingest pipeline(s)
        final boolean processUnalloc = getProcessUnallocSpace();
        final ScheduledTask<IngestModuleDataSource> dataSourceTask =
            new ScheduledTask<IngestModuleDataSource>(input, dataSourceMods);
        final PipelineContext<IngestModuleDataSource> dataSourcePipelineContext =
            new PipelineContext<IngestModuleDataSource>(dataSourceTask, processUnalloc);
        logger.log(Level.INFO, "Queing data source ingest task: " + dataSourceTask);
        progress.progress("DataSource Ingest" + " " + inputName, processed);
        final IngestScheduler.DataSourceScheduler dataSourceScheduler =
            scheduler.getDataSourceScheduler();
        dataSourceScheduler.schedule(dataSourcePipelineContext);
        progress.progress("DataSource Ingest" + " " + inputName, ++processed);

        // queue to file-level ingest pipeline
        // explicit type argument instead of a raw ScheduledTask, matching the data source task
        final ScheduledTask<IngestModuleAbstractFile> fTask =
            new ScheduledTask<IngestModuleAbstractFile>(input, fileMods);
        final PipelineContext<IngestModuleAbstractFile> filepipelineContext =
            new PipelineContext<IngestModuleAbstractFile>(fTask, processUnalloc);
        logger.log(Level.INFO, "Queing file ingest task: " + fTask);
        progress.progress("File Ingest" + " " + inputName, processed);
        final IngestScheduler.FileScheduler fileScheduler = scheduler.getFileScheduler();
        fileScheduler.schedule(filepipelineContext);
        progress.progress("File Ingest" + " " + inputName, ++processed);
      } // for data sources

      // logger.log(Level.INFO, AbstractFileQueue.printQueue());
    }