Ejemplo n.º 1
0
  protected JobOutput executeJellyfishMerger(
      Args args, String ecqName, Set<File> fileSet, File outputDir)
      throws InterruptedException, ProcessExecutionException, ConanParameterException, IOException {

    String suffix = "jellyfish_" + ecqName + "_all.jf31_0";

    String jobName = args.getJobPrefix() + "-merge-" + suffix;

    List<File> files = new ArrayList<>();
    files.addAll(fileSet);

    File outputFile = new File(outputDir, suffix);

    JellyfishMergeV11 jellyfishMerge =
        this.makeJellyfishMerge(files, outputFile, args.getOrganism());

    ExecutionResult id =
        this.conanExecutorService.executeProcess(
            jellyfishMerge,
            args.getOutputDir(),
            jobName,
            args.getThreadsPerProcess(),
            args.getMemoryPerProcess(),
            args.isRunParallel());

    id.setName("merge-" + suffix);

    return new JobOutput(id, outputFile);
  }
Ejemplo n.º 2
0
  @Override
  public ExecutionResult execute(ExecutionContext executionContext)
      throws ProcessExecutionException, InterruptedException {

    try {

      StopWatch stopWatch = new StopWatch();
      stopWatch.start();

      log.info("Starting Kmer Counting on all Reads");

      // Create shortcut to args for convienience
      Args args = this.getArgs();

      // Force run parallel to false if not using a scheduler
      if (!executionContext.usingScheduler() && args.isRunParallel()) {
        log.warn("Forcing linear execution due to lack of job scheduler");
        args.setRunParallel(false);
      }

      // Create the output directory
      args.getOutputDir().mkdirs();

      JobOutputMap jfCountOutputs = new JobOutputMap();
      List<ExecutionResult> jobResults = new ArrayList<>();
      List<ExecutionResult> allJobResults = new ArrayList<>();

      // Create the output directory for the RAW datasets
      File rawOutputDir = new File(args.getOutputDir(), "raw");

      if (!rawOutputDir.exists()) {
        rawOutputDir.mkdirs();
      }

      // Start jellyfish on all RAW datasets
      for (Library lib : args.getAllLibraries()) {

        // Execute jellyfish and add id to list of job ids
        JobOutput jfOut = this.executeJellyfishCount(args, "raw", args.getOutputDir(), lib);
        jobResults.add(jfOut.getResult());
        allJobResults.add(jfOut.getResult());
        jfCountOutputs.updateTracker("raw", jfOut.getOutputFile());
      }

      // Also start jellyfish on all the prep-processed libraries from MECQ
      if (args.getAllMecqs() != null) {
        for (Mecq.EcqArgs ecqArgs : args.getAllMecqs()) {

          // Create the output directory for the RAW datasets
          File ecqOutputDir = new File(args.getOutputDir(), ecqArgs.getName());

          if (!ecqOutputDir.exists()) {
            ecqOutputDir.mkdirs();
          }

          for (Library lib : ecqArgs.getOutputLibraries()) {

            // Add jellyfish id to list of job ids
            JobOutput jfOut =
                this.executeJellyfishCount(args, ecqArgs.getName(), args.getOutputDir(), lib);

            jobResults.add(jfOut.getResult());
            allJobResults.add(jfOut.getResult());
            jfCountOutputs.updateTracker(ecqArgs.getName(), jfOut.getOutputFile());
          }
        }
      }

      // If we're using a scheduler and we have been asked to run each job
      // in parallel, then we should wait for all those to complete before continueing.
      if (executionContext.usingScheduler() && args.isRunParallel()) {
        log.info("Kmer counting all ECQ groups in parallel, waiting for completion");
        this.conanExecutorService.executeScheduledWait(
            jobResults,
            args.getJobPrefix() + "-count-*",
            ExitStatus.Type.COMPLETED_ANY,
            args.getJobPrefix() + "-kmer-count-wait",
            args.getOutputDir());
      }

      // Waiting point... clear job ids.
      jobResults.clear();

      JobOutputMap mergedOutputs = new JobOutputMap();

      // Now execute merge jobs if required
      for (Map.Entry<String, Set<File>> entry : jfCountOutputs.entrySet()) {

        String ecqName = entry.getKey();
        Set<File> fileSet = entry.getValue();

        // Only merge if there's more than one library
        if (fileSet.size() > 1) {
          JobOutput jfOut =
              this.executeJellyfishMerger(
                  args, ecqName, fileSet, new File(args.getOutputDir(), ecqName));

          jobResults.add(jfOut.getResult());
          allJobResults.add(jfOut.getResult());
          mergedOutputs.updateTracker(ecqName, jfOut.getOutputFile());
        }
      }

      // If we're using a scheduler and we have been asked to run each job
      // in parallel, then we should wait for all those to complete before continueing.
      if (executionContext.usingScheduler() && args.isRunParallel()) {
        log.info(
            "Creating merged kmer counts for all ECQ groups in parallel, waiting for completion");
        this.conanExecutorService.executeScheduledWait(
            jobResults,
            args.getJobPrefix() + "-merge-*",
            ExitStatus.Type.COMPLETED_ANY,
            args.getJobPrefix() + "-kmer-merge-wait",
            args.getOutputDir());
      }

      // Waiting point... clear job ids.
      jobResults.clear();

      // Combine all jellyfish out maps
      jfCountOutputs.combine(mergedOutputs);

      String katGcpJobPrefix = args.getJobPrefix() + "-kat-gcp";

      // Run KAT GCP on everything
      List<ExecutionResult> katGcpResults =
          this.executeKatGcp(
              jfCountOutputs,
              katGcpJobPrefix,
              args.getThreadsPerProcess(),
              args.getMemoryPerProcess(),
              args.isRunParallel());

      for (ExecutionResult result : katGcpResults) {
        result.setName(result.getName().substring(args.getJobPrefix().length() + 1));
        jobResults.add(result);
        allJobResults.add(result);
      }

      // If we're using a scheduler and we have been asked to run each job
      // in parallel, then we should wait for all those to complete before continueing.
      if (executionContext.usingScheduler() && args.isRunParallel()) {
        log.info("Running \"kat gcp\" for all ECQ groups in parallel, waiting for completion");
        this.conanExecutorService.executeScheduledWait(
            jobResults,
            katGcpJobPrefix + "*",
            ExitStatus.Type.COMPLETED_ANY,
            args.getJobPrefix() + "-kat-gcp-wait",
            args.getOutputDir());
      }

      // Waiting point... clear job ids.
      jobResults.clear();

      log.info("Kmer counting of all reads finished.");

      stopWatch.stop();

      TaskResult taskResult =
          new DefaultTaskResult(
              "rampart-read_analysis-kmer", true, allJobResults, stopWatch.getTime() / 1000L);

      // Output the resource usage to file
      FileUtils.writeLines(
          new File(args.getOutputDir(), args.getJobPrefix() + ".summary"), taskResult.getOutput());

      return new DefaultExecutionResult(
          taskResult.getTaskName(),
          0,
          new String[] {},
          null,
          -1,
          new ResourceUsage(
              taskResult.getMaxMemUsage(),
              taskResult.getActualTotalRuntime(),
              taskResult.getTotalExternalCputime()));
    } catch (ConanParameterException | IOException e) {
      throw new ProcessExecutionException(-1, e);
    }
  }