protected JobOutput executeJellyfishMerger( Args args, String ecqName, Set<File> fileSet, File outputDir) throws InterruptedException, ProcessExecutionException, ConanParameterException, IOException { String suffix = "jellyfish_" + ecqName + "_all.jf31_0"; String jobName = args.getJobPrefix() + "-merge-" + suffix; List<File> files = new ArrayList<>(); files.addAll(fileSet); File outputFile = new File(outputDir, suffix); JellyfishMergeV11 jellyfishMerge = this.makeJellyfishMerge(files, outputFile, args.getOrganism()); ExecutionResult id = this.conanExecutorService.executeProcess( jellyfishMerge, args.getOutputDir(), jobName, args.getThreadsPerProcess(), args.getMemoryPerProcess(), args.isRunParallel()); id.setName("merge-" + suffix); return new JobOutput(id, outputFile); }
@Override public ExecutionResult execute(ExecutionContext executionContext) throws ProcessExecutionException, InterruptedException { try { StopWatch stopWatch = new StopWatch(); stopWatch.start(); log.info("Starting Kmer Counting on all Reads"); // Create shortcut to args for convienience Args args = this.getArgs(); // Force run parallel to false if not using a scheduler if (!executionContext.usingScheduler() && args.isRunParallel()) { log.warn("Forcing linear execution due to lack of job scheduler"); args.setRunParallel(false); } // Create the output directory args.getOutputDir().mkdirs(); JobOutputMap jfCountOutputs = new JobOutputMap(); List<ExecutionResult> jobResults = new ArrayList<>(); List<ExecutionResult> allJobResults = new ArrayList<>(); // Create the output directory for the RAW datasets File rawOutputDir = new File(args.getOutputDir(), "raw"); if (!rawOutputDir.exists()) { rawOutputDir.mkdirs(); } // Start jellyfish on all RAW datasets for (Library lib : args.getAllLibraries()) { // Execute jellyfish and add id to list of job ids JobOutput jfOut = this.executeJellyfishCount(args, "raw", args.getOutputDir(), lib); jobResults.add(jfOut.getResult()); allJobResults.add(jfOut.getResult()); jfCountOutputs.updateTracker("raw", jfOut.getOutputFile()); } // Also start jellyfish on all the prep-processed libraries from MECQ if (args.getAllMecqs() != null) { for (Mecq.EcqArgs ecqArgs : args.getAllMecqs()) { // Create the output directory for the RAW datasets File ecqOutputDir = new File(args.getOutputDir(), ecqArgs.getName()); if (!ecqOutputDir.exists()) { ecqOutputDir.mkdirs(); } for (Library lib : ecqArgs.getOutputLibraries()) { // Add jellyfish id to list of job ids JobOutput jfOut = this.executeJellyfishCount(args, ecqArgs.getName(), args.getOutputDir(), lib); jobResults.add(jfOut.getResult()); allJobResults.add(jfOut.getResult()); jfCountOutputs.updateTracker(ecqArgs.getName(), jfOut.getOutputFile()); } } } // If we're using a scheduler and we have been asked to run each job // in parallel, then we should wait for all those to complete before continueing. if (executionContext.usingScheduler() && args.isRunParallel()) { log.info("Kmer counting all ECQ groups in parallel, waiting for completion"); this.conanExecutorService.executeScheduledWait( jobResults, args.getJobPrefix() + "-count-*", ExitStatus.Type.COMPLETED_ANY, args.getJobPrefix() + "-kmer-count-wait", args.getOutputDir()); } // Waiting point... clear job ids. jobResults.clear(); JobOutputMap mergedOutputs = new JobOutputMap(); // Now execute merge jobs if required for (Map.Entry<String, Set<File>> entry : jfCountOutputs.entrySet()) { String ecqName = entry.getKey(); Set<File> fileSet = entry.getValue(); // Only merge if there's more than one library if (fileSet.size() > 1) { JobOutput jfOut = this.executeJellyfishMerger( args, ecqName, fileSet, new File(args.getOutputDir(), ecqName)); jobResults.add(jfOut.getResult()); allJobResults.add(jfOut.getResult()); mergedOutputs.updateTracker(ecqName, jfOut.getOutputFile()); } } // If we're using a scheduler and we have been asked to run each job // in parallel, then we should wait for all those to complete before continueing. if (executionContext.usingScheduler() && args.isRunParallel()) { log.info( "Creating merged kmer counts for all ECQ groups in parallel, waiting for completion"); this.conanExecutorService.executeScheduledWait( jobResults, args.getJobPrefix() + "-merge-*", ExitStatus.Type.COMPLETED_ANY, args.getJobPrefix() + "-kmer-merge-wait", args.getOutputDir()); } // Waiting point... clear job ids. jobResults.clear(); // Combine all jellyfish out maps jfCountOutputs.combine(mergedOutputs); String katGcpJobPrefix = args.getJobPrefix() + "-kat-gcp"; // Run KAT GCP on everything List<ExecutionResult> katGcpResults = this.executeKatGcp( jfCountOutputs, katGcpJobPrefix, args.getThreadsPerProcess(), args.getMemoryPerProcess(), args.isRunParallel()); for (ExecutionResult result : katGcpResults) { result.setName(result.getName().substring(args.getJobPrefix().length() + 1)); jobResults.add(result); allJobResults.add(result); } // If we're using a scheduler and we have been asked to run each job // in parallel, then we should wait for all those to complete before continueing. if (executionContext.usingScheduler() && args.isRunParallel()) { log.info("Running \"kat gcp\" for all ECQ groups in parallel, waiting for completion"); this.conanExecutorService.executeScheduledWait( jobResults, katGcpJobPrefix + "*", ExitStatus.Type.COMPLETED_ANY, args.getJobPrefix() + "-kat-gcp-wait", args.getOutputDir()); } // Waiting point... clear job ids. jobResults.clear(); log.info("Kmer counting of all reads finished."); stopWatch.stop(); TaskResult taskResult = new DefaultTaskResult( "rampart-read_analysis-kmer", true, allJobResults, stopWatch.getTime() / 1000L); // Output the resource usage to file FileUtils.writeLines( new File(args.getOutputDir(), args.getJobPrefix() + ".summary"), taskResult.getOutput()); return new DefaultExecutionResult( taskResult.getTaskName(), 0, new String[] {}, null, -1, new ResourceUsage( taskResult.getMaxMemUsage(), taskResult.getActualTotalRuntime(), taskResult.getTotalExternalCputime())); } catch (ConanParameterException | IOException e) { throw new ProcessExecutionException(-1, e); } }