/** * Does all annotations for the workflow. * * @param parents * @return */ private List<Job> doAnnotations(Job... parents) { List<Job> finalAnnotatorJobs = new ArrayList<Job>(); Predicate<String> isExtractedSNV = p -> p.contains("extracted-snv") && p.endsWith(".vcf.gz"); final String passFilteredOxoGSuffix = ".pass-filtered.oxoG.vcf.gz"; // list filtering should only ever produce one result. for (int i = 0; i < this.tumours.size(); i++) { TumourInfo tInf = this.tumours.get(i); String tumourAliquotID = tInf.getAliquotID(); PcawgAnnotatorJobGenerator generator = new PcawgAnnotatorJobGenerator( this.JSONlocation, this.JSONrepoName, this.JSONfolderName, this.JSONfileName); generator.setGitMoveTestMode(this.gitMoveTestMode); generator.setAllowMissingFiles(this.allowMissingFiles); generator.setBroadOxogSNVFileName( this.filesForUpload .stream() .filter( p -> ((p.contains(tumourAliquotID) && p.contains("broad-mutect") && p.endsWith(passFilteredOxoGSuffix)))) .findFirst() .orElseGet(emptyStringWhenMissingFilesAllowed)); generator.setBroadOxoGSNVFromIndelFileName( this.filesForUpload .stream() .filter(p -> (p.contains(Pipeline.broad.toString()) && isExtractedSNV.test(p))) .findFirst() .orElseGet(emptyStringWhenMissingFilesAllowed)); generator.setSangerOxogSNVFileName( this.filesForUpload .stream() .filter( p -> ((p.contains(tumourAliquotID) && p.contains("svcp_") && p.endsWith(passFilteredOxoGSuffix)))) .findFirst() .orElseGet(emptyStringWhenMissingFilesAllowed)); generator.setSangerOxoGSNVFromIndelFileName( this.filesForUpload .stream() .filter(p -> (p.contains(Pipeline.sanger.toString()) && isExtractedSNV.test(p))) .findFirst() .orElseGet(emptyStringWhenMissingFilesAllowed)); generator.setDkfzEmbleOxogSNVFileName( this.filesForUpload .stream() .filter( p -> ((p.contains(tumourAliquotID) && p.contains("dkfz-snvCalling") && p.endsWith(passFilteredOxoGSuffix)))) .findFirst() .orElseGet(emptyStringWhenMissingFilesAllowed)); generator.setDkfzEmblOxoGSNVFromIndelFileName( this.filesForUpload .stream() .filter(p -> (p.contains(Pipeline.dkfz_embl.toString()) && isExtractedSNV.test(p))) .findFirst() .orElseGet(emptyStringWhenMissingFilesAllowed)); // Remember: MUSE files do not get PASS-filtered. Also, there is no INDEL so there cannot be // any SNVs extracted from INDELs. generator.setMuseOxogSNVFileName( this.filesForUpload .stream() .filter(p -> p.toUpperCase().contains("MUSE") && p.endsWith(".oxoG.vcf.gz")) .findFirst() .orElseGet(emptyStringWhenMissingFilesAllowed)); generator.setNormalizedBroadIndel( this.normalizedIndels .stream() .filter(CommonPredicates.isBroad.and(matchesTumour(tumourAliquotID))) .findFirst() .orElse(new VcfInfo()) .getFileName()); generator.setNormalizedDkfzEmblIndel( this.normalizedIndels .stream() .filter(CommonPredicates.isDkfzEmbl.and(matchesTumour(tumourAliquotID))) .findFirst() .orElse(new VcfInfo()) .getFileName()); generator.setNormalizedSangerIndel( this.normalizedIndels .stream() .filter(CommonPredicates.isSanger.and(matchesTumour(tumourAliquotID))) .findFirst() .orElse(new VcfInfo()) .getFileName()); List<Job> jobs = generator.doAnnotations( this, tInf.getAliquotID(), tInf.getTumourMinibamPath(), this.normalMinibamPath, this.updateFilesForUpload, parents); finalAnnotatorJobs.addAll(jobs); } return finalAnnotatorJobs; }
/** Build the workflow!! */ @Override public void buildWorkflow() { try { this.init(); // Pull the repo. Job configJob = GitUtils.gitConfig(this.getWorkflow(), this.GITname, this.GITemail); Job copy = this.copyCredentials(configJob); Job pullRepo = GitUtils.pullRepo( this.getWorkflow(), this.GITPemFile, this.JSONrepo, this.JSONrepoName, this.JSONlocation); pullRepo.addParent(copy); Job installTools = this.installTools(copy); // indicate job is in downloading stage. String pathToScripts = this.getWorkflowBaseDir() + "/scripts"; Job move2download = GitUtils.gitMove( "queued-jobs", "downloading-jobs", this.getWorkflow(), this.JSONlocation, this.JSONrepoName, this.JSONfolderName, this.GITname, this.GITemail, this.gitMoveTestMode, this.JSONfileName, pathToScripts, installTools, pullRepo); Job move2running; if (!skipDownload) { move2running = doDownload(pathToScripts, move2download); } else { // If user is skipping download, then we will just move directly to runnning... move2running = GitUtils.gitMove( "downloading-jobs", "running-jobs", this.getWorkflow(), this.JSONlocation, this.JSONrepoName, this.JSONfolderName, this.GITname, this.GITemail, this.gitMoveTestMode, this.JSONfileName, pathToScripts, move2download); } Job statFiles = statInputFiles(move2running); Job sangerPassFilter = this.passFilterWorkflow(Pipeline.sanger, statFiles); Job broadPassFilter = this.passFilterWorkflow(Pipeline.broad, statFiles); Job dkfzemblPassFilter = this.passFilterWorkflow(Pipeline.dkfz_embl, statFiles); Job smufinPassFilter = this.passFilterWorkflow(Pipeline.smufin, statFiles); // ...No, we're not going to filter the Muse SNV file. // update all filenames to include ".pass-filtered." Function<String, String> addPassFilteredSuffix = (x) -> { return x.replace(".vcf.gz", ".pass-filtered.vcf.gz"); }; for (VcfInfo vInfo : this.vcfs) { // ...except for MUSE filenames. if (vInfo.getOriginatingPipeline() != Pipeline.muse) { vInfo.setFileName(addPassFilteredSuffix.apply(vInfo.getFileName())); } } // OxoG will run after move2running. Move2running will run after all the jobs that perform // input file downloads and file preprocessing have finished. List<Job> preprocessIndelsJobs = new ArrayList<Job>(this.tumours.size() * 3); for (int i = 0; i < this.tumours.size(); i++) { String tumourAliquotID = tumours.get(i).getAliquotID(); final String vcfNotFoundToken = "VCF_NOT_FOUND"; BiFunction<String, Predicate<VcfInfo>, String> generateVcfName = (s, p) -> "/" + s + "/" + this.vcfs .stream() .filter(vcfMatchesTypePipelineTumour(isIndel, p, tumourAliquotID)) .map(m -> m.getFileName()) .findFirst() .orElse(vcfNotFoundToken); String sangerIndelVcfName = generateVcfName.apply(this.sangerGnosID, CommonPredicates.isSanger); if (!sangerIndelVcfName.endsWith(vcfNotFoundToken)) { Job sangerPreprocessVCF = this.preProcessIndelVCF( sangerPassFilter, Pipeline.sanger, sangerIndelVcfName, this.tumours.get(i).getAliquotID()); preprocessIndelsJobs.add(sangerPreprocessVCF); } String dkfzEmblIndelVcfName = generateVcfName.apply(this.dkfzemblGnosID, CommonPredicates.isDkfzEmbl); if (!dkfzEmblIndelVcfName.endsWith(vcfNotFoundToken)) { Job dkfzEmblPreprocessVCF = this.preProcessIndelVCF( dkfzemblPassFilter, Pipeline.dkfz_embl, dkfzEmblIndelVcfName, this.tumours.get(i).getAliquotID()); preprocessIndelsJobs.add(dkfzEmblPreprocessVCF); } String broadIndelVcfName = generateVcfName.apply(this.broadGnosID, CommonPredicates.isBroad); if (!broadIndelVcfName.endsWith(vcfNotFoundToken)) { Job broadPreprocessVCF = this.preProcessIndelVCF( broadPassFilter, Pipeline.broad, broadIndelVcfName, this.tumours.get(i).getAliquotID()); preprocessIndelsJobs.add(broadPreprocessVCF); } // smufin INDEL VCFs will be in /datastore/vcf/smufin - they will not be nested in a GNOS // ID-named directory. String smufinIndelVcfName = generateVcfName.apply("", CommonPredicates.isSmufin); if (!smufinIndelVcfName.endsWith(vcfNotFoundToken)) { Job smufinPreprocessVCF = this.preProcessIndelVCF( smufinPassFilter, Pipeline.smufin, smufinIndelVcfName, this.tumours.get(i).getAliquotID()); preprocessIndelsJobs.add(smufinPreprocessVCF); } } // TODO: This probably doesn't need to be a list anymore. List<Job> combineVCFJobs = new ArrayList<Job>(this.tumours.size()); Job combineVCFJob = this.combineVCFsByType( preprocessIndelsJobs.toArray(new Job[preprocessIndelsJobs.size()])); combineVCFJobs.add(combineVCFJob); List<Job> oxogJobs = new ArrayList<Job>(this.tumours.size()); for (int i = 0; i < this.tumours.size(); i++) { TumourInfo tInf = this.tumours.get(i); Job oxoG; if (i > 0) { // OxoG jobs can put a heavy CPU load on the system (in bursts) so probably better to run // them in sequence. // If there is > 1 OxoG job (i.e. a multi-tumour donor), each OxoG job should have the // prior OxoG job as its parent. oxoG = this.doOxoG( tInf.getTumourBamGnosID() + "/" + tInf.getTumourBAMFileName(), tInf.getAliquotID(), oxogJobs.get(i - 1)); } else { oxoG = this.doOxoG( tInf.getTumourBamGnosID() + "/" + tInf.getTumourBAMFileName(), tInf.getAliquotID()); } for (int j = 0; j < combineVCFJobs.size(); j++) { oxoG.addParent(combineVCFJobs.get(j)); } oxogJobs.add(oxoG); } String pathToNormalMinibam = "/datastore/bam/normal/" + (DownloadMethod.valueOf(this.bamDownloadMethod) != DownloadMethod.filesystemCopy ? this.normalBamGnosID + "/" : "") + this.normalBAMFileName; Job normalVariantBam = this.doVariantBam( BAMType.normal, pathToNormalMinibam, this.normalBAMFileName, this.normalAliquotID, combineVCFJobs.toArray(new Job[combineVCFJobs.size()])); List<Job> parentJobsToAnnotationJobs = new ArrayList<Job>(this.tumours.size()); // create a list of tumour variant-bam jobs. List<Job> variantBamJobs = new ArrayList<Job>(this.tumours.size() + 1); for (int i = 0; i < this.tumours.size(); i++) { TumourInfo tInfo = this.tumours.get(i); String pathToTumourMinibam = "/datastore/bam/tumour/" + (DownloadMethod.valueOf(this.bamDownloadMethod) != DownloadMethod.filesystemCopy ? tInfo.getTumourBamGnosID() + "/" : "") + tInfo.getTumourBAMFileName(); Job tumourVariantBam = this.doVariantBam( BAMType.tumour, pathToTumourMinibam, tInfo.getTumourBAMFileName(), tInfo.getAliquotID(), combineVCFJobs.toArray(new Job[combineVCFJobs.size()])); variantBamJobs.add(tumourVariantBam); } variantBamJobs.add(normalVariantBam); // Now that we've built our list of variantbam and oxog jobs, we can set up the proper // parent-child relationships between them. // The idea is to run 1 OxoG at the same time as 2 variantbam jobs. for (int i = 2; i < Math.max(variantBamJobs.size(), variantBamJobs.size()); i += 2) { variantBamJobs.get(i).addParent(variantBamJobs.get(i - 2)); if (i + 1 < variantBamJobs.size()) { variantBamJobs.get(i + 1).addParent(variantBamJobs.get(i - 2)); } } Job minibamSanityCheck = this.getWorkflow().createBashJob("Check minibams"); String moveToFailed = GitUtils.gitMoveCommand( "running-jobs", "failed-jobs", this.JSONlocation + "/" + this.JSONrepoName + "/" + this.JSONfolderName, this.JSONfileName, this.gitMoveTestMode, this.getWorkflowBaseDir() + "/scripts/"); // A list of all pass-filtered files that the minibams will be checked against. // NOTE: muse will not have any pass-filtered files and smufin will only have indel (or // SNV-from-indel) files. String filesToCheck = this.vcfs .stream() .filter( p -> p.getFileName().endsWith(".vcf.gz") && p.getFileName().contains("pass-filter")) .map( m -> "/datastore/vcf/" + m.getOriginatingPipeline().toString() + "/" + (m.getOriginatingPipeline() != Pipeline.smufin ? m.getPipelineGnosID() + "/" : "/") + m.getFileName()) .sorted() .reduce("", (a, b) -> a += " " + b); minibamSanityCheck.setCommand( "(bash " + pathToScripts + "/check_minibams.sh " + filesToCheck + ") || " + moveToFailed); variantBamJobs.stream().forEach(job -> minibamSanityCheck.addParent(job)); parentJobsToAnnotationJobs.add(minibamSanityCheck); // set up parent jobs to annotation jobs oxogJobs.stream().forEach(job -> parentJobsToAnnotationJobs.add(job)); List<Job> annotationJobs = new ArrayList<Job>(); if (!this.skipAnnotation) { annotationJobs = this.doAnnotations( parentJobsToAnnotationJobs.toArray(new Job[parentJobsToAnnotationJobs.size()])); } // Now do the Upload. The parents jobs are the Annotation jobs, unless the user set // skipAnnotations=true, so there will // not be any annotation jobs. In that case Upload's parents are the jobs that would have been // parents to Annotation. Job[] parentsToUpload = (annotationJobs != null && annotationJobs.size() > 0) ? annotationJobs.toArray(new Job[annotationJobs.size()]) : parentJobsToAnnotationJobs.toArray(new Job[parentJobsToAnnotationJobs.size()]); // indicate job is in uploading stage. Job move2uploading = this.gitMove("running-jobs", "uploading-jobs", parentsToUpload); Job uploadResults = doUpload(move2uploading); // indicate job is complete. this.gitMove("uploading-jobs", "completed-jobs", uploadResults); // System.out.println(this.filesForUpload); } catch (Exception e) { throw new RuntimeException("Exception caught: " + e.getMessage(), e); } }
/** * Runs the variant program inside the Broad's OxoG container to produce a mini-BAM for a given * BAM. * * @param parent * @param bamType - The type of BAM file to use. Determines the name of the output file. * @param bamPath - The path to the input BAM file. * @param tumourBAMFileName - Name of the BAM file. Only used if bamType == BAMType.tumour. * @param aliquotID * @return */ private Job doVariantBam( BAMType bamType, String bamPath, String tumourBAMFileName, String aliquotID, Job... parents) { Job runVariantbam; if (!this.skipVariantBam) { VariantBamJobGenerator variantBamJobGenerator = new VariantBamJobGenerator( this.JSONlocation, this.JSONrepoName, this.JSONfolderName, this.JSONfileName); variantBamJobGenerator.setGitMoveTestMode(this.gitMoveTestMode); variantBamJobGenerator.setIndelPadding(String.valueOf(this.indelPadding)); variantBamJobGenerator.setSnvPadding(String.valueOf(this.snvPadding)); variantBamJobGenerator.setSvPadding(String.valueOf(this.svPadding)); variantBamJobGenerator.setGitMoveTestMode(this.gitMoveTestMode); variantBamJobGenerator.setAliquotID(aliquotID); variantBamJobGenerator.setSnvVcf( mergedVcfs.stream().filter(isSnv).findFirst().get().getFileName()); variantBamJobGenerator.setSvVcf( mergedVcfs.stream().filter(isSv).findFirst().get().getFileName()); variantBamJobGenerator.setIndelVcf( mergedVcfs.stream().filter(isIndel).findFirst().get().getFileName()); UpdateBamForUpload<String, String, Boolean> updateFilesForUpload = (path, id, isLink) -> { if (id == null || id.trim().equals("")) { // only update the normalMinibamPath with the path to the actual BAM. // If you get a .bai file here, add it to filesForUpload, // but don't do anything else. // Also, if what was passed in has been indicated as a symlink, just add // to filesForUpload but do nothing else. if (path.endsWith(".bam") && !isLink) { this.normalMinibamPath = path; } this.filesForUpload.add(path); } else { for (TumourInfo tInfo : this.tumours) { if (tInfo.getAliquotID().equals(id)) { // Set the tumour minibam path only to the BAM file. // If you get a .bai file here, add it to filesForUpload, // but don't do anything else. // Also, if what was passed in has been indicated as a symlink, just add // to filesForUpload but do nothing else. if (path.endsWith(".bam") && !isLink) { tInfo.setTumourMinibamPath(path); } filesForUpload.add(path); } } } }; String bamName = (bamType == BAMType.normal ? this.normalBAMFileName : tumourBAMFileName); runVariantbam = variantBamJobGenerator.doVariantBam( this, bamType, bamName, bamPath, tumourBAMFileName, updateFilesForUpload, parents); } else { runVariantbam = this.getWorkflow() .createBashJob( "run " + bamType + (bamType == BAMType.tumour ? "_" + aliquotID + "_" : "") + " variantbam"); Arrays.stream(parents).forEach(parent -> runVariantbam.addParent(parent)); } return runVariantbam; }