/*
 * Yes, install tabix as a part of the workflow. It's not in the seqware_whitestar or
 * seqware_whitestar_pancancer container, so install it here.
 */
private Job installTools(Job parent) {
    Job toolInstallJob = this.getWorkflow().createBashJob("install tools");
    toolInstallJob.setCommand("sudo apt-get install tabix libstring-random-perl -y ");
    toolInstallJob.addParent(parent);
    return toolInstallJob;
}
private Job statInputFiles(Job parent) { Job statFiles = this.getWorkflow().createBashJob("stat downloaded input files"); String moveToFailed = GitUtils.gitMoveCommand( "running-jobs", "failed-jobs", this.JSONlocation + "/" + this.JSONrepoName + "/" + this.JSONfolderName, this.JSONfileName, this.gitMoveTestMode, this.getWorkflowBaseDir() + "/scripts/"); String statFilesCMD = "( "; for (VcfInfo vcfInfo : this.vcfs) { boolean useGnosIdInPath = vcfInfo.getOriginatingPipeline() != Pipeline.smufin && this.pipelineDownloadMethods.get(vcfInfo.getOriginatingPipeline()) != DownloadMethod.filesystemCopy; // Remember: smufin won't have a gnos ID so don't try to use that in the path for stat. String prefix = "stat /datastore/vcf/" + vcfInfo.getOriginatingPipeline().toString() + "/" + (useGnosIdInPath ? vcfInfo.getPipelineGnosID() + "/" : ""); statFilesCMD += prefix + vcfInfo.getFileName() + " && \\\n"; statFilesCMD += prefix + vcfInfo.getIndexFileName() + " && \\\n"; } // stat all tumour BAMS for (int i = 0; i < this.tumours.size(); i++) { String prefix = "stat /datastore/bam/" + BAMType.tumour.toString() + "/" + (!this.bamDownloadMethod.equals(DownloadMethod.filesystemCopy.toString()) ? this.tumours.get(i).getTumourBamGnosID() + "/" : ""); statFilesCMD += prefix + this.tumours.get(i).getTumourBAMFileName() + " && \\\n"; statFilesCMD += prefix + this.tumours.get(i).getTumourBamIndexFileName() + " && \\\n"; } String normalPrefix = "stat /datastore/bam/" + BAMType.normal.toString() + "/" + (!this.bamDownloadMethod.equals(DownloadMethod.filesystemCopy.toString()) ? this.normalBamGnosID + "/" : ""); statFilesCMD += normalPrefix + this.normalBAMFileName + " && \\\n"; statFilesCMD += normalPrefix + this.normalBamIndexFileName + " \\\n"; statFilesCMD += " ) || " + moveToFailed; statFiles.setCommand(statFilesCMD); statFiles.addParent(parent); return statFiles; }
/**
 * Copy the credentials files from ~/.gnos to /datastore/credentials. When the VCF download
 * method is s3, a follow-up job also installs the AWS credentials under ~/.aws.
 *
 * @param parentJob the job to run before the copy
 * @return the last credentials-setup job (the s3 setup job when applicable, otherwise the copy)
 */
private Job copyCredentials(Job parentJob) {
    Job credentialCopyJob = this.getWorkflow().createBashJob("copy /home/ubuntu/.gnos");
    credentialCopyJob.setCommand(
        "mkdir /datastore/credentials && cp -r /home/ubuntu/.gnos/* /datastore/credentials && ls -l /datastore/credentials");
    credentialCopyJob.addParent(parentJob);
    // Guard clause: non-s3 downloads need no extra AWS setup.
    if (!this.vcfDownloadMethod.equals(DownloadMethod.s3.toString())) {
        return credentialCopyJob;
    }
    Job awsSetupJob = this.getWorkflow().createBashJob("s3 credentials setup");
    awsSetupJob.setCommand(
        "mkdir ~/.aws && cp /datastore/credentials/aws_credentials ~/.aws/credentials");
    awsSetupJob.addParent(credentialCopyJob);
    return awsSetupJob;
}
/**
 * Build the workflow!!
 *
 * <p>High-level job graph (each step is a SeqWare Job chained to its predecessors): git config
 * → copy credentials → pull the state-tracking JSON repo and install tools → git-move the job
 * file to "downloading-jobs" → download inputs (unless skipDownload) → git-move to
 * "running-jobs" → stat all inputs → pass-filter VCFs → preprocess INDEL VCFs per tumour →
 * combine VCFs → OxoG + variantbam (mini-BAM) jobs → minibam sanity check → annotation (unless
 * skipAnnotation) → git-move to "uploading-jobs" → upload → git-move to "completed-jobs".
 * Any exception is wrapped in a RuntimeException.
 */
@Override
public void buildWorkflow() {
    try {
        this.init();
        // Pull the repo.
        Job configJob = GitUtils.gitConfig(this.getWorkflow(), this.GITname, this.GITemail);
        Job copy = this.copyCredentials(configJob);
        Job pullRepo =
            GitUtils.pullRepo(
                this.getWorkflow(),
                this.GITPemFile,
                this.JSONrepo,
                this.JSONrepoName,
                this.JSONlocation);
        pullRepo.addParent(copy);
        Job installTools = this.installTools(copy);
        // indicate job is in downloading stage.
        String pathToScripts = this.getWorkflowBaseDir() + "/scripts";
        Job move2download =
            GitUtils.gitMove(
                "queued-jobs",
                "downloading-jobs",
                this.getWorkflow(),
                this.JSONlocation,
                this.JSONrepoName,
                this.JSONfolderName,
                this.GITname,
                this.GITemail,
                this.gitMoveTestMode,
                this.JSONfileName,
                pathToScripts,
                installTools,
                pullRepo);
        Job move2running;
        if (!skipDownload) {
            move2running = doDownload(pathToScripts, move2download);
        } else {
            // If the user is skipping download, then we will just move directly to running...
            move2running =
                GitUtils.gitMove(
                    "downloading-jobs",
                    "running-jobs",
                    this.getWorkflow(),
                    this.JSONlocation,
                    this.JSONrepoName,
                    this.JSONfolderName,
                    this.GITname,
                    this.GITemail,
                    this.gitMoveTestMode,
                    this.JSONfileName,
                    pathToScripts,
                    move2download);
        }
        Job statFiles = statInputFiles(move2running);
        // Pass-filter each pipeline's VCFs in parallel, all gated on the stat job.
        Job sangerPassFilter = this.passFilterWorkflow(Pipeline.sanger, statFiles);
        Job broadPassFilter = this.passFilterWorkflow(Pipeline.broad, statFiles);
        Job dkfzemblPassFilter = this.passFilterWorkflow(Pipeline.dkfz_embl, statFiles);
        Job smufinPassFilter = this.passFilterWorkflow(Pipeline.smufin, statFiles);
        // ...No, we're not going to filter the Muse SNV file.
        // update all filenames to include ".pass-filtered."
        Function<String, String> addPassFilteredSuffix =
            (x) -> {
                return x.replace(".vcf.gz", ".pass-filtered.vcf.gz");
            };
        for (VcfInfo vInfo : this.vcfs) {
            // ...except for MUSE filenames.
            if (vInfo.getOriginatingPipeline() != Pipeline.muse) {
                vInfo.setFileName(addPassFilteredSuffix.apply(vInfo.getFileName()));
            }
        }
        // OxoG will run after move2running. Move2running will run after all the jobs that perform
        // input file downloads and file preprocessing have finished.
        // Sized for up to 4 pipelines' INDEL preprocessing per tumour (capacity hint only).
        List<Job> preprocessIndelsJobs = new ArrayList<Job>(this.tumours.size() * 3);
        for (int i = 0; i < this.tumours.size(); i++) {
            String tumourAliquotID = tumours.get(i).getAliquotID();
            final String vcfNotFoundToken = "VCF_NOT_FOUND";
            // Builds "/<gnosID>/<indel vcf name>" for this tumour + pipeline predicate, or a
            // sentinel suffix when no matching INDEL VCF exists.
            BiFunction<String, Predicate<VcfInfo>, String> generateVcfName =
                (s, p) ->
                    "/"
                        + s
                        + "/"
                        + this.vcfs
                            .stream()
                            .filter(vcfMatchesTypePipelineTumour(isIndel, p, tumourAliquotID))
                            .map(m -> m.getFileName())
                            .findFirst()
                            .orElse(vcfNotFoundToken);
            String sangerIndelVcfName =
                generateVcfName.apply(this.sangerGnosID, CommonPredicates.isSanger);
            if (!sangerIndelVcfName.endsWith(vcfNotFoundToken)) {
                Job sangerPreprocessVCF =
                    this.preProcessIndelVCF(
                        sangerPassFilter,
                        Pipeline.sanger,
                        sangerIndelVcfName,
                        this.tumours.get(i).getAliquotID());
                preprocessIndelsJobs.add(sangerPreprocessVCF);
            }
            String dkfzEmblIndelVcfName =
                generateVcfName.apply(this.dkfzemblGnosID, CommonPredicates.isDkfzEmbl);
            if (!dkfzEmblIndelVcfName.endsWith(vcfNotFoundToken)) {
                Job dkfzEmblPreprocessVCF =
                    this.preProcessIndelVCF(
                        dkfzemblPassFilter,
                        Pipeline.dkfz_embl,
                        dkfzEmblIndelVcfName,
                        this.tumours.get(i).getAliquotID());
                preprocessIndelsJobs.add(dkfzEmblPreprocessVCF);
            }
            String broadIndelVcfName =
                generateVcfName.apply(this.broadGnosID, CommonPredicates.isBroad);
            if (!broadIndelVcfName.endsWith(vcfNotFoundToken)) {
                Job broadPreprocessVCF =
                    this.preProcessIndelVCF(
                        broadPassFilter,
                        Pipeline.broad,
                        broadIndelVcfName,
                        this.tumours.get(i).getAliquotID());
                preprocessIndelsJobs.add(broadPreprocessVCF);
            }
            // smufin INDEL VCFs will be in /datastore/vcf/smufin - they will not be nested in a GNOS
            // ID-named directory.
            String smufinIndelVcfName = generateVcfName.apply("", CommonPredicates.isSmufin);
            if (!smufinIndelVcfName.endsWith(vcfNotFoundToken)) {
                Job smufinPreprocessVCF =
                    this.preProcessIndelVCF(
                        smufinPassFilter,
                        Pipeline.smufin,
                        smufinIndelVcfName,
                        this.tumours.get(i).getAliquotID());
                preprocessIndelsJobs.add(smufinPreprocessVCF);
            }
        }
        // TODO: This probably doesn't need to be a list anymore.
        List<Job> combineVCFJobs = new ArrayList<Job>(this.tumours.size());
        Job combineVCFJob =
            this.combineVCFsByType(
                preprocessIndelsJobs.toArray(new Job[preprocessIndelsJobs.size()]));
        combineVCFJobs.add(combineVCFJob);
        List<Job> oxogJobs = new ArrayList<Job>(this.tumours.size());
        for (int i = 0; i < this.tumours.size(); i++) {
            TumourInfo tInf = this.tumours.get(i);
            Job oxoG;
            if (i > 0) {
                // OxoG jobs can put a heavy CPU load on the system (in bursts) so probably better to run
                // them in sequence.
                // If there is > 1 OxoG job (i.e. a multi-tumour donor), each OxoG job should have the
                // prior OxoG job as its parent.
                oxoG =
                    this.doOxoG(
                        tInf.getTumourBamGnosID() + "/" + tInf.getTumourBAMFileName(),
                        tInf.getAliquotID(),
                        oxogJobs.get(i - 1));
            } else {
                oxoG =
                    this.doOxoG(
                        tInf.getTumourBamGnosID() + "/" + tInf.getTumourBAMFileName(),
                        tInf.getAliquotID());
            }
            // Every OxoG job also waits for the combined VCFs.
            for (int j = 0; j < combineVCFJobs.size(); j++) {
                oxoG.addParent(combineVCFJobs.get(j));
            }
            oxogJobs.add(oxoG);
        }
        // filesystemCopy inputs are not nested under a GNOS-ID directory (see statInputFiles).
        String pathToNormalMinibam =
            "/datastore/bam/normal/"
                + (DownloadMethod.valueOf(this.bamDownloadMethod) != DownloadMethod.filesystemCopy
                    ? this.normalBamGnosID + "/"
                    : "")
                + this.normalBAMFileName;
        Job normalVariantBam =
            this.doVariantBam(
                BAMType.normal,
                pathToNormalMinibam,
                this.normalBAMFileName,
                this.normalAliquotID,
                combineVCFJobs.toArray(new Job[combineVCFJobs.size()]));
        List<Job> parentJobsToAnnotationJobs = new ArrayList<Job>(this.tumours.size());
        // create a list of tumour variant-bam jobs.
        List<Job> variantBamJobs = new ArrayList<Job>(this.tumours.size() + 1);
        for (int i = 0; i < this.tumours.size(); i++) {
            TumourInfo tInfo = this.tumours.get(i);
            String pathToTumourMinibam =
                "/datastore/bam/tumour/"
                    + (DownloadMethod.valueOf(this.bamDownloadMethod)
                            != DownloadMethod.filesystemCopy
                        ? tInfo.getTumourBamGnosID() + "/"
                        : "")
                    + tInfo.getTumourBAMFileName();
            Job tumourVariantBam =
                this.doVariantBam(
                    BAMType.tumour,
                    pathToTumourMinibam,
                    tInfo.getTumourBAMFileName(),
                    tInfo.getAliquotID(),
                    combineVCFJobs.toArray(new Job[combineVCFJobs.size()]));
            variantBamJobs.add(tumourVariantBam);
        }
        variantBamJobs.add(normalVariantBam);
        // Now that we've built our list of variantbam and oxog jobs, we can set up the proper
        // parent-child relationships between them.
        // The idea is to run 1 OxoG at the same time as 2 variantbam jobs.
        // NOTE(review): Math.max of the same expression twice is a no-op — this is just
        // variantBamJobs.size(). Possibly one argument was meant to be a different list's size
        // (e.g. oxogJobs.size()); confirm intent before simplifying.
        for (int i = 2; i < Math.max(variantBamJobs.size(), variantBamJobs.size()); i += 2) {
            variantBamJobs.get(i).addParent(variantBamJobs.get(i - 2));
            if (i + 1 < variantBamJobs.size()) {
                variantBamJobs.get(i + 1).addParent(variantBamJobs.get(i - 2));
            }
        }
        Job minibamSanityCheck = this.getWorkflow().createBashJob("Check minibams");
        String moveToFailed =
            GitUtils.gitMoveCommand(
                "running-jobs",
                "failed-jobs",
                this.JSONlocation + "/" + this.JSONrepoName + "/" + this.JSONfolderName,
                this.JSONfileName,
                this.gitMoveTestMode,
                this.getWorkflowBaseDir() + "/scripts/");
        // A list of all pass-filtered files that the minibams will be checked against.
        // NOTE: muse will not have any pass-filtered files and smufin will only have indel (or
        // SNV-from-indel) files.
        // NOTE(review): the smufin branch appends "/" after a prefix that already ends in "/",
        // producing ".../smufin//<file>". Harmless to most shells/filesystems, but inconsistent
        // with statInputFiles, which uses "" there — confirm and align.
        String filesToCheck =
            this.vcfs
                .stream()
                .filter(
                    p ->
                        p.getFileName().endsWith(".vcf.gz")
                            && p.getFileName().contains("pass-filter"))
                .map(
                    m ->
                        "/datastore/vcf/"
                            + m.getOriginatingPipeline().toString()
                            + "/"
                            + (m.getOriginatingPipeline() != Pipeline.smufin
                                ? m.getPipelineGnosID() + "/"
                                : "/")
                            + m.getFileName())
                .sorted()
                // The "a +=" here is just string concatenation in a reduce; the accumulator lambda
                // builds a single space-separated argument string (with a leading space).
                .reduce("", (a, b) -> a += " " + b);
        minibamSanityCheck.setCommand(
            "(bash " + pathToScripts + "/check_minibams.sh " + filesToCheck + ") || " + moveToFailed);
        variantBamJobs.stream().forEach(job -> minibamSanityCheck.addParent(job));
        parentJobsToAnnotationJobs.add(minibamSanityCheck);
        // set up parent jobs to annotation jobs
        oxogJobs.stream().forEach(job -> parentJobsToAnnotationJobs.add(job));
        List<Job> annotationJobs = new ArrayList<Job>();
        if (!this.skipAnnotation) {
            annotationJobs =
                this.doAnnotations(
                    parentJobsToAnnotationJobs.toArray(new Job[parentJobsToAnnotationJobs.size()]));
        }
        // Now do the Upload. The parents jobs are the Annotation jobs, unless the user set
        // skipAnnotations=true, so there will
        // not be any annotation jobs. In that case Upload's parents are the jobs that would have been
        // parents to Annotation.
        Job[] parentsToUpload =
            (annotationJobs != null && annotationJobs.size() > 0)
                ? annotationJobs.toArray(new Job[annotationJobs.size()])
                : parentJobsToAnnotationJobs.toArray(new Job[parentJobsToAnnotationJobs.size()]);
        // indicate job is in uploading stage.
        Job move2uploading = this.gitMove("running-jobs", "uploading-jobs", parentsToUpload);
        Job uploadResults = doUpload(move2uploading);
        // indicate job is complete.
        this.gitMove("uploading-jobs", "completed-jobs", uploadResults);
        // System.out.println(this.filesForUpload);
    } catch (Exception e) {
        throw new RuntimeException("Exception caught: " + e.getMessage(), e);
    }
}
/** * Runs the variant program inside the Broad's OxoG container to produce a mini-BAM for a given * BAM. * * @param parent * @param bamType - The type of BAM file to use. Determines the name of the output file. * @param bamPath - The path to the input BAM file. * @param tumourBAMFileName - Name of the BAM file. Only used if bamType == BAMType.tumour. * @param aliquotID * @return */ private Job doVariantBam( BAMType bamType, String bamPath, String tumourBAMFileName, String aliquotID, Job... parents) { Job runVariantbam; if (!this.skipVariantBam) { VariantBamJobGenerator variantBamJobGenerator = new VariantBamJobGenerator( this.JSONlocation, this.JSONrepoName, this.JSONfolderName, this.JSONfileName); variantBamJobGenerator.setGitMoveTestMode(this.gitMoveTestMode); variantBamJobGenerator.setIndelPadding(String.valueOf(this.indelPadding)); variantBamJobGenerator.setSnvPadding(String.valueOf(this.snvPadding)); variantBamJobGenerator.setSvPadding(String.valueOf(this.svPadding)); variantBamJobGenerator.setGitMoveTestMode(this.gitMoveTestMode); variantBamJobGenerator.setAliquotID(aliquotID); variantBamJobGenerator.setSnvVcf( mergedVcfs.stream().filter(isSnv).findFirst().get().getFileName()); variantBamJobGenerator.setSvVcf( mergedVcfs.stream().filter(isSv).findFirst().get().getFileName()); variantBamJobGenerator.setIndelVcf( mergedVcfs.stream().filter(isIndel).findFirst().get().getFileName()); UpdateBamForUpload<String, String, Boolean> updateFilesForUpload = (path, id, isLink) -> { if (id == null || id.trim().equals("")) { // only update the normalMinibamPath with the path to the actual BAM. // If you get a .bai file here, add it to filesForUpload, // but don't do anything else. // Also, if what was passed in has been indicated as a symlink, just add // to filesForUpload but do nothing else. 
if (path.endsWith(".bam") && !isLink) { this.normalMinibamPath = path; } this.filesForUpload.add(path); } else { for (TumourInfo tInfo : this.tumours) { if (tInfo.getAliquotID().equals(id)) { // Set the tumour minibam path only to the BAM file. // If you get a .bai file here, add it to filesForUpload, // but don't do anything else. // Also, if what was passed in has been indicated as a symlink, just add // to filesForUpload but do nothing else. if (path.endsWith(".bam") && !isLink) { tInfo.setTumourMinibamPath(path); } filesForUpload.add(path); } } } }; String bamName = (bamType == BAMType.normal ? this.normalBAMFileName : tumourBAMFileName); runVariantbam = variantBamJobGenerator.doVariantBam( this, bamType, bamName, bamPath, tumourBAMFileName, updateFilesForUpload, parents); } else { runVariantbam = this.getWorkflow() .createBashJob( "run " + bamType + (bamType == BAMType.tumour ? "_" + aliquotID + "_" : "") + " variantbam"); Arrays.stream(parents).forEach(parent -> runVariantbam.addParent(parent)); } return runVariantbam; }