protected String makeRegionFile(Context context, ChromosomeRange r, PreprocessingTools tools, String region)
        throws URISyntaxException, IOException, InterruptedException {
    // without an exome BED filter, write the Picard region file directly;
    // otherwise intersect the region with the exome BED file first
    if (filterBedFile == null) {
        r.writeToPicardRegionFile(region);
    } else {
        String exomebed = tmpFileBase + "exome.bed";
        if (filterBedFile.endsWith(".gz"))
            exomebed += ".gz";
        HalvadeFileUtils.downloadFileFromHDFS(context,
                FileSystem.get(new URI(filterBedFile), context.getConfiguration()),
                filterBedFile, exomebed);
        if (exomebed.endsWith(".gz"))
            exomebed = HalvadeFileUtils.Unzip(exomebed);
        region = tools.filterExomeBed(exomebed, r);
        if (region == null) {
            Logger.DEBUG("empty region file, no vcf results!!");
            return null;
        }
        HalvadeFileUtils.removeLocalFile(keep, exomebed);
    }
    return region;
}
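/**
 * Runs the GATK SplitNCigarReads step on the input BAM (splitting reads that
 * span splice junctions) and removes the input BAM and its index afterwards.
 */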
protected void splitNTrim(Context context, String region, GATKTools gatk, String input, String output)
        throws InterruptedException {
    Logger.DEBUG("run SplitNCigarReads");
    context.setStatus("run SplitNCigarReads");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runSplitNCigarReads(input, output, ref, region, newMaxQualScore);
    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    // replaceAll takes a regex, so anchor the pattern: an unescaped ".bam"
    // could match elsewhere in the path
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll("\\.bam$", ".bai"));
}
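/**
 * Preprocesses the aligned reads with elPrep, either file-based (when
 * intermediate files are kept) or streamed, then optionally runs
 * featureCounts, converts the result to BAM and builds the index.
 */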
protected void elPrepPreprocess(Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
        throws InterruptedException, IOException, QualityException, URISyntaxException {
    String dictF = ref.substring(0, ref.lastIndexOf('.')) + ".dict";
    String rg = createReadGroupRecordString(RGID, RGLB, RGPL, RGPU, RGSM);
    String preSamOut = tmpFileBase + "-p1.sam";
    String samOut = tmpFileBase + "-p2.sam";
    String fCounts = tmpFileBase + "-features.count";
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

    Logger.DEBUG("call elPrep");
    context.setStatus("call elPrep");
    int reads;
    if (keep) {
        reads = tools.callElPrep(preSamOut, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    } else {
        reads = tools.streamElPrep(context, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    }
    Logger.DEBUG(reads + " reads processed in elPrep");
    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);

    if (gff != null) {
        Logger.DEBUG("featureCounts");
        context.setStatus("featureCounts");
        tools.runFeatureCounts(gff, samOut, fCounts, threads);
        HalvadeFileUtils.uploadFileToHDFS(context,
                FileSystem.get(new URI(outputdir), context.getConfiguration()),
                fCounts, outputdir + context.getTaskAttemptID().toString() + ".count");
    }

    context.setStatus("convert SAM to BAM");
    Logger.DEBUG("convert SAM to BAM");
    tools.callSAMToBAM(samOut, output, threads);
    context.setStatus("build bam index");
    Logger.DEBUG("build bam index");
    tools.runBuildBamIndex(output);

    // remove temporary files
    HalvadeFileUtils.removeLocalFile(keep, preSamOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, samOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
}
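/**
 * Calls variants on the preprocessed RNA-seq BAM with the GATK
 * HaplotypeCaller and cleans up the input BAM and its index.
 */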
protected void RnaVariantCalling(Context context, String region, GATKTools gatk, String input, String output)
        throws InterruptedException {
    // choose between UnifiedGenotyper and HaplotypeCaller
    // (currently only HaplotypeCaller is called)
    Logger.DEBUG("run variantCaller");
    context.setStatus("run variantCaller");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runHaplotypeCaller(input, output, true, scc, sec, ref, null, region);
    context.setStatus("cleanup");
    context.getCounter(HalvadeCounters.OUT_VCF_FILES).increment(1);
    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll("\\.bam$", ".bai"));
}
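/**
 * Sets up a BWA aln instance for this task: derives the task id from the
 * attempt id, downloads the BWA index and reads the custom "bwa aln"
 * arguments from the configuration.
 */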
private BWAAlnInstance(Mapper.Context context, String bin) throws IOException, URISyntaxException {
    super(context, bin);
    taskId = context.getTaskAttemptID().toString();
    taskId = taskId.substring(taskId.indexOf("m_"));
    ref = HalvadeFileUtils.downloadBWAIndex(context, taskId);
    alnCustomArgs = HalvadeConf.getCustomArgs(context.getConfiguration(), "bwa", "aln");
}
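/**
 * Runs the GATK VariantAnnotator on the input VCF for the given region and
 * removes the input file afterwards.
 */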
protected void annotateVariants(Context context, String region, GATKTools gatk, String input, String output)
        throws InterruptedException {
    Logger.DEBUG("run VariantAnnotator");
    context.setStatus("run VariantAnnotator");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runVariantAnnotator(input, output, ref, region);
    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
}
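/**
 * Runs GATK VariantFiltration on the called variants, using the configured
 * window/cluster sizes and FS/QD thresholds.
 */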
// TODO improve annotate/filter
protected void filterVariants(Context context, String region, GATKTools gatk, String input, String output)
        throws InterruptedException {
    Logger.DEBUG("run VariantFiltration");
    context.setStatus("run VariantFiltration");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runVariantFiltration(input, output, ref, region, windows, cluster, minFS, maxQD);
    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
}
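/**
 * Runs the two-step GATK indel realignment: RealignerTargetCreator to find
 * candidate intervals, then IndelRealigner to realign reads around them.
 */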
protected void indelRealignment(Context context, String region, GATKTools gatk, String input, String output)
        throws InterruptedException {
    String targets = tmpFileBase + ".intervals";
    Logger.DEBUG("run RealignerTargetCreator");
    context.setStatus("run RealignerTargetCreator");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runRealignerTargetCreator(input, targets, ref, region);
    Logger.DEBUG("run IndelRealigner");
    context.setStatus("run IndelRealigner");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runIndelRealigner(input, targets, output, ref, region);
    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll("\\.bam$", ".bai"));
    HalvadeFileUtils.removeLocalFile(keep, targets, context, HalvadeCounters.FOUT_GATK_TMP);
}
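/**
 * Finishes the BWA aln/samXe pipeline: closes the last aln process, runs
 * sampe/samse on the produced .sai files and deletes the temporary
 * fastq/sai files.
 */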
@Override
public void closeAligner() throws InterruptedException {
    // close the last BWA aln process, then start sampe/samse on its output
    closeBWAAln();
    startBWASamXe();
    ssh.join();
    int error = samxe.waitForCompletion();
    if (error != 0) {
        throw new ProcessException("BWA samXe", error);
    }
    context.getCounter(HalvadeCounters.TIME_BWA_SAMPE).increment(samxe.getExecutionTime());
    // remove all temporary fastq/sai files
    HalvadeFileUtils.removeLocalFile(keep, getFileName(tmpdir, taskId, true, 1), context, HalvadeCounters.FOUT_BWA_TMP);
    HalvadeFileUtils.removeLocalFile(keep, getFileName(tmpdir, taskId, false, 1), context, HalvadeCounters.FOUT_BWA_TMP);
    HalvadeFileUtils.removeLocalFile(keep, getFileName(tmpdir, taskId, true, 2), context, HalvadeCounters.FOUT_BWA_TMP);
    HalvadeFileUtils.removeLocalFile(keep, getFileName(tmpdir, taskId, false, 2), context, HalvadeCounters.FOUT_BWA_TMP);
    instance = null;
}
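/**
 * Runs GATK base quality score recalibration: BaseRecalibrator builds the
 * recalibration table from the known-sites databases, then PrintReads writes
 * the recalibrated BAM.
 */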
protected void baseQualityScoreRecalibration(Context context, String region, ChromosomeRange r,
        PreprocessingTools tools, GATKTools gatk, String input, String output)
        throws InterruptedException, IOException, URISyntaxException {
    String table = tmpFileBase + ".table";

    // get the known-sites SNP database(s), optionally filtered to this region
    String[] snpslocal = HalvadeFileUtils.downloadSites(context, taskId);
    String[] newKnownSites = new String[snpslocal.length];
    for (int i = 0; i < snpslocal.length; i++) {
        if (filterDBsnp) {
            newKnownSites[i] = tools.filterDBSnps(ref.replaceAll("fasta$", "dict"), snpslocal[i], r, tmpFileBase, threads);
        } else {
            newKnownSites[i] = snpslocal[i];
        }
        if (newKnownSites[i].endsWith(".gz")) {
            newKnownSites[i] = HalvadeFileUtils.Unzip(newKnownSites[i]);
        }
    }

    // the bam index should be created automatically by GATK v3.0 or higher
    // Logger.DEBUG("build bam index");
    // context.setStatus("build bam index");
    // tools.runBuildBamIndex(tmpFile1);

    Logger.DEBUG("run baseRecalibrator");
    context.setStatus("run baseRecalibrator");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runBaseRecalibrator(input, table, ref, newKnownSites, region);

    Logger.DEBUG("run printReads");
    context.setStatus("run printReads");
    context.getCounter(HalvadeCounters.TOOLS_GATK).increment(1);
    gatk.runPrintReads(input, output, ref, table, region);

    HalvadeFileUtils.removeLocalFile(keep, input, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, input.replaceAll("\\.bam$", ".bai"));
    HalvadeFileUtils.removeLocalFile(keep, table, context, HalvadeCounters.FOUT_GATK_TMP);
    for (int i = 0; i < newKnownSites.length; i++) {
        if (filterDBsnp) {
            HalvadeFileUtils.removeLocalFile(keep, newKnownSites[i], context, HalvadeCounters.FOUT_GATK_TMP);
        }
    }
}
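/**
 * Reads the job configuration: RNA mode, calling confidence thresholds, the
 * GFF/BED filter files and redistribution settings. When redistribution is
 * enabled and fewer tasks remain than containers, the thread count is set to 6.
 */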
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    isFirstAttempt = taskId.endsWith("_0");
    isRNA = HalvadeConf.getIsRNA(context.getConfiguration());
    scc = HalvadeConf.getSCC(context.getConfiguration(), isRNA);
    sec = HalvadeConf.getSEC(context.getConfiguration(), isRNA);
    try {
        gff = HalvadeFileUtils.downloadGFF(context, taskId);
    } catch (URISyntaxException ex) {
        Logger.EXCEPTION(ex);
        throw new InterruptedException("Error when downloading GFF file");
    }
    filterBedFile = HalvadeConf.getFilterBed(context.getConfiguration());
    filterDBsnp = HalvadeConf.getFilterDBSnp(context.getConfiguration());
    useUnifiedGenotyper = HalvadeConf.getUseUnifiedGenotyper(context.getConfiguration());
    redistribute = HalvadeConf.getRedistribute(context.getConfiguration());
    containers = HalvadeConf.getMapContainerCount(context.getConfiguration());
    // number of map tasks still to run for this job
    tasksLeft = Integer.parseInt(context.getConfiguration().get("mapred.map.tasks")) - taskNr;
    if (redistribute && tasksLeft < containers) {
        threads = 6;
    }
}
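/**
 * Picard-based alternative to elPrepPreprocess: writes the reads to a BAM
 * with coordinate sort order, runs CleanSam and MarkDuplicates, optionally
 * featureCounts, adds the read group (or converts the SAM back to BAM) and
 * builds the index.
 */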
protected void PicardPreprocess(Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
        throws InterruptedException, QualityException, IOException, URISyntaxException {
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    // tmp files
    String tmpOut1 = tmpFileBase + "-p1.bam";
    String tmpOut2 = tmpFileBase + "-p2.bam";
    String tmpOut3 = tmpFileBase + "-p3.sam";
    String fCounts = tmpFileBase + "-features.count";
    String tmpMetrics = tmpFileBase + "-p3-metrics.txt";
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    if (!inputIsBam) {
        outHeader.addReadGroup(bamrg);
    }
    SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));

    long startTime = System.currentTimeMillis();
    int count = 0;
    SAMRecord sam;
    while (input.hasNext()) {
        sam = input.next();
        writer.addAlignment(sam);
        count++;
    }
    int reads = input.getCount();
    writer.close();
    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    long estimatedTime = System.currentTimeMillis() - startTime;
    context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
    Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);

    Logger.DEBUG("clean sam");
    context.setStatus("clean sam");
    tools.runCleanSam(tmpOut1, tmpOut2);
    Logger.DEBUG("mark duplicates");
    context.setStatus("mark duplicates");
    tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);

    if (gff != null) {
        // tmpOut3 is a SAM file, as expected by featureCounts
        Logger.DEBUG("featureCounts");
        context.setStatus("featureCounts");
        tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
        HalvadeFileUtils.uploadFileToHDFS(context,
                FileSystem.get(new URI(outputdir), context.getConfiguration()),
                fCounts, outputdir + context.getTaskAttemptID().toString() + ".count");
    }

    if (!inputIsBam) {
        Logger.DEBUG("add read-group");
        context.setStatus("add read-group");
        tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
    } else {
        context.setStatus("convert SAM to BAM");
        Logger.DEBUG("convert SAM to BAM");
        tools.callSAMToBAM(tmpOut3, output, threads);
    }
    Logger.DEBUG("build bam index");
    context.setStatus("build bam index");
    tools.runBuildBamIndex(output);
    estimatedTime = System.currentTimeMillis() - startTime;
    Logger.DEBUG("estimated time: " + estimatedTime / 1000);

    // remove all temporary files now!
    HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
}