/**
 * Instantiates multiple underlying SAM writers, one per input SAM reader registered with the GATK
 * engine (those are retrieved from <code>toolkit</code>). Each output file name is generated
 * automatically by stripping a trailing ".sam" or ".bam" (matched case-insensitively) off the
 * input file name and appending <code>ext</code> instead (e.g. ".cleaned.bam").
 *
 * @param toolkit the engine whose reads data source supplies the reader IDs and input files
 * @param ext the suffix appended to each input name once its ".sam"/".bam" suffix is removed
 * @param order passed through unchanged to <code>addWriter</code>
 * @param presorted passed through unchanged to <code>addWriter</code>
 * @param indexOnTheFly passed through unchanged to <code>addWriter</code>
 * @param generateMD5 passed through unchanged to <code>addWriter</code>
 * @param pRecord passed through unchanged to <code>addWriter</code>
 * @throws UserException.BadInput if an input file name ends with neither ".sam" nor ".bam"
 * @throws StingException if a writer is already registered for one of the reader IDs
 */
public void setupByReader(
    GenomeAnalysisEngine toolkit,
    String ext,
    SAMFileHeader.SortOrder order,
    boolean presorted,
    boolean indexOnTheFly,
    boolean generateMD5,
    SAMProgramRecord pRecord) {
  final int suffixLength = 4; // length of ".bam" / ".sam"

  for (SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs()) {
    String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName();

    // The cut position is taken from the original name, never from an upper-cased copy:
    // upper-casing can change the length of a string, which would shift the index.
    final int pos = fName.length() - suffixLength;
    final boolean hasKnownSuffix =
        pos >= 0
            && (fName.regionMatches(true, pos, ".bam", 0, suffixLength)
                || fName.regionMatches(true, pos, ".sam", 0, suffixLength));
    if (!hasKnownSuffix) {
      throw new UserException.BadInput(
          "Input file name " + fName + " does not end with .sam or .bam");
    }

    String outName = fName.substring(0, pos) + ext;

    if (writerMap.containsKey(rid)) {
      throw new StingException(
          "nWayOut mode: Reader id for input sam file " + fName + " is already registered");
    }
    addWriter(rid, outName, order, presorted, indexOnTheFly, generateMD5, pRecord);
  }
}
@Requires({ "toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0", "ploidy>0" }) public UnifiedGenotyperEngine( GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples, int ploidy) { this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; genomeLocParser = toolkit.getGenomeLocParser(); this.samples = new TreeSet<String>(samples); // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than // minBQ this.UAC = UAC; this.logger = logger; this.verboseWriter = verboseWriter; this.annotationEngine = engine; this.ploidy = ploidy; this.N = samples.size() * ploidy; log10AlleleFrequencyPriorsSNPs = new double[N + 1]; log10AlleleFrequencyPriorsIndels = new double[N + 1]; computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); filter.add(LOW_QUAL_FILTER_NAME); }
/**
 * Creates one underlying SAM writer per input SAM reader registered with the GATK engine (the
 * readers are taken from <code>toolkit</code>). Output names come from <code>in2out</code>, which
 * must hold an entry keyed by the file name of every input.
 *
 * @param toolkit the engine whose reads data source supplies the reader IDs and input files
 * @param in2out map from input file name to output file name; must not be null
 * @param order passed through unchanged to <code>addWriter</code>
 * @param presorted passed through unchanged to <code>addWriter</code>
 * @param indexOnTheFly passed through unchanged to <code>addWriter</code>
 * @param generateMD5 passed through unchanged to <code>addWriter</code>
 * @param pRecord passed through unchanged to <code>addWriter</code>
 * @throws StingException if <code>in2out</code> is null or a reader ID already has a writer
 * @throws UserException.BadInput if <code>in2out</code> has no entry for an input file name
 */
public void setupByReader(
    GenomeAnalysisEngine toolkit,
    Map<String, String> in2out,
    SAMFileHeader.SortOrder order,
    boolean presorted,
    boolean indexOnTheFly,
    boolean generateMD5,
    SAMProgramRecord pRecord) {
  if (in2out == null) {
    throw new StingException("input-output bam filename map for n-way-out writing is NULL");
  }

  for (final SAMReaderID readerId : toolkit.getReadsDataSource().getReaderIDs()) {
    final String inputName = toolkit.getReadsDataSource().getSAMFile(readerId).getName();

    final boolean mapped = in2out.containsKey(inputName);
    if (!mapped) {
      throw new UserException.BadInput(
          "Input-output bam filename map does not contain an entry for the input file "
              + inputName);
    }
    final String outputName = in2out.get(inputName);

    final boolean alreadyRegistered = writerMap.containsKey(readerId);
    if (alreadyRegistered) {
      throw new StingException(
          "nWayOut mode: Reader id for input sam file "
              + inputName
              + " is already registered; "
              + "map file likely contains multiple entries for this input file");
    }

    addWriter(readerId, outputName, order, presorted, indexOnTheFly, generateMD5, pRecord);
  }
}
/**
 * Routes a read to the writer of the reader it came from.
 *
 * <p>The source reader is looked up first, while the record still carries its current "RG"
 * attribute. If that attribute is present it is then replaced by the original read group id
 * reported by the reads data source, and the record is handed to the two-argument overload.
 *
 * @param samRecord the read to write; its "RG" attribute may be overwritten
 */
public void addAlignment(SAMRecord samRecord) {
  // Must happen before the RG attribute is rewritten below.
  final SAMReaderID sourceReader = toolkit.getReaderIDForRead(samRecord);

  final String currentReadGroup = samRecord.getStringAttribute("RG");
  if (currentReadGroup != null) {
    final String originalReadGroup =
        toolkit.getReadsDataSource().getOriginalReadGroupId(currentReadGroup);
    samRecord.setAttribute("RG", originalReadGroup);
  }

  addAlignment(samRecord, sourceReader);
}
/**
 * Create random read bases
 *
 * <p>Each position is drawn independently and uniformly from {A, C, G, T}, or from
 * {A, C, G, T, N} when <code>allowNs</code> is true, using the engine's shared random generator.
 *
 * @param length the length of the read
 * @param allowNs whether or not to allow N's in the read
 * @return an array of <code>length</code> randomized bases, each letter equally likely
 */
public static byte[] createRandomReadBases(int length, boolean allowNs) {
  // 'N' is last so that excluding it is just a matter of shrinking the draw range by one.
  final byte[] alphabet = {'A', 'C', 'G', 'T', 'N'};
  final int numberOfBases = allowNs ? alphabet.length : alphabet.length - 1;

  Random random = GenomeAnalysisEngine.getRandomGenerator();
  byte[] bases = new byte[length];
  for (int i = 0; i < length; i++) {
    // nextInt(bound) is always in [0, bound), so the index is always valid.
    bases[i] = alphabet[random.nextInt(numberOfBases)];
  }
  return bases;
}
/**
 * Builds the VCF header line that records this GATK run.
 *
 * <p>The line carries, in this order: the walker name ("ID"), the GATK version ("Version"), the
 * current time as text ("Date") and as epoch milliseconds ("Epoch"), and the approximate command
 * line ("CommandLineOptions").
 *
 * @param engine the GATK engine that supplies the walker name and builds the command line string
 * @param argumentSources the argument sources handed to the engine to build the command line
 *     string; may be empty but not null
 * @return VCF header line describing this run of the GATK
 * @throws IllegalArgumentException if <code>engine</code> or <code>argumentSources</code> is null
 */
public static VCFHeaderLine getCommandLineArgumentHeaderLine(
    final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
  if (engine == null) {
    throw new IllegalArgumentException("engine cannot be null");
  }
  if (argumentSources == null) {
    throw new IllegalArgumentException("argumentSources cannot be null");
  }

  // Insertion-ordered so the attributes come out in the order they are put.
  final Map<String, String> fields = new LinkedHashMap<>();
  fields.put("ID", engine.getWalkerName());
  fields.put("Version", CommandLineGATK.getVersionNumber());

  // A single timestamp backs both the readable and the numeric form.
  final Date now = new Date();
  final String readableDate = now.toString();
  final String epochMillis = Long.toString(now.getTime());
  fields.put("Date", readableDate);
  fields.put("Epoch", epochMillis);

  final Object[] sources = argumentSources.toArray();
  fields.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(sources));

  return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, fields);
}
// --------------------------------------------------------------------------------------------------------- // // Public interface functions // // --------------------------------------------------------------------------------------------------------- @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { this( toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY); }
/**
 * Offers a variant to the fixed-size random selection held in <code>variantArray</code>.
 *
 * <p>While fewer than <code>numRandom</code> variants have been added, the variant is simply
 * stored in the next free slot. Afterwards one random number is drawn, and with probability
 * <code>1 / (rank - numRandom + 1)</code> the variant overwrites the slot at
 * <code>positionToAdd</code>, which then advances circularly.
 *
 * @param rank the running number of this variant among those offered so far
 * @param vc the variant to offer
 * @param refBase unused by this method
 */
private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) {
  final boolean stillFilling = nVariantsAdded < numRandom;
  if (stillFilling) {
    variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
    return;
  }

  // Array is full: decide whether this variant replaces an existing entry.
  final double draw = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
  final double replaceProbability = (1.0 / (rank - numRandom + 1));
  if (draw < replaceProbability) {
    variantArray[positionToAdd].set(vc);
    nVariantsAdded++;
    positionToAdd = nextCircularPosition(positionToAdd);
  }
}
/** * Creates a program record (@PG) tag * * @param toolkit the engine * @param walker the walker object (so we can extract the command line) * @param PROGRAM_RECORD_NAME the name for the PG tag * @return a program record for the tool */ public static SAMProgramRecord createProgramRecord( GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); try { final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); programRecord.setProgramVersion(version); } catch (MissingResourceException e) { // couldn't care less if the resource is missing... } programRecord.setCommandLine( toolkit.createApproximateCommandLineArgumentString(toolkit, walker)); return programRecord; }
public static Map<String, VCFHeader> getVCFHeadersFromRodPrefix( GenomeAnalysisEngine toolkit, String prefix) { Map<String, VCFHeader> data = new HashMap<String, VCFHeader>(); // iterate to get all of the sample names List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources(); for (ReferenceOrderedDataSource source : dataSources) { // ignore the rod if lacks the prefix if (!source.getName().startsWith(prefix)) continue; if (source.getHeader() != null && source.getHeader() instanceof VCFHeader) data.put(source.getName(), (VCFHeader) source.getHeader()); } return data; }
public static Map<String, VCFHeader> getVCFHeadersFromRods( GenomeAnalysisEngine toolkit, Collection<String> rodNames) { Map<String, VCFHeader> data = new HashMap<String, VCFHeader>(); // iterate to get all of the sample names List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources(); for (ReferenceOrderedDataSource source : dataSources) { // ignore the rod if it's not in our list if (rodNames != null && !rodNames.contains(source.getName())) continue; if (source.getHeader() != null && source.getHeader() instanceof VCFHeader) data.put(source.getName(), (VCFHeader) source.getHeader()); } return data; }
/**
 * Round-trip check for <code>ReadUtils.getBasesReverseComplement</code>: for 1000 random reads of
 * random length below 1000, complementing and reversing the returned string must give back the
 * read's bases.
 */
@Test(enabled = true)
public void testGetBasesReverseComplement() {
  final int rounds = 1000;
  final Random rng = GenomeAnalysisEngine.getRandomGenerator();

  for (int round = 0; round < rounds; round++) {
    final int readLength = rng.nextInt(1000);
    final GATKSAMRecord read = GATKSAMRecord.createRandomRead(readLength);
    final byte[] expected = read.getReadBases();
    final byte[] roundTripped = new byte[readLength];

    final String revComp = ReadUtils.getBasesReverseComplement(read);

    // Walk the string forwards while filling the array backwards, complementing as we go.
    for (int src = 0, dst = readLength - 1; src < readLength; src++, dst--) {
      roundTripped[dst] = BaseUtils.getComplement((byte) revComp.charAt(src));
    }

    Assert.assertEquals(roundTripped, expected);
  }
}
/**
 * Creates a SAM/BAM writer for one input reader and registers it in <code>writerMap</code>.
 *
 * @param id the reader the writer belongs to; also the key in <code>writerMap</code>
 * @param outName path of the output file
 * @param order not used by this method
 * @param presorted forwarded to the writer factory when the writer is created
 * @param indexOnTheFly whether the factory should create an index
 * @param generateMD5 whether the factory should create an MD5 file
 * @param programRecord program record handed to <code>Utils.setupWriter</code> together with the
 *     reader's header
 */
private void addWriter(
    SAMReaderID id,
    String outName,
    SAMFileHeader.SortOrder order,
    boolean presorted,
    boolean indexOnTheFly,
    boolean generateMD5,
    SAMProgramRecord programRecord) {
  final File target = new File(outName);

  // The output header is derived from the header of this specific reader.
  final SAMFileHeader readerHeader = toolkit.getSAMFileHeader(id);
  final SAMFileHeader outputHeader =
      Utils.setupWriter(toolkit, readerHeader, KEEP_ALL_PG_RECORDS, programRecord);

  final SAMFileWriterFactory writerFactory = new SAMFileWriterFactory();
  writerFactory.setCreateIndex(indexOnTheFly);
  writerFactory.setCreateMd5File(generateMD5);

  final SAMFileWriter writer = writerFactory.makeSAMOrBAMWriter(outputHeader, presorted, target);
  writerMap.put(id, writer);
}
private void testPerSampleEqualsFlat( final String bam1, final String bam2, final String persampleFile, final Double downsampling, final String md5) { final String command = baseCommand3 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; WalkerTestSpec spec = new WalkerTestSpec(command + " -contaminationFile " + persampleFile, 1, Arrays.asList(md5)); final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result executeTest( "test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); spec = new WalkerTestSpec( command + "-contamination " + downsampling.toString(), 1, Arrays.asList(md5)); rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result executeTest( "test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); }
/** * Gets the header fields from all VCF rods input by the user * * @param toolkit GATK engine * @param rodNames names of rods to use, or null if we should use all possible ones * @return a set of all fields */ public static Set<VCFHeaderLine> getHeaderFields( GenomeAnalysisEngine toolkit, Collection<String> rodNames) { // keep a map of sample name to occurrences encountered TreeSet<VCFHeaderLine> fields = new TreeSet<VCFHeaderLine>(); // iterate to get all of the sample names List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources(); for (ReferenceOrderedDataSource source : dataSources) { // ignore the rod if it's not in our list if (rodNames != null && !rodNames.contains(source.getName())) continue; if (source.getRecordType().equals(VariantContext.class)) { VCFHeader header = (VCFHeader) source.getHeader(); if (header != null) fields.addAll(header.getMetaDataInSortedOrder()); } } return fields; }
/**
 * Create random read qualities
 *
 * <p>Each quality is drawn independently from the engine's shared random generator.
 *
 * @param length the length of the read
 * @return an array of <code>length</code> randomized base qualities, each in the range 0 to 49
 *     inclusive
 */
public static byte[] createRandomReadQuals(int length) {
  final Random rng = GenomeAnalysisEngine.getRandomGenerator();
  final byte[] quals = new byte[length];

  int next = 0;
  while (next < length) {
    quals[next] = (byte) rng.nextInt(50);
    next++;
  }
  return quals;
}
/**
 * Returns the SAM file header reported by the toolkit's no-argument
 * <code>getSAMFileHeader()</code>.
 *
 * <p>NOTE(review): this is the toolkit-wide header rather than a per-reader one — presumably the
 * merged header of all inputs; confirm against the engine.
 *
 * @return the header obtained from the toolkit
 */
public SAMFileHeader getFileHeader() {
  return toolkit.getSAMFileHeader();
}
/**
 * Add / replace the contig header lines in the VCFHeader with the information in the GATK engine
 *
 * <p>Delegates to <code>VCFUtils.withUpdatedContigs</code>, passing the engine's reference file
 * argument and its master sequence dictionary.
 *
 * @param header the header to update
 * @param engine the GATK engine containing command line arguments and the master sequence
 *     dictionary
 * @return the header returned by <code>VCFUtils.withUpdatedContigs</code>
 */
public static VCFHeader withUpdatedContigs(
    final VCFHeader header, final GenomeAnalysisEngine engine) {
  return VCFUtils.withUpdatedContigs(
      header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary());
}
/** * Subset VC record if necessary and emit the modified record (provided it satisfies criteria for * printing) * * @param tracker the ROD tracker * @param ref reference information * @param context alignment info * @return 1 if the record was printed to the output file, 0 if otherwise */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (tracker == null) return 0; Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation()); if (vcs == null || vcs.size() == 0) { return 0; } for (VariantContext vc : vcs) { if (MENDELIAN_VIOLATIONS) { boolean foundMV = false; for (MendelianViolation mv : mvSet) { if (mv.isViolation(vc)) { foundMV = true; // System.out.println(vc.toString()); if (outMVFile != null) outMVFileStream.format( "MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + "childG=%s childGL=%s\n", vc.getChr(), vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)), mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(), vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), vc.getGenotype(mv.getSampleChild()).toBriefString(), vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString()); } } if (!foundMV) break; } if (DISCORDANCE_ONLY) { Collection<VariantContext> compVCs = tracker.getValues(discordanceTrack, context.getLocation()); if (!isDiscordant(vc, compVCs)) return 0; } if (CONCORDANCE_ONLY) { Collection<VariantContext> compVCs = tracker.getValues(concordanceTrack, context.getLocation()); if (!isConcordant(vc, compVCs)) return 0; } if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) continue; if 
(alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic()) continue; if (!selectedTypes.contains(vc.getType())) continue; VariantContext sub = subsetRecord(vc, samples); if ((sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED)) { for (VariantContextUtils.JexlVCMatchExp jexl : jexls) { if (!VariantContextUtils.match(sub, jexl)) { return 0; } } if (SELECT_RANDOM_NUMBER) { randomlyAddVariant(++variantNumber, sub, ref.getBase()); } else if (!SELECT_RANDOM_FRACTION || (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); } } } return 1; }
public void writeBeagleOutput( VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC); StringBuffer beagleOut = new StringBuffer(); String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart()); beagleOut.append(marker); if (markers != null) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t"); for (Allele allele : preferredVC.getAlleles()) { String bglPrintString; if (allele.isNoCall() || allele.isNull()) bglPrintString = "-"; else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele beagleOut.append(String.format("%s ", bglPrintString)); if (markers != null) markers.append(bglPrintString).append("\t"); } if (markers != null) markers.append("\n"); GenotypesContext preferredGenotypes = preferredVC.getGenotypes(); GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; for (String sample : samples) { boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; // use sample as key into genotypes structure if (preferredGenotypes.containsSample(sample)) { genotype = preferredGenotypes.get(sample); isValidation = isValidationSite; } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) { genotype = otherGenotypes.get(sample); isValidation = !isValidationSite; } else { // there is magically no genotype for this sample. throw new StingException( "Sample " + sample + " arose with no genotype in variant or validation VCF. 
This should never happen."); } /* * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key */ double[] log10Likelihoods = null; if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) { log10Likelihoods = genotype.getLikelihoods().getAsVector(); // see if we need to randomly mask out genotype in this position. if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) { // we are masking out this genotype log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } if (isMaleOnChrX) { log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case } } /** otherwise, use the prior uniformly */ else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) { // hack to deal with input VCFs with no genotype likelihoods. Just assume the called // genotype // is confident. This is useful for Hapmap and 1KG release VCFs. double AA = (1.0 - prior) / 2.0; double AB = (1.0 - prior) / 2.0; double BB = (1.0 - prior) / 2.0; if (genotype.isHomRef()) { AA = prior; } else if (genotype.isHet()) { AB = prior; } else if (genotype.isHomVar()) { BB = prior; } log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB}); } else { log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods); } beagleWriter.println(beagleOut.toString()); }