public static int getMeanRepresentativeReadCount(GATKSAMRecord read) { if (!read.isReducedRead()) return 1; // compute mean representative read counts final byte[] counts = read.getReducedReadCounts(); return (int) Math.round((double) MathUtils.sum(counts) / counts.length); }
private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { final ArrayList<double[]> haplotypeScores = new ArrayList<double[]>(); final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); if (indelLikelihoodMap == null) return null; for (final PileupElement p : pileup) { if (indelLikelihoodMap.containsKey(p)) { // retrieve likelihood information corresponding to this read LinkedHashMap<Allele, Double> el = indelLikelihoodMap.get(p); // Score all the reads in the pileup, even the filtered ones final double[] scores = new double[el.size()]; int i = 0; for (Allele a : el.keySet()) { scores[i++] = -el.get(a); if (DEBUG) { System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]); } } haplotypeScores.add(scores); } } // indel likelihoods are strict log-probs, not phred scored double overallScore = 0.0; for (final double[] readHaplotypeScores : haplotypeScores) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } return overallScore; }
// calculate the haplotype scores by walking over all reads and comparing them to the haplotypes private double scoreReadsAgainstHaplotypes( final List<Haplotype> haplotypes, final ReadBackedPileup pileup, final int contextSize, final int locus) { if (DEBUG) System.out.printf("HAP1: %s%n", haplotypes.get(0)); if (DEBUG) System.out.printf("HAP2: %s%n", haplotypes.get(1)); final ArrayList<double[]> haplotypeScores = new ArrayList<double[]>(); for (final PileupElement p : pileup) { // Score all the reads in the pileup, even the filtered ones final double[] scores = new double[haplotypes.size()]; for (int i = 0; i < haplotypes.size(); i++) { final Haplotype haplotype = haplotypes.get(i); final double score = scoreReadAgainstHaplotype(p, contextSize, haplotype, locus); scores[i] = score; if (DEBUG) { System.out.printf(" vs. haplotype %d = %f%n", i, score); } } haplotypeScores.add(scores); } double overallScore = 0.0; for (final double[] readHaplotypeScores : haplotypeScores) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } return overallScore; }
/**
 * Computes the combined coverage of a list of reads over a region.
 *
 * <p>The coverage of each read is obtained from {@code getCoverageDistributionOfRead} and the
 * per-read arrays are accumulated with {@code MathUtils.addArrays}.
 *
 * @param list the list of reads covering the region
 * @param startLocation the first reference coordinate of the region (inclusive)
 * @param stopLocation the last reference coordinate of the region (inclusive)
 * @return an array with the coverage of each position from startLocation to stopLocation
 */
public static int[] getCoverageDistributionOfReads(
    List<GATKSAMRecord> list, int startLocation, int stopLocation) {
  final int regionLength = stopLocation - startLocation + 1;
  int[] accumulated = new int[regionLength];

  for (GATKSAMRecord record : list) {
    accumulated =
        MathUtils.addArrays(
            accumulated, getCoverageDistributionOfRead(record, startLocation, stopLocation));
  }

  return accumulated;
}
private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods) { if (VQSRCalibrator != null) { log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods); } double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); // see if we need to randomly mask out genotype in this position. for (double likeVal : normalizedLikelihoods) { out.append(formatter.format(likeVal)); // out.append(String.format("%5.4f ",likeVal)); } }
private Map<String, Object> calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); if (genotypes == null || genotypes.size() < MIN_SAMPLES) return null; int idxAA = 0, idxAB = 1, idxBB = 2; if (!vc.isBiallelic()) { // for non-bliallelic case, do test with most common alt allele. // Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB. int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount()); idxAA = idxVector[0]; idxAB = idxVector[1]; idxBB = idxVector[2]; } double refCount = 0.0; double hetCount = 0.0; double homCount = 0.0; int N = 0; // number of samples that have likelihoods for (final Genotype g : genotypes) { if (g.isNoCall() || !g.hasLikelihoods()) continue; if (g.getPloidy() != 2) // only work for diploid samples continue; N++; final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector()); refCount += normalizedLikelihoods[idxAA]; hetCount += normalizedLikelihoods[idxAB]; homCount += normalizedLikelihoods[idxBB]; } if (N < MIN_SAMPLES) { return null; } final double p = (2.0 * refCount + hetCount) / (2.0 * (refCount + hetCount + homCount)); // expected reference allele frequency final double q = 1.0 - p; // expected alternative allele frequency final double F = 1.0 - (hetCount / (2.0 * p * q * (double) N)); // inbreeding coefficient Map<String, Object> map = new HashMap<String, Object>(); map.put(getKeyNames().get(0), String.format("%.4f", F)); return map; }
// Private function to compare 2d arrays private boolean compareDoubleArrays(double[][] b1, double[][] b2) { if (b1.length != b2.length) { return false; // sanity check } for (int i = 0; i < b1.length; i++) { if (b1[i].length != b2[i].length) { return false; // sanity check } for (int j = 0; j < b1.length; j++) { if (MathUtils.compareDoubles(b1[i][j], b2[i][j]) != 0 && !Double.isInfinite(b1[i][j]) && !Double.isInfinite(b2[i][j])) return false; } } return true; }
/** * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep * or all sites (if this is empty) * * @param reader a just-opened reader sitting at the start of the file * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should * be kept * @param parser a genome loc parser to create genome locs * @return a list of ExactCall objects in reader * @throws IOException */ public static List<ExactCall> readExactLog( final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser) throws IOException { if (reader == null) throw new IllegalArgumentException("reader cannot be null"); if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null"); if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null"); List<ExactCall> calls = new LinkedList<ExactCall>(); // skip the header line reader.readLine(); // skip the first "type" line reader.readLine(); while (true) { final VariantContextBuilder builder = new VariantContextBuilder(); final List<Allele> alleles = new ArrayList<Allele>(); final List<Genotype> genotypes = new ArrayList<Genotype>(); final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true); final List<Integer> mle = new ArrayList<Integer>(); final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(); long runtimeNano = -1; GenomeLoc currentLoc = null; while (true) { final String line = reader.readLine(); if (line == null) return calls; final String[] parts = line.split("\t"); final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); final String variable = parts[1]; final String key = parts[2]; final String value = parts[3]; if (currentLoc == null) currentLoc = lineLoc; if (variable.equals("type")) { if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { builder.alleles(alleles); final int stop = 
currentLoc.getStart() + alleles.get(0).length() - 1; builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {})); final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; } else if (variable.equals("allele")) { final boolean isRef = key.equals("0"); alleles.add(Allele.create(value, isRef)); } else if (variable.equals("PL")) { final GenotypeBuilder gb = new GenotypeBuilder(key); gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); genotypes.add(gb.make()); } else if (variable.equals("log10PosteriorOfAFEq0")) { posteriors[0] = Double.valueOf(value); } else if (variable.equals("log10PosteriorOfAFGt0")) { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); } else if (variable.equals("pNonRefByAllele")) { final Allele a = Allele.create(key); log10pNonRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { // nothing to do } } } }
/**
 * Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
 *
 * <p>
 *
 * <h2>Input</h2>
 *
 * <p>A VCF with variants to convert to Beagle format
 *
 * <h2>Outputs</h2>
 *
 * <p>A single text file which can be fed to Beagle
 *
 * <p>Optional: A file with a list of markers
 *
 * <h2>Examples</h2>
 *
 * <pre>
 * java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
 *   -R reffile.fasta -T ProduceBeagleInput \
 *   -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
 * </pre>
 */
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {

  /** The variants to convert. */
  @ArgumentCollection
  protected StandardVariantContextInputArgumentCollection variantCollection =
      new StandardVariantContextInputArgumentCollection();

  /** Optional validation VCF; its records may be used in place of the variant records. */
  @Hidden
  @Input(
      fullName = "validation",
      shortName = "validation",
      doc = "Validation VCF file",
      required = false)
  public RodBinding<VariantContext> validation;

  /** Destination of the Beagle likelihoods file. */
  @Output(doc = "File to which BEAGLE input should be written", required = true)
  protected PrintStream beagleWriter = null;

  /** Optional destination of the marker list; nothing is written when null. */
  @Hidden
  @Output(
      doc = "File to which BEAGLE markers should be written",
      shortName = "markers",
      fullName = "markers",
      required = false)
  protected PrintStream markers = null;

  /** Running index written next to each marker; incremented once per marker line. */
  int markerCounter = 1;

  @Hidden
  @Input(doc = "VQSqual calibration file", shortName = "cc", required = false)
  protected File VQSRCalibrationFile = null;

  /** Calibration curve read from VQSRCalibrationFile in initialize(); null when no file given. */
  protected VQSRCalibrationCurve VQSRCalibrator = null;

  @Hidden
  @Argument(doc = "VQSqual key", shortName = "vqskey", required = false)
  protected String VQSLOD_KEY = "VQSqual";

  @Hidden
  @Argument(
      fullName = "inserted_nocall_rate",
      shortName = "nc_rate",
      doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing",
      required = false)
  public double insertedNoCallRate = 0;

  @Hidden
  @Argument(
      fullName = "validation_genotype_ptrue",
      shortName = "valp",
      doc = "Flat probability to assign to validation genotypes. Will override GL field.",
      required = false)
  public double validationPrior = -1.0;

  @Hidden
  @Argument(
      fullName = "validation_bootstrap",
      shortName = "bs",
      doc = "Proportion of records to be used in bootstrap set",
      required = false)
  public double bootstrap = 0.0;

  @Hidden
  @Argument(
      fullName = "bootstrap_vcf",
      shortName = "bvcf",
      doc = "Output a VCF with the records used for bootstrapping filtered out",
      required = false)
  VCFWriter bootstrapVCFOutput = null;

  /**
   * If sample gender is known, this flag should be set to true to ensure that Beagle treats male
   * Chr X properly.
   */
  @Argument(
      fullName = "checkIsMaleOnChrX",
      shortName = "checkIsMaleOnChrX",
      doc =
          "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.",
      required = false)
  public boolean CHECK_IS_MALE_ON_CHR_X = false;

  @Hidden
  @Argument(
      fullName = "variant_genotype_ptrue",
      shortName = "varp",
      doc =
          "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.",
      required = false)
  public double variantPrior = 0.96;

  /** Sample names, populated in initialize(); one triple of likelihoods is written per sample. */
  private Set<String> samples = null;

  /** Filter set applied to validation records that are placed in the bootstrap set. */
  private Set<String> BOOTSTRAP_FILTER = new HashSet<String>(Arrays.asList("bootstrap"));

  /** Number of validation records assigned to the bootstrap set so far. */
  private int bootstrapSetSize = 0;

  /** Number of good validation records not assigned to the bootstrap set so far. */
  private int testSetSize = 0;

  /** Formats likelihood values as "%5.4f " and caches the resulting strings. */
  private CachingFormatter formatter = new CachingFormatter("%5.4f ", 100000);

  /** Incremented each time goodSite(VariantContext) rejects a record as a certain false positive. */
  private int certainFPs = 0;

  /**
   * Collects the sample names, writes the Beagle header line (three columns per sample), sets up
   * the bootstrap VCF writer when one was requested, and reads the calibration curve when a
   * calibration file was given.
   */
  public void initialize() {
    samples =
        SampleUtils.getSampleListWithVCFHeader(
            getToolkit(), Arrays.asList(variantCollection.variants.getName()));
    beagleWriter.print("marker alleleA alleleB");
    for (String sample : samples)
      beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
    beagleWriter.println();
    if (bootstrapVCFOutput != null) {
      initializeVcfWriter();
    }
    if (VQSRCalibrationFile != null) {
      VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
      logger.info("Read calibration curve");
      VQSRCalibrator.printInfo(logger);
    }
  }

  /**
   * Processes one locus: writes a Beagle line from either the validation record or the variant
   * record when one of them is a good site.
   *
   * @param tracker the meta-data tracker for the locus; a null tracker yields 0
   * @param ref the reference context, forwarded to useValidation
   * @param context the alignment context supplying the locus location
   * @return 1 if a Beagle line was written for this locus, 0 otherwise
   */
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker != null) {
      GenomeLoc loc = context.getLocation();
      VariantContext variant_eval = tracker.getFirstValue(variantCollection.variants, loc);
      VariantContext validation_eval = tracker.getFirstValue(validation, loc);
      // NOTE(review): goodSite(...) increments certainFPs as a side effect and is invoked several
      // times per locus along this path, so a record may be counted more than once -- confirm.
      if (goodSite(variant_eval, validation_eval)) {
        if (useValidation(validation_eval, ref)) {
          // The validation record is the preferred source; the variant record is the fallback.
          writeBeagleOutput(validation_eval, variant_eval, true, validationPrior);
          return 1;
        } else {
          if (goodSite(variant_eval)) {
            writeBeagleOutput(variant_eval, validation_eval, false, variantPrior);
            return 1;
          } else {
            // todo -- if the variant site is bad, validation is good, but not in bootstrap
            // set -- what do?
            return 0;
          }
        }
      } else {
        return 0;
      }
    } else {
      return 0;
    }
  }

  /**
   * Returns true when at least one of the two records is a good site. The second record is only
   * evaluated when the first one is not good.
   */
  public boolean goodSite(VariantContext a, VariantContext b) {
    return goodSite(a) || goodSite(b);
  }

  /**
   * Returns true when the record can be output to Beagle and, if a calibrator is configured, is
   * not flagged by it as a certain false positive. Increments certainFPs when the calibrator
   * rejects the record.
   */
  public boolean goodSite(VariantContext v) {
    if (canBeOutputToBeagle(v)) {
      if (VQSRCalibrator != null && VQSRCalibrator.certainFalsePositive(VQSLOD_KEY, v)) {
        certainFPs++;
        return false;
      } else {
        return true;
      }
    } else {
      return false;
    }
  }

  /** Returns true for a non-null, unfiltered, biallelic record that has genotypes. */
  public static boolean canBeOutputToBeagle(VariantContext v) {
    return v != null && !v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
  }

  /**
   * Decides whether a validation record goes into the bootstrap set, and echoes the record to the
   * bootstrap VCF output when one is configured.
   *
   * <p>A good validation record is placed in the bootstrap set when doing so keeps the bootstrap
   * fraction at or below {@code bootstrap}; in that case it is written with the bootstrap filter
   * applied. Otherwise it is counted in the test set and written unchanged. A record that is not
   * a good site is written unchanged (when non-null) and never used.
   *
   * @param validation the validation record at this locus, possibly null
   * @param ref the reference context (not read by this method)
   * @return true if the validation record should be used for the Beagle output
   */
  public boolean useValidation(VariantContext validation, ReferenceContext ref) {
    if (goodSite(validation)) {
      // if using record keeps us below expected proportion, use it
      logger.debug(
          String.format(
              "boot: %d, test: %d, total: %d",
              bootstrapSetSize, testSetSize, bootstrapSetSize + testSetSize + 1));
      if ((bootstrapSetSize + 1.0) / (1.0 + bootstrapSetSize + testSetSize) <= bootstrap) {
        if (bootstrapVCFOutput != null) {
          bootstrapVCFOutput.add(
              new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make());
        }
        bootstrapSetSize++;
        return true;
      } else {
        if (bootstrapVCFOutput != null) {
          bootstrapVCFOutput.add(validation);
        }
        testSetSize++;
        return false;
      }
    } else {
      if (validation != null && bootstrapVCFOutput != null) {
        bootstrapVCFOutput.add(validation);
      }
      return false;
    }
  }

  /** Flat log10 likelihoods used for male samples on chrX: no weight on the middle (het) entry. */
  private static final double[] HAPLOID_FLAT_LOG10_LIKELIHOODS =
      MathUtils.toLog10(new double[] {0.5, 0.0, 0.5});

  /** Flat log10 likelihoods used for all other samples: equal weight on all three entries. */
  private static final double[] DIPLOID_FLAT_LOG10_LIKELIHOODS =
      MathUtils.toLog10(new double[] {0.33, 0.33, 0.33});

  /**
   * Writes one Beagle line (marker, alleles, and three likelihoods per sample) for a site, and
   * the matching marker line when a markers stream is configured.
   *
   * @param preferredVC the record supplying the location, the alleles and, where available, the
   *     genotypes
   * @param otherVC fallback record whose genotypes are used for samples missing from preferredVC,
   *     provided it is a good site
   * @param isValidationSite whether preferredVC is the validation record
   * @param prior the flat probability assigned to a called genotype when likelihoods are not used
   */
  public void writeBeagleOutput(
      VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc =
        VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();
    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null)
      markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
    for (Allele allele : preferredVC.getAlleles()) {
      String bglPrintString;
      if (allele.isNoCall() || allele.isNull()) bglPrintString = "-";
      else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
      beagleOut.append(String.format("%s ", bglPrintString));
      if (markers != null) markers.append(bglPrintString).append("\t");
    }
    if (markers != null) markers.append("\n");
    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
      boolean isMaleOnChrX =
          CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;
      Genotype genotype;
      boolean isValidation;
      // use sample as key into genotypes structure
      if (preferredGenotypes.containsSample(sample)) {
        genotype = preferredGenotypes.get(sample);
        isValidation = isValidationSite;
      } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
        // The genotype comes from the other record, so its validation status is the opposite.
        genotype = otherGenotypes.get(sample);
        isValidation = !isValidationSite;
      } else {
        // there is magically no genotype for this sample.
        throw new StingException(
            "Sample "
                + sample
                + " arose with no genotype in variant or validation VCF. This should never happen.");
      }
      /*
       * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
       */
      double[] log10Likelihoods = null;
      if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
        // NOTE(review): this branch can be entered without hasLikelihoods() being true; whether
        // getLikelihoods() is non-null in that case is not visible here -- confirm.
        log10Likelihoods = genotype.getLikelihoods().getAsVector();
        // see if we need to randomly mask out genotype in this position.
        if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
          // we are masking out this genotype
          log10Likelihoods =
              isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }
        if (isMaleOnChrX) {
          // NOTE(review): this writes into whichever array is referenced at this point, which
          // may be the shared HAPLOID_FLAT_LOG10_LIKELIHOODS constant or the array returned by
          // getAsVector() -- confirm that mutating it is intended.
          log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
        }
      }
      // otherwise, use the prior uniformly
      else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
        // hack to deal with input VCFs with no genotype likelihoods. Just assume the called
        // genotype is confident. This is useful for Hapmap and 1KG release VCFs.
        double AA = (1.0 - prior) / 2.0;
        double AB = (1.0 - prior) / 2.0;
        double BB = (1.0 - prior) / 2.0;
        if (genotype.isHomRef()) {
          AA = prior;
        } else if (genotype.isHet()) {
          AB = prior;
        } else if (genotype.isHomVar()) {
          BB = prior;
        }
        log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB});
      } else {
        // No usable genotype information: fall back to the flat likelihoods.
        log10Likelihoods =
            isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
      }
      writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }
    beagleWriter.println(beagleOut.toString());
  }

  /**
   * Appends one sample's likelihoods to the output buffer: applies the calibrator when one is
   * configured, normalizes out of log10 space, and appends each value via the caching formatter.
   */
  private void writeSampleLikelihoods(
      StringBuffer out, VariantContext vc, double[] log10Likelihoods) {
    if (VQSRCalibrator != null) {
      log10Likelihoods =
          VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
    }
    double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
    // Append every normalized value in order, using the formatter's trailing-space format.
    for (double likeVal : normalizedLikelihoods) {
      out.append(formatter.format(likeVal));
      // out.append(String.format("%5.4f ",likeVal));
    }
  }

  /** Initial value of the reduction: no sites counted yet. */
  public Integer reduceInit() {
    return 0; // Nothing to do here
  }

  /** Adds the per-locus result (0 or 1) to the running number of included sites. */
  public Integer reduce(Integer value, Integer sum) {
    return value + sum; // count up the sites
  }

  /**
   * Logs the number of included sites and the number and percentage of certain false positives.
   * The percentage denominator is clamped to at least 1 to avoid dividing by zero.
   */
  public void onTraversalDone(Integer includedSites) {
    logger.info("Sites included in beagle likelihoods file : " + includedSites);
    logger.info(
        String.format(
            "Certain false positive found from recalibration curve : %d (%.2f%%)",
            certainFPs, (100.0 * certainFPs) / (Math.max(certainFPs + includedSites, 1))));
  }

  /**
   * Writes the header of the bootstrap VCF: the header fields of the validation input plus the
   * "bootstrap" filter line, with the samples found in the validation input.
   */
  private void initializeVcfWriter() {
    final List<String> inputNames = Arrays.asList(validation.getName());
    // setup the header fields
    Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
    hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
    hInfo.add(
        new VCFFilterHeaderLine(
            "bootstrap",
            "This site used for genotype bootstrapping with ProduceBeagleInputWalker"));
    bootstrapVCFOutput.writeHeader(
        new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
  }

  /**
   * Formats double values with a fixed format string, remembering previously formatted values
   * in an LRU cache so that repeated values are not formatted again.
   */
  public static class CachingFormatter {
    private int maxCacheSize = 0;
    private String format;
    private LRUCache<Double, String> cache;

    /** Returns the format string used by this formatter. */
    public String getFormat() {
      return format;
    }

    /**
     * Returns the formatted representation of value, taken from the cache when present and
     * otherwise computed with String.format and stored in the cache.
     */
    public String format(double value) {
      String f = cache.get(value);
      if (f == null) {
        f = String.format(format, value);
        cache.put(value, f);
        // (disabled debug output reporting cache size / cache-full / cache-hit used to live here)
      }
      return f;
    }

    /**
     * @param format the format string applied to every value
     * @param maxCacheSize the maximum number of formatted values kept in the cache
     */
    public CachingFormatter(String format, int maxCacheSize) {
      this.maxCacheSize = maxCacheSize;
      this.format = format;
      this.cache = new LRUCache<Double, String>(maxCacheSize);
    }
  }

  /**
   * An LRU cache, based on <code>LinkedHashMap</code>.
   *
   * <p>This cache has a fixed maximum number of elements (<code>cacheSize</code>). If the cache is
   * full and another entry is added, the LRU (least recently used) entry is dropped.
   *
   * <p>This class is thread-safe. All methods of this class are synchronized.
   *
   * <p>Author: Christian d'Heureuse, Inventec Informatik AG, Zurich, Switzerland<br>
   * Multi-licensed: EPL / LGPL / GPL / AL / BSD.
   */
  public static class LRUCache<K, V> {
    private static final float hashTableLoadFactor = 0.75f;
    private LinkedHashMap<K, V> map;
    private int cacheSize;

    /**
     * Creates a new LRU cache.
     *
     * @param cacheSize the maximum number of entries that will be kept in this cache.
     */
    public LRUCache(int cacheSize) {
      this.cacheSize = cacheSize;
      // Size the table so that cacheSize entries fit without exceeding the load factor.
      int hashTableCapacity = (int) Math.ceil(cacheSize / hashTableLoadFactor) + 1;
      // The third constructor argument (true) selects access order rather than insertion order.
      map =
          new LinkedHashMap<K, V>(hashTableCapacity, hashTableLoadFactor, true) {
            // (an anonymous inner class)
            private static final long serialVersionUID = 1;

            @Override
            protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
              return size() > LRUCache.this.cacheSize;
            }
          };
    }

    /**
     * Retrieves an entry from the cache.<br>
     * The retrieved entry becomes the MRU (most recently used) entry.
     *
     * @param key the key whose associated value is to be returned.
     * @return the value associated to this key, or null if no value with this key exists in the
     *     cache.
     */
    public synchronized V get(K key) {
      return map.get(key);
    }

    /**
     * Adds an entry to this cache. The new entry becomes the MRU (most recently used) entry. If an
     * entry with the specified key already exists in the cache, it is replaced by the new entry. If
     * the cache is full, the LRU (least recently used) entry is removed from the cache.
     *
     * @param key the key with which the specified value is to be associated.
     * @param value a value to be associated with the specified key.
     */
    public synchronized void put(K key, V value) {
      map.put(key, value);
    }

    /** Clears the cache. */
    public synchronized void clear() {
      map.clear();
    }

    /**
     * Returns the number of used entries in the cache.
     *
     * @return the number of entries currently in the cache.
     */
    public synchronized int usedEntries() {
      return map.size();
    }

    /**
     * Returns a <code>Collection</code> that contains a copy of all cache entries.
     *
     * @return a <code>Collection</code> with a copy of the cache content.
     */
    public synchronized Collection<Map.Entry<K, V>> getAll() {
      return new ArrayList<Map.Entry<K, V>>(map.entrySet());
    }
  } // end class LRUCache
}
public void writeBeagleOutput( VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC); StringBuffer beagleOut = new StringBuffer(); String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart()); beagleOut.append(marker); if (markers != null) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t"); for (Allele allele : preferredVC.getAlleles()) { String bglPrintString; if (allele.isNoCall() || allele.isNull()) bglPrintString = "-"; else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele beagleOut.append(String.format("%s ", bglPrintString)); if (markers != null) markers.append(bglPrintString).append("\t"); } if (markers != null) markers.append("\n"); GenotypesContext preferredGenotypes = preferredVC.getGenotypes(); GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; for (String sample : samples) { boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; // use sample as key into genotypes structure if (preferredGenotypes.containsSample(sample)) { genotype = preferredGenotypes.get(sample); isValidation = isValidationSite; } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) { genotype = otherGenotypes.get(sample); isValidation = !isValidationSite; } else { // there is magically no genotype for this sample. throw new StingException( "Sample " + sample + " arose with no genotype in variant or validation VCF. 
This should never happen."); } /* * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key */ double[] log10Likelihoods = null; if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) { log10Likelihoods = genotype.getLikelihoods().getAsVector(); // see if we need to randomly mask out genotype in this position. if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) { // we are masking out this genotype log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } if (isMaleOnChrX) { log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case } } /** otherwise, use the prior uniformly */ else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) { // hack to deal with input VCFs with no genotype likelihoods. Just assume the called // genotype // is confident. This is useful for Hapmap and 1KG release VCFs. double AA = (1.0 - prior) / 2.0; double AB = (1.0 - prior) / 2.0; double BB = (1.0 - prior) / 2.0; if (genotype.isHomRef()) { AA = prior; } else if (genotype.isHet()) { AB = prior; } else if (genotype.isHomVar()) { BB = prior; } log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB}); } else { log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods); } beagleWriter.println(beagleOut.toString()); }
public final List<Sample> parse( Reader reader, EnumSet<MissingPedField> missingFields, SampleDB sampleDB) { final List<String> lines = new XReadLines(reader).readLines(); // What are the record offsets? final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0; final int samplePos = familyPos + 1; final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1; final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1; final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; final int nExpectedFields = MathUtils.arrayMaxInt( Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1; // go through once and determine properties int lineNo = 1; boolean isQT = false; final List<String[]> splits = new ArrayList<String[]>(lines.size()); for (final String line : lines) { if (line.startsWith(commentMarker)) continue; if (line.trim().equals("")) continue; final String[] parts = line.split("\\s+"); if (parts.length != nExpectedFields) throw new UserException.MalformedFile( reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields"); if (phenotypePos != -1) { isQT = isQT || !CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]); } splits.add(parts); lineNo++; } logger.info("Phenotype is other? 
" + isQT); // now go through and parse each record lineNo = 1; final List<Sample> samples = new ArrayList<Sample>(splits.size()); for (final String[] parts : splits) { String familyID = null, individualID, paternalID = null, maternalID = null; Gender sex = Gender.UNKNOWN; String quantitativePhenotype = Sample.UNSET_QT; Affection affection = Affection.UNKNOWN; if (familyPos != -1) familyID = maybeMissing(parts[familyPos]); individualID = parts[samplePos]; if (paternalPos != -1) paternalID = maybeMissing(parts[paternalPos]); if (maternalPos != -1) maternalID = maybeMissing(parts[maternalPos]); if (sexPos != -1) { if (parts[sexPos].equals(SEX_MALE)) sex = Gender.MALE; else if (parts[sexPos].equals(SEX_FEMALE)) sex = Gender.FEMALE; else sex = Gender.UNKNOWN; } if (phenotypePos != -1) { if (isQT) { if (parts[phenotypePos].equals(MISSING_VALUE1)) affection = Affection.UNKNOWN; else { affection = Affection.OTHER; quantitativePhenotype = parts[phenotypePos]; } } else { if (parts[phenotypePos].equals(MISSING_VALUE1)) affection = Affection.UNKNOWN; else if (parts[phenotypePos].equals(MISSING_VALUE2)) affection = Affection.UNKNOWN; else if (parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED)) affection = Affection.UNAFFECTED; else if (parts[phenotypePos].equals(PHENOTYPE_AFFECTED)) affection = Affection.AFFECTED; else throw new ReviewedStingException( "Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo); } } final Sample s = new Sample( individualID, sampleDB, familyID, paternalID, maternalID, sex, affection, quantitativePhenotype); samples.add(s); sampleDB.addSample(s); lineNo++; } for (final Sample sample : new ArrayList<Sample>(samples)) { Sample dad = maybeAddImplicitSample( sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE); if (dad != null) samples.add(dad); Sample mom = maybeAddImplicitSample( sampleDB, sample.getMaternalID(), sample.getFamilyID(), Gender.FEMALE); if (mom != null) samples.add(mom); } return samples; }
/**
 * Returns the binomial probability of 0 successes in {@code depth} trials with p = 0.5.
 *
 * <p>Depths below the length of {@code binomialProbabilityDepthCache} are answered from that
 * array; larger depths are computed on demand with {@code MathUtils.binomialProbability}.
 *
 * @param depth the number of trials
 * @return the cached or freshly computed probability
 */
private final double getRefBinomialProb(final int depth) {
  final boolean withinCache = depth < binomialProbabilityDepthCache.length;
  return withinCache
      ? binomialProbabilityDepthCache[depth]
      : MathUtils.binomialProbability(0, depth, 0.5);
}
static { for (int i = 1; i < binomialProbabilityDepthCache.length; i++) { binomialProbabilityDepthCache[i] = MathUtils.binomialProbability(0, i, 0.5); } }