Example #1
0
  /**
   * Computes the mean representative read count of a read.
   *
   * <p>A read that is not a reduced read counts as exactly one read. For a reduced read the
   * per-position reduced counts are averaged and the mean is rounded with {@link Math#round}.
   *
   * @param read the read to inspect
   * @return 1 for a non-reduced read, otherwise the rounded mean of its reduced read counts
   */
  public static int getMeanRepresentativeReadCount(GATKSAMRecord read) {
    if (read.isReducedRead()) {
      final byte[] reducedCounts = read.getReducedReadCounts();
      final double meanCount = (double) MathUtils.sum(reducedCounts) / reducedCounts.length;
      return (int) Math.round(meanCount);
    }
    return 1;
  }
Example #2
0
  /**
   * Scores the reads of a pileup using the per-read indel likelihoods published by
   * {@code IndelGenotypeLikelihoodsCalculationModel}.
   *
   * <p>For every pileup element present in the likelihood map, each allele likelihood is negated
   * and the smallest negated value is kept as that read's score. The result is the sum of those
   * per-read minima. Reads absent from the map do not contribute.
   *
   * @param pileup the pileup whose elements are looked up in the indel likelihood map
   * @return the summed score, or {@code null} when no indel likelihood map is available
   */
  private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) {
    final HashMap<PileupElement, LinkedHashMap<Allele, Double>> likelihoodsByRead =
        IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
    if (likelihoodsByRead == null) {
      return null;
    }

    // Collect one array of negated likelihoods per read; filtered reads are scored as well.
    final ArrayList<double[]> perReadScores = new ArrayList<double[]>();
    for (final PileupElement element : pileup) {
      if (!likelihoodsByRead.containsKey(element)) {
        continue;
      }

      final LinkedHashMap<Allele, Double> alleleLikelihoods = likelihoodsByRead.get(element);
      final double[] negated = new double[alleleLikelihoods.size()];
      int slot = 0;
      for (final Double likelihood : alleleLikelihoods.values()) {
        negated[slot] = -likelihood;
        if (DEBUG) {
          System.out.printf("  vs. haplotype %d = %f%n", slot, negated[slot]);
        }
        slot++;
      }
      perReadScores.add(negated);
    }

    // Indel likelihoods are strict log-probabilities, not phred-scaled values.
    double total = 0.0;
    for (final double[] scores : perReadScores) {
      total += MathUtils.arrayMin(scores);
    }
    return total;
  }
Example #3
0
  /**
   * Scores every read of the pileup against each candidate haplotype and sums, over all reads,
   * the smallest score that read obtained.
   *
   * @param haplotypes the candidate haplotypes; the first two are printed when DEBUG is set
   * @param pileup the reads to score (every element is scored, including filtered ones)
   * @param contextSize passed through to {@code scoreReadAgainstHaplotype}
   * @param locus passed through to {@code scoreReadAgainstHaplotype}
   * @return the sum of each read's minimum per-haplotype score
   */
  private double scoreReadsAgainstHaplotypes(
      final List<Haplotype> haplotypes,
      final ReadBackedPileup pileup,
      final int contextSize,
      final int locus) {
    if (DEBUG) {
      System.out.printf("HAP1: %s%n", haplotypes.get(0));
      System.out.printf("HAP2: %s%n", haplotypes.get(1));
    }

    // First pass: one score per (read, haplotype) pair.
    final ArrayList<double[]> perReadScores = new ArrayList<double[]>();
    for (final PileupElement element : pileup) {
      final double[] scores = new double[haplotypes.size()];
      int index = 0;
      for (final Haplotype candidate : haplotypes) {
        scores[index] = scoreReadAgainstHaplotype(element, contextSize, candidate, locus);
        if (DEBUG) {
          System.out.printf("  vs. haplotype %d = %f%n", index, scores[index]);
        }
        index++;
      }
      perReadScores.add(scores);
    }

    // Second pass: each read contributes its best (lowest) score.
    double total = 0.0;
    for (final double[] scores : perReadScores) {
      total += MathUtils.arrayMin(scores);
    }
    return total;
  }
Example #4
0
  /**
   * Accumulates the per-position coverage contributed by a list of reads over a region.
   *
   * <p>Each read's coverage comes from {@code getCoverageDistributionOfRead}; the per-read arrays
   * are combined with {@code MathUtils.addArrays}.
   *
   * @param list the reads covering the region
   * @param startLocation the first reference coordinate of the region (inclusive)
   * @param stopLocation the last reference coordinate of the region (inclusive)
   * @return an array holding the coverage of each position from startLocation to stopLocation
   */
  public static int[] getCoverageDistributionOfReads(
      List<GATKSAMRecord> list, int startLocation, int stopLocation) {
    final int regionLength = stopLocation - startLocation + 1;
    int[] coverage = new int[regionLength];

    for (GATKSAMRecord current : list) {
      coverage =
          MathUtils.addArrays(
              coverage, getCoverageDistributionOfRead(current, startLocation, stopLocation));
    }

    return coverage;
  }
Example #5
0
  /**
   * Appends one sample's genotype likelihoods to the output buffer.
   *
   * <p>When a VQSR calibrator is configured, the likelihoods are first passed through its
   * {@code includeErrorRateInLikelihoods}. The log10 values are then normalized with
   * {@code MathUtils.normalizeFromLog10} and each resulting value is appended using the shared
   * formatter.
   *
   * @param out the buffer the formatted values are appended to
   * @param vc the variant context handed to the calibrator
   * @param log10Likelihoods the sample's log10 genotype likelihoods
   */
  private void writeSampleLikelihoods(
      StringBuffer out, VariantContext vc, double[] log10Likelihoods) {
    final double[] adjusted =
        (VQSRCalibrator == null)
            ? log10Likelihoods
            : VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);

    final double[] probabilities = MathUtils.normalizeFromLog10(adjusted);
    for (int i = 0; i < probabilities.length; i++) {
      out.append(formatter.format(probabilities[i]));
    }
  }
Example #6
0
  /**
   * Computes the inbreeding coefficient of a site from normalized genotype likelihoods.
   *
   * <p>Only called diploid genotypes that carry likelihoods are used. Their normalized
   * likelihoods for the AA, AB and BB genotypes are summed into expected genotype counts, from
   * which the coefficient {@code F = 1 - hets / (2 * p * q * N)} is derived.
   *
   * @param vc the variant context to evaluate
   * @return a single-entry map from the first key name to F formatted with {@code "%.4f"}, or
   *     {@code null} when fewer than MIN_SAMPLES genotypes are available or usable
   */
  private Map<String, Object> calculateIC(final VariantContext vc) {
    final boolean restrictToFounders = founderIds != null && !founderIds.isEmpty();
    final GenotypesContext genotypes =
        restrictToFounders ? vc.getGenotypes(founderIds) : vc.getGenotypes();
    if (genotypes == null || genotypes.size() < MIN_SAMPLES) {
      return null;
    }

    // Positions of the AA, AB and BB likelihoods inside each genotype's likelihood vector.
    final int idxAA;
    final int idxAB;
    final int idxBB;
    if (vc.isBiallelic()) {
      idxAA = 0;
      idxAB = 1;
      idxBB = 2;
    } else {
      // Non-biallelic site: test against the alternate allele with the highest allele count.
      final int[] glIndices =
          vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount());
      idxAA = glIndices[0];
      idxAB = glIndices[1];
      idxBB = glIndices[2];
    }

    double refCount = 0.0;
    double hetCount = 0.0;
    double homCount = 0.0;
    int usableSamples = 0; // samples that are called, diploid and have likelihoods
    for (final Genotype g : genotypes) {
      final boolean usable = !g.isNoCall() && g.hasLikelihoods() && g.getPloidy() == 2;
      if (!usable) {
        continue;
      }

      usableSamples++;
      final double[] probs = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector());
      refCount += probs[idxAA];
      hetCount += probs[idxAB];
      homCount += probs[idxBB];
    }

    if (usableSamples < MIN_SAMPLES) {
      return null;
    }

    // expected reference allele frequency
    final double p = (2.0 * refCount + hetCount) / (2.0 * (refCount + hetCount + homCount));
    // expected alternative allele frequency
    final double q = 1.0 - p;
    // inbreeding coefficient
    final double F = 1.0 - (hetCount / (2.0 * p * q * (double) usableSamples));

    final Map<String, Object> result = new HashMap<String, Object>();
    result.put(getKeyNames().get(0), String.format("%.4f", F));
    return result;
  }
  /**
   * Compares two 2-D arrays of doubles element by element.
   *
   * <p>The arrays are considered equal when they have the same number of rows, every pair of
   * corresponding rows has the same length, and no pair of corresponding elements differs
   * according to {@code MathUtils.compareDoubles}. A differing pair is tolerated when either of
   * the two values is infinite.
   *
   * @param b1 the first array
   * @param b2 the second array
   * @return {@code true} if the arrays match under the rules above, {@code false} otherwise
   */
  private boolean compareDoubleArrays(double[][] b1, double[][] b2) {
    if (b1.length != b2.length) {
      return false; // sanity check
    }

    for (int i = 0; i < b1.length; i++) {
      if (b1[i].length != b2[i].length) {
        return false; // sanity check
      }
      // Walk the columns of row i. Bounding j by the row count (as was done before) skipped
      // trailing columns of wide matrices and ran off the end of rows in tall ones.
      for (int j = 0; j < b1[i].length; j++) {
        if (MathUtils.compareDoubles(b1[i][j], b2[i][j]) != 0
            && !Double.isInfinite(b1[i][j])
            && !Double.isInfinite(b2[i][j])) {
          return false;
        }
      }
    }
    return true;
  }
Example #8
0
  /**
   * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep
   * or all sites (if this is empty)
   *
   * <p>The first two lines of the input are skipped. Every following line is split on tabs into
   * four fields: location, variable, key and value. Lines accumulate into the current record until
   * a line whose variable is "type" is read; at that point the accumulated record is turned into an
   * ExactCall (if its start passes the startsToKeep filter) and a new record begins.
   *
   * <p>NOTE(review): when the end of the input is reached the method returns immediately, so the
   * lines accumulated since the last "type" line do not appear to be emitted as a call -- confirm
   * whether dropping the final record is intended.
   *
   * @param reader a just-opened reader sitting at the start of the file
   * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should
   *     be kept
   * @param parser a genome loc parser to create genome locs
   * @return a list of ExactCall objects in reader
   * @throws IOException if reading a line from reader fails
   * @throws IllegalArgumentException if any argument is null
   */
  public static List<ExactCall> readExactLog(
      final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser)
      throws IOException {
    if (reader == null) throw new IllegalArgumentException("reader cannot be null");
    if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null");
    if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null");

    List<ExactCall> calls = new LinkedList<ExactCall>();

    // skip the header line
    reader.readLine();

    // skip the first "type" line
    reader.readLine();

    // Outer loop: one iteration per record; all per-record accumulators are created fresh here.
    while (true) {
      final VariantContextBuilder builder = new VariantContextBuilder();
      final List<Allele> alleles = new ArrayList<Allele>();
      final List<Genotype> genotypes = new ArrayList<Genotype>();
      // posteriors[0] is filled from log10PosteriorOfAFEq0, posteriors[1] from
      // log10PosteriorOfAFGt0
      final double[] posteriors = new double[2];
      final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true);
      final List<Integer> mle = new ArrayList<Integer>();
      final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>();
      // stays at -1 unless a runtime.nano line is seen for this record
      long runtimeNano = -1;

      // location of the first line seen for this record
      GenomeLoc currentLoc = null;
      // Inner loop: one iteration per input line belonging to the current record.
      while (true) {
        final String line = reader.readLine();
        // end of input: return whatever calls have been completed so far
        if (line == null) return calls;

        final String[] parts = line.split("\t");
        final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]);
        final String variable = parts[1];
        final String key = parts[2];
        final String value = parts[3];

        if (currentLoc == null) currentLoc = lineLoc;

        if (variable.equals("type")) {
          // A "type" line closes the current record: build the call if its start is wanted.
          if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) {
            builder.alleles(alleles);
            // the stop coordinate is derived from the length of the first allele read
            final int stop = currentLoc.getStart() + alleles.get(0).length() - 1;
            builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop);
            builder.genotypes(genotypes);
            final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {}));
            final AFCalcResult result =
                new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele);
            calls.add(new ExactCall(builder.make(), runtimeNano, result));
          }
          // leave the inner loop so the outer loop starts a fresh record
          break;
        } else if (variable.equals("allele")) {
          // the allele whose key is "0" is created as the reference allele
          final boolean isRef = key.equals("0");
          alleles.add(Allele.create(value, isRef));
        } else if (variable.equals("PL")) {
          // key is used as the genotype builder's constructor argument, value is the PL field
          final GenotypeBuilder gb = new GenotypeBuilder(key);
          gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs());
          genotypes.add(gb.make());
        } else if (variable.equals("log10PosteriorOfAFEq0")) {
          posteriors[0] = Double.valueOf(value);
        } else if (variable.equals("log10PosteriorOfAFGt0")) {
          posteriors[1] = Double.valueOf(value);
        } else if (variable.equals("MLE")) {
          mle.add(Integer.valueOf(value));
        } else if (variable.equals("pNonRefByAllele")) {
          final Allele a = Allele.create(key);
          log10pNonRefByAllele.put(a, Double.valueOf(value));
        } else if (variable.equals("runtime.nano")) {
          runtimeNano = Long.valueOf(value);
        } else {
          // nothing to do: lines with any other variable name are ignored
        }
      }
    }
  }
Example #9
0
/**
 * Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
 *
 * <h2>Input</h2>
 *
 * <p>A VCF with variants to convert to Beagle format
 *
 * <h2>Outputs</h2>
 *
 * <p>A single text file which can be fed to Beagle
 *
 * <p>Optional: A file with a list of markers
 *
 * <h2>Examples</h2>
 *
 * <pre>
 *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
 *      -R reffile.fasta -T ProduceBeagleInput \
 *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
 * </pre>
 */
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {

  @ArgumentCollection
  protected StandardVariantContextInputArgumentCollection variantCollection =
      new StandardVariantContextInputArgumentCollection();

  @Hidden
  @Input(
      fullName = "validation",
      shortName = "validation",
      doc = "Validation VCF file",
      required = false)
  public RodBinding<VariantContext> validation;

  @Output(doc = "File to which BEAGLE input should be written", required = true)
  protected PrintStream beagleWriter = null;

  @Hidden
  @Output(
      doc = "File to which BEAGLE markers should be written",
      shortName = "markers",
      fullName = "markers",
      required = false)
  protected PrintStream markers = null;

  /** Running index written next to each marker in the markers file; starts at 1. */
  int markerCounter = 1;

  @Hidden
  @Input(doc = "VQSqual calibration file", shortName = "cc", required = false)
  protected File VQSRCalibrationFile = null;

  /** Calibration curve read from VQSRCalibrationFile in initialize(); null when no file is given. */
  protected VQSRCalibrationCurve VQSRCalibrator = null;

  @Hidden
  @Argument(doc = "VQSqual key", shortName = "vqskey", required = false)
  protected String VQSLOD_KEY = "VQSqual";

  @Hidden
  @Argument(
      fullName = "inserted_nocall_rate",
      shortName = "nc_rate",
      doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing",
      required = false)
  public double insertedNoCallRate = 0;

  @Hidden
  @Argument(
      fullName = "validation_genotype_ptrue",
      shortName = "valp",
      doc = "Flat probability to assign to validation genotypes. Will override GL field.",
      required = false)
  public double validationPrior = -1.0;

  @Hidden
  @Argument(
      fullName = "validation_bootstrap",
      shortName = "bs",
      doc = "Proportion of records to be used in bootstrap set",
      required = false)
  public double bootstrap = 0.0;

  @Hidden
  @Argument(
      fullName = "bootstrap_vcf",
      shortName = "bvcf",
      doc = "Output a VCF with the records used for bootstrapping filtered out",
      required = false)
  VCFWriter bootstrapVCFOutput = null;

  /**
   * If sample gender is known, this flag should be set to true to ensure that Beagle treats male
   * Chr X properly.
   */
  @Argument(
      fullName = "checkIsMaleOnChrX",
      shortName = "checkIsMaleOnChrX",
      doc =
          "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.",
      required = false)
  public boolean CHECK_IS_MALE_ON_CHR_X = false;

  @Hidden
  @Argument(
      fullName = "variant_genotype_ptrue",
      shortName = "varp",
      doc =
          "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.",
      required = false)
  public double variantPrior = 0.96;

  /** Sample names taken from the variant VCF header; populated in initialize(). */
  private Set<String> samples = null;

  /** Filter set applied to validation records that are placed in the bootstrap set. */
  private final Set<String> BOOTSTRAP_FILTER = new HashSet<String>(Arrays.asList("bootstrap"));

  /** Number of validation records assigned to the bootstrap set so far. */
  private int bootstrapSetSize = 0;

  /** Number of good validation records left out of the bootstrap set so far. */
  private int testSetSize = 0;

  /** Caches the formatted text of likelihood values, which repeat frequently. */
  private final CachingFormatter formatter = new CachingFormatter("%5.4f ", 100000);

  /** Incremented every time goodSite(VariantContext) rejects a certain false positive. */
  private int certainFPs = 0;

  /**
   * Collects the sample names, writes the Beagle header line, and sets up the optional bootstrap
   * VCF writer and VQSR calibration curve.
   */
  public void initialize() {

    samples =
        SampleUtils.getSampleListWithVCFHeader(
            getToolkit(), Arrays.asList(variantCollection.variants.getName()));

    // Header: marker name, the two alleles, then three columns per sample.
    beagleWriter.print("marker alleleA alleleB");
    for (String sample : samples)
      beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));

    beagleWriter.println();

    if (bootstrapVCFOutput != null) {
      initializeVcfWriter();
    }

    if (VQSRCalibrationFile != null) {
      VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
      logger.info("Read calibration curve");
      VQSRCalibrator.printInfo(logger);
    }
  }

  /**
   * Writes one Beagle line for the current locus when a usable variant or validation record
   * exists there.
   *
   * @param tracker the meta-data tracker for the locus; may be null
   * @param ref the reference context, forwarded to useValidation
   * @param context the alignment context supplying the locus location
   * @return 1 if a line was written for this locus, 0 otherwise
   */
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker == null) {
      return 0;
    }

    GenomeLoc loc = context.getLocation();
    VariantContext variant_eval = tracker.getFirstValue(variantCollection.variants, loc);
    VariantContext validation_eval = tracker.getFirstValue(validation, loc);

    if (!goodSite(variant_eval, validation_eval)) {
      return 0;
    }

    if (useValidation(validation_eval, ref)) {
      writeBeagleOutput(validation_eval, variant_eval, true, validationPrior);
      return 1;
    }

    if (goodSite(variant_eval)) {
      writeBeagleOutput(variant_eval, validation_eval, false, variantPrior);
      return 1;
    }

    // todo -- if the variant site is bad, validation is good, but not in bootstrap
    // set -- what do?
    return 0;
  }

  /**
   * Tells whether at least one of two records is a good site.
   *
   * @param a the first record; may be null
   * @param b the second record, only examined when the first is not good; may be null
   * @return true if either record passes goodSite(VariantContext)
   */
  public boolean goodSite(VariantContext a, VariantContext b) {
    return goodSite(a) || goodSite(b);
  }

  /**
   * Tells whether a record can be written to Beagle and is not a certain false positive according
   * to the calibration curve (when one is loaded).
   *
   * <p>NOTE(review): certainFPs is incremented on every rejecting call, and this method is called
   * several times per site from map, useValidation and writeBeagleOutput, so the counter may
   * exceed the number of distinct sites -- confirm whether that is intended.
   *
   * @param v the record to test; may be null
   * @return true if the record is usable
   */
  public boolean goodSite(VariantContext v) {
    if (!canBeOutputToBeagle(v)) {
      return false;
    }
    if (VQSRCalibrator != null && VQSRCalibrator.certainFalsePositive(VQSLOD_KEY, v)) {
      certainFPs++;
      return false;
    }
    return true;
  }

  /**
   * Tells whether a record has the shape required for Beagle output.
   *
   * @param v the record to test; may be null
   * @return true if v is non-null, unfiltered, biallelic and has genotypes
   */
  public static boolean canBeOutputToBeagle(VariantContext v) {
    return v != null && !v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
  }

  /**
   * Decides whether a validation record goes into the bootstrap set and, when a bootstrap VCF
   * writer is configured, writes the record to it.
   *
   * <p>A good record is added to the bootstrap set when doing so keeps the bootstrap fraction at
   * or below the configured proportion; it is then written with the bootstrap filter applied.
   * Every other non-null record is written unchanged.
   *
   * @param validation the validation record; may be null
   * @param ref the reference context (not used by this method)
   * @return true if the record was placed in the bootstrap set
   */
  public boolean useValidation(VariantContext validation, ReferenceContext ref) {
    if (goodSite(validation)) {
      // if using record keeps us below expected proportion, use it
      logger.debug(
          String.format(
              "boot: %d, test: %d, total: %d",
              bootstrapSetSize, testSetSize, bootstrapSetSize + testSetSize + 1));
      if ((bootstrapSetSize + 1.0) / (1.0 + bootstrapSetSize + testSetSize) <= bootstrap) {
        if (bootstrapVCFOutput != null) {
          bootstrapVCFOutput.add(
              new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make());
        }
        bootstrapSetSize++;
        return true;
      } else {
        if (bootstrapVCFOutput != null) {
          bootstrapVCFOutput.add(validation);
        }
        testSetSize++;
        return false;
      }
    } else {
      if (validation != null && bootstrapVCFOutput != null) {
        bootstrapVCFOutput.add(validation);
      }
      return false;
    }
  }

  /** Flat likelihoods used for males on chrX: no weight on the heterozygous genotype. */
  private static final double[] HAPLOID_FLAT_LOG10_LIKELIHOODS =
      MathUtils.toLog10(new double[] {0.5, 0.0, 0.5});

  /** Flat likelihoods used when nothing is known about a diploid genotype. */
  private static final double[] DIPLOID_FLAT_LOG10_LIKELIHOODS =
      MathUtils.toLog10(new double[] {0.33, 0.33, 0.33});

  /**
   * Writes one line of Beagle input (and, if configured, one line of the markers file) for a site.
   *
   * <p>Each sample's genotype is taken from preferredVC when present there, otherwise from otherVC
   * if that record is a good site. The likelihoods written for a sample are, in order of
   * preference: the genotype's own likelihoods (possibly replaced by flat values at the configured
   * no-call insertion rate), likelihoods derived from the called genotype and {@code prior}, or
   * flat likelihoods.
   *
   * @param preferredVC the record that supplies the location, the alleles and, where possible,
   *     the genotypes
   * @param otherVC fallback record for samples missing from preferredVC; may be null
   * @param isValidationSite true if preferredVC is the validation record
   * @param prior probability assigned to the called genotype when likelihoods are derived from the
   *     call; a negative value makes validation genotypes use their own likelihoods
   * @throws StingException if a sample has no genotype in either record
   */
  public void writeBeagleOutput(
      VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc =
        VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();

    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null)
      markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
    for (Allele allele : preferredVC.getAlleles()) {
      String bglPrintString;
      if (allele.isNoCall() || allele.isNull()) bglPrintString = "-";
      else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele

      beagleOut.append(String.format("%s ", bglPrintString));
      if (markers != null) markers.append(bglPrintString).append("\t");
    }
    if (markers != null) markers.append("\n");

    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
      boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

      Genotype genotype;
      boolean isValidation;
      // use sample as key into genotypes structure
      if (preferredGenotypes.containsSample(sample)) {
        genotype = preferredGenotypes.get(sample);
        isValidation = isValidationSite;
      } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
        genotype = otherGenotypes.get(sample);
        isValidation = !isValidationSite;
      } else {
        // there is magically no genotype for this sample.
        throw new StingException(
            "Sample "
                + sample
                + " arose with no genotype in variant or validation VCF. This should never happen.");
      }

      /*
       * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
       */
      double[] log10Likelihoods = null;
      if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
        log10Likelihoods = genotype.getLikelihoods().getAsVector();

        // see if we need to randomly mask out genotype in this position.
        if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
          // we are masking out this genotype
          log10Likelihoods =
              isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }

        if (isMaleOnChrX) {
          // Edit a private copy. At this point the vector is either the shared
          // HAPLOID_FLAT_LOG10_LIKELIHOODS constant or the array returned by the genotype's
          // likelihoods (possibly its internal storage); writing into either in place would
          // leak the change into later samples and sites.
          log10Likelihoods = log10Likelihoods.clone();
          log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
        }
      }
      // otherwise, use the prior uniformly
      else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
        // hack to deal with input VCFs with no genotype likelihoods.  Just assume the called
        // genotype is confident.  This is useful for Hapmap and 1KG release VCFs.
        double AA = (1.0 - prior) / 2.0;
        double AB = (1.0 - prior) / 2.0;
        double BB = (1.0 - prior) / 2.0;

        if (genotype.isHomRef()) {
          AA = prior;
        } else if (genotype.isHet()) {
          AB = prior;
        } else if (genotype.isHomVar()) {
          BB = prior;
        }

        log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB});
      } else {
        log10Likelihoods =
            isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
      }

      writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }

    beagleWriter.println(beagleOut.toString());
  }

  /**
   * Appends one sample's genotype likelihoods to the output buffer.
   *
   * <p>When a calibration curve is loaded the likelihoods are first passed through its
   * {@code includeErrorRateInLikelihoods}; they are then normalized out of log10 space and each
   * value is appended using the cached formatter.
   *
   * @param out the buffer the formatted values are appended to
   * @param vc the record handed to the calibrator
   * @param log10Likelihoods the sample's log10 genotype likelihoods
   */
  private void writeSampleLikelihoods(
      StringBuffer out, VariantContext vc, double[] log10Likelihoods) {
    if (VQSRCalibrator != null) {
      log10Likelihoods =
          VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
    }

    double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
    for (double likeVal : normalizedLikelihoods) {
      out.append(formatter.format(likeVal));
    }
  }

  /**
   * Supplies the initial value of the site count.
   *
   * @return 0
   */
  public Integer reduceInit() {
    return 0; // Nothing to do here
  }

  /**
   * Adds the result of one map call to the running site count.
   *
   * @param value the result of a map call
   * @param sum the running total
   * @return the updated total
   */
  public Integer reduce(Integer value, Integer sum) {
    return value + sum; // count up the sites
  }

  /**
   * Logs how many sites were written and how many certain false positives were rejected.
   *
   * @param includedSites the total returned by the reduce step
   */
  public void onTraversalDone(Integer includedSites) {
    logger.info("Sites included in beagle likelihoods file             : " + includedSites);
    logger.info(
        String.format(
            "Certain false positive found from recalibration curve : %d (%.2f%%)",
            certainFPs, (100.0 * certainFPs) / (Math.max(certainFPs + includedSites, 1))));
  }

  /**
   * Writes the header of the bootstrap VCF: the header fields of the validation input plus the
   * "bootstrap" filter line.
   */
  private void initializeVcfWriter() {
    final List<String> inputNames = Arrays.asList(validation.getName());

    // setup the header fields
    Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
    hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
    hInfo.add(
        new VCFFilterHeaderLine(
            "bootstrap",
            "This site used for genotype bootstrapping with ProduceBeagleInputWalker"));

    bootstrapVCFOutput.writeHeader(
        new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
  }

  /**
   * Formats doubles with a fixed format string, remembering recent results in an LRU cache so
   * that repeated values are not formatted again.
   */
  public static class CachingFormatter {
    private final int maxCacheSize;
    private final String format;
    private final LRUCache<Double, String> cache;

    /**
     * Returns the format string used by this formatter.
     *
     * @return the format string passed to the constructor
     */
    public String getFormat() {
      return format;
    }

    /**
     * Formats a value, reusing the cached text when the value was formatted recently.
     *
     * @param value the value to format
     * @return the formatted text
     */
    public String format(double value) {
      String f = cache.get(value);
      if (f == null) {
        f = String.format(format, value);
        cache.put(value, f);
      }

      return f;
    }

    /**
     * Creates a formatter.
     *
     * @param format the {@link String#format} pattern applied to each value
     * @param maxCacheSize the maximum number of formatted values kept in the cache
     */
    public CachingFormatter(String format, int maxCacheSize) {
      this.maxCacheSize = maxCacheSize;
      this.format = format;
      this.cache = new LRUCache<Double, String>(maxCacheSize);
    }
  }

  /**
   * An LRU cache, based on <code>LinkedHashMap</code>.
   *
   * <p>This cache has a fixed maximum number of elements (<code>cacheSize</code>). If the cache is
   * full and another entry is added, the LRU (least recently used) entry is dropped.
   *
   * <p>This class is thread-safe. All methods of this class are synchronized.
   *
   * <p>Author: Christian d'Heureuse, Inventec Informatik AG, Zurich, Switzerland<br>
   * Multi-licensed: EPL / LGPL / GPL / AL / BSD.
   */
  public static class LRUCache<K, V> {

    private static final float hashTableLoadFactor = 0.75f;

    private final LinkedHashMap<K, V> map;
    private final int cacheSize;

    /**
     * Creates a new LRU cache.
     *
     * @param cacheSize the maximum number of entries that will be kept in this cache.
     */
    public LRUCache(int cacheSize) {
      this.cacheSize = cacheSize;
      int hashTableCapacity = (int) Math.ceil(cacheSize / hashTableLoadFactor) + 1;
      // The third argument selects access order, so iteration runs from least to most recently
      // used and the eldest entry is the LRU one.
      map =
          new LinkedHashMap<K, V>(hashTableCapacity, hashTableLoadFactor, true) {
            // (an anonymous inner class)
            private static final long serialVersionUID = 1;

            @Override
            protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
              return size() > LRUCache.this.cacheSize;
            }
          };
    }

    /**
     * Retrieves an entry from the cache.<br>
     * The retrieved entry becomes the MRU (most recently used) entry.
     *
     * @param key the key whose associated value is to be returned.
     * @return the value associated to this key, or null if no value with this key exists in the
     *     cache.
     */
    public synchronized V get(K key) {
      return map.get(key);
    }

    /**
     * Adds an entry to this cache. The new entry becomes the MRU (most recently used) entry. If an
     * entry with the specified key already exists in the cache, it is replaced by the new entry. If
     * the cache is full, the LRU (least recently used) entry is removed from the cache.
     *
     * @param key the key with which the specified value is to be associated.
     * @param value a value to be associated with the specified key.
     */
    public synchronized void put(K key, V value) {
      map.put(key, value);
    }

    /** Clears the cache. */
    public synchronized void clear() {
      map.clear();
    }

    /**
     * Returns the number of used entries in the cache.
     *
     * @return the number of entries currently in the cache.
     */
    public synchronized int usedEntries() {
      return map.size();
    }

    /**
     * Returns a <code>Collection</code> that contains a copy of all cache entries.
     *
     * @return a <code>Collection</code> with a copy of the cache content.
     */
    public synchronized Collection<Map.Entry<K, V>> getAll() {
      return new ArrayList<Map.Entry<K, V>>(map.entrySet());
    }
  } // end class LRUCache
}
Example #10
0
  /**
   * Writes one line of Beagle input (and, if configured, one line of the markers file) for a site.
   *
   * <p>Each sample's genotype is taken from preferredVC when present there, otherwise from otherVC
   * if that record is a good site. The likelihoods written for a sample are, in order of
   * preference: the genotype's own likelihoods (possibly replaced by flat values at the configured
   * no-call insertion rate), likelihoods derived from the called genotype and {@code prior}, or
   * flat likelihoods.
   *
   * @param preferredVC the record that supplies the location, the alleles and, where possible,
   *     the genotypes
   * @param otherVC fallback record for samples missing from preferredVC; may be null
   * @param isValidationSite true if preferredVC is the validation record
   * @param prior probability assigned to the called genotype when likelihoods are derived from the
   *     call; a negative value makes validation genotypes use their own likelihoods
   * @throws StingException if a sample has no genotype in either record
   */
  public void writeBeagleOutput(
      VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc =
        VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();

    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null)
      markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
    for (Allele allele : preferredVC.getAlleles()) {
      String bglPrintString;
      if (allele.isNoCall() || allele.isNull()) bglPrintString = "-";
      else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele

      beagleOut.append(String.format("%s ", bglPrintString));
      if (markers != null) markers.append(bglPrintString).append("\t");
    }
    if (markers != null) markers.append("\n");

    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
      boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

      Genotype genotype;
      boolean isValidation;
      // use sample as key into genotypes structure
      if (preferredGenotypes.containsSample(sample)) {
        genotype = preferredGenotypes.get(sample);
        isValidation = isValidationSite;
      } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
        genotype = otherGenotypes.get(sample);
        isValidation = !isValidationSite;
      } else {
        // there is magically no genotype for this sample.
        throw new StingException(
            "Sample "
                + sample
                + " arose with no genotype in variant or validation VCF. This should never happen.");
      }

      /*
       * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
       */
      double[] log10Likelihoods = null;
      if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
        log10Likelihoods = genotype.getLikelihoods().getAsVector();

        // see if we need to randomly mask out genotype in this position.
        if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
          // we are masking out this genotype
          log10Likelihoods =
              isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }

        if (isMaleOnChrX) {
          // Edit a private copy. At this point the vector is either the shared
          // HAPLOID_FLAT_LOG10_LIKELIHOODS constant or the array returned by the genotype's
          // likelihoods (possibly its internal storage); writing into either in place would
          // leak the change into later samples and sites.
          log10Likelihoods = log10Likelihoods.clone();
          log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
        }
      }
      // otherwise, use the prior uniformly
      else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
        // hack to deal with input VCFs with no genotype likelihoods.  Just assume the called
        // genotype is confident.  This is useful for Hapmap and 1KG release VCFs.
        double AA = (1.0 - prior) / 2.0;
        double AB = (1.0 - prior) / 2.0;
        double BB = (1.0 - prior) / 2.0;

        if (genotype.isHomRef()) {
          AA = prior;
        } else if (genotype.isHet()) {
          AB = prior;
        } else if (genotype.isHomVar()) {
          BB = prior;
        }

        log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB});
      } else {
        log10Likelihoods =
            isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
      }

      writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }

    beagleWriter.println(beagleOut.toString());
  }
Example #11
0
  /**
   * Parses PED-style records from a reader into samples and registers them in the sample DB.
   *
   * <p>Lines starting with the comment marker and blank lines are skipped. Every other line is
   * split on whitespace and must contain exactly the number of fields implied by
   * {@code missingFields}. The phenotype column is treated as quantitative as soon as one record
   * holds a value outside the categorical trait values. After all records are parsed, parents
   * referenced by a record are passed to {@code maybeAddImplicitSample}, and any sample it returns
   * is appended to the result.
   *
   * <p>Line numbers reported in error messages are 1-based positions in the input, counting
   * comment and blank lines.
   *
   * @param reader the source of the PED lines
   * @param missingFields the columns that are absent from the input
   * @param sampleDB the database every created sample is added to
   * @return the parsed samples followed by any implicitly added parents
   * @throws UserException.MalformedFile if a record has the wrong number of fields
   * @throws ReviewedStingException if a categorical phenotype value is not recognized
   */
  public final List<Sample> parse(
      Reader reader, EnumSet<MissingPedField> missingFields, SampleDB sampleDB) {
    final List<String> lines = new XReadLines(reader).readLines();

    // What are the record offsets? A position of -1 marks a column that is not present.
    final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0;
    final int samplePos = familyPos + 1;
    final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1;
    final int maternalPos =
        missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1;
    final int sexPos =
        missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1;
    final int phenotypePos =
        missingFields.contains(MissingPedField.NO_PHENOTYPE)
            ? -1
            : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1;
    final int nExpectedFields =
        MathUtils.arrayMaxInt(
                Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos))
            + 1;

    // go through once and determine properties
    boolean isQT = false;
    final List<String[]> splits = new ArrayList<String[]>(lines.size());
    // physical line number of each entry in splits, kept so that errors found in the second
    // pass can point at the right input line
    final List<Integer> splitLineNos = new ArrayList<Integer>(lines.size());
    int lineNo = 0;
    for (final String line : lines) {
      // count every physical line, including the comment and blank lines skipped below
      lineNo++;
      if (line.startsWith(commentMarker)) continue;
      if (line.trim().equals("")) continue;

      final String[] parts = line.split("\\s+");

      if (parts.length != nExpectedFields)
        throw new UserException.MalformedFile(
            reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields");

      if (phenotypePos != -1) {
        isQT = isQT || !CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]);
      }

      splits.add(parts);
      splitLineNos.add(lineNo);
    }
    logger.info("Phenotype is other? " + isQT);

    // now go through and parse each record
    final List<Sample> samples = new ArrayList<Sample>(splits.size());
    for (int record = 0; record < splits.size(); record++) {
      final String[] parts = splits.get(record);

      String familyID = null, individualID, paternalID = null, maternalID = null;
      Gender sex = Gender.UNKNOWN;
      String quantitativePhenotype = Sample.UNSET_QT;
      Affection affection = Affection.UNKNOWN;

      if (familyPos != -1) familyID = maybeMissing(parts[familyPos]);
      individualID = parts[samplePos];
      if (paternalPos != -1) paternalID = maybeMissing(parts[paternalPos]);
      if (maternalPos != -1) maternalID = maybeMissing(parts[maternalPos]);

      if (sexPos != -1) {
        if (parts[sexPos].equals(SEX_MALE)) sex = Gender.MALE;
        else if (parts[sexPos].equals(SEX_FEMALE)) sex = Gender.FEMALE;
        else sex = Gender.UNKNOWN;
      }

      if (phenotypePos != -1) {
        if (isQT) {
          // quantitative trait: keep the raw value unless it is the missing marker
          if (parts[phenotypePos].equals(MISSING_VALUE1)) affection = Affection.UNKNOWN;
          else {
            affection = Affection.OTHER;
            quantitativePhenotype = parts[phenotypePos];
          }
        } else {
          if (parts[phenotypePos].equals(MISSING_VALUE1)) affection = Affection.UNKNOWN;
          else if (parts[phenotypePos].equals(MISSING_VALUE2)) affection = Affection.UNKNOWN;
          else if (parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED))
            affection = Affection.UNAFFECTED;
          else if (parts[phenotypePos].equals(PHENOTYPE_AFFECTED)) affection = Affection.AFFECTED;
          else
            throw new ReviewedStingException(
                "Unexpected phenotype type "
                    + parts[phenotypePos]
                    + " at line "
                    + splitLineNos.get(record));
        }
      }

      final Sample s =
          new Sample(
              individualID,
              sampleDB,
              familyID,
              paternalID,
              maternalID,
              sex,
              affection,
              quantitativePhenotype);
      samples.add(s);
      sampleDB.addSample(s);
    }

    // Iterate over a snapshot because implicit parents are appended to samples inside the loop.
    for (final Sample sample : new ArrayList<Sample>(samples)) {
      Sample dad =
          maybeAddImplicitSample(
              sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE);
      if (dad != null) samples.add(dad);

      Sample mom =
          maybeAddImplicitSample(
              sampleDB, sample.getMaternalID(), sample.getFamilyID(), Gender.FEMALE);
      if (mom != null) samples.add(mom);
    }

    return samples;
  }
 /**
  * Returns {@code MathUtils.binomialProbability(0, depth, 0.5)}, served from the precomputed
  * cache when depth is below the cache length and computed on demand otherwise.
  *
  * @param depth the depth to look up; used directly as an index into the cache
  * @return the cached or freshly computed value for this depth
  */
 private final double getRefBinomialProb(final int depth) {
   return depth < binomialProbabilityDepthCache.length
       ? binomialProbabilityDepthCache[depth]
       : MathUtils.binomialProbability(0, depth, 0.5);
 }
 // Fill the depth cache with MathUtils.binomialProbability(0, depth, 0.5) for every index from 1
 // up to the end of the cache.
 // NOTE(review): index 0 is never assigned here, so it keeps whatever value the array was
 // created with -- confirm that this is the intended result for depth 0.
 static {
   int depth = 1;
   while (depth < binomialProbabilityDepthCache.length) {
     binomialProbabilityDepthCache[depth] = MathUtils.binomialProbability(0, depth, 0.5);
     depth++;
   }
 }