Example #1
0
  /**
   * Writes the header of the bootstrap VCF output.
   *
   * <p>The header lines are those reported for the validation track plus a "bootstrap" FILTER
   * line; the sample list is taken from the same validation track.
   */
  private void initializeVcfWriter() {
    final List<String> validationTrack = Arrays.asList(validation.getName());

    // Start from the validation track's own header lines, then add the bootstrap filter.
    final Set<VCFHeaderLine> headerLines =
        new HashSet<VCFHeaderLine>(VCFUtils.getHeaderFields(getToolkit(), validationTrack));
    final VCFFilterHeaderLine bootstrapFilter =
        new VCFFilterHeaderLine(
            "bootstrap",
            "This site used for genotype bootstrapping with ProduceBeagleInputWalker");
    headerLines.add(bootstrapFilter);

    final VCFHeader header =
        new VCFHeader(
            headerLines, SampleUtils.getUniqueSamplesFromRods(getToolkit(), validationTrack));
    bootstrapVCFOutput.writeHeader(header);
  }
Example #2
0
  /**
   * Prepares the walker: resolves the sample list from the variants track, writes the Beagle
   * header line, and sets up the optional bootstrap VCF writer and calibration curve.
   */
  public void initialize() {
    samples =
        SampleUtils.getSampleListWithVCFHeader(
            getToolkit(), Arrays.asList(variantCollection.variants.getName()));

    // Header: the three fixed columns, then each sample name repeated three times.
    final StringBuilder headerLine = new StringBuilder("marker alleleA alleleB");
    for (final String sampleName : samples) {
      headerLine
          .append(' ')
          .append(sampleName)
          .append(' ')
          .append(sampleName)
          .append(' ')
          .append(sampleName);
    }
    beagleWriter.println(headerLine);

    if (bootstrapVCFOutput != null) {
      initializeVcfWriter();
    }

    if (VQSRCalibrationFile == null) {
      return;
    }
    VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
    logger.info("Read calibration curve");
    VQSRCalibrator.printInfo(logger);
  }
Example #3
0
/**
 * Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
 *
 * <p>
 *
 * <h2>Input</h2>
 *
 * <p>A VCF with variants to convert to Beagle format
 *
 * <h2>Outputs</h2>
 *
 * <p>A single text file which can be fed to Beagle
 *
 * <p>Optional: A file with a list of markers
 *
 * <h2>Examples</h2>
 *
 * <pre>
 *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
 *      -R reffile.fasta -T ProduceBeagleInput \
 *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
 * </pre>
 */
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {

  // Standard variant input; its "variants" binding is the primary track read in map().
  @ArgumentCollection
  protected StandardVariantContextInputArgumentCollection variantCollection =
      new StandardVariantContextInputArgumentCollection();

  // Optional second track; records from it may be selected into the bootstrap set.
  @Hidden
  @Input(
      fullName = "validation",
      shortName = "validation",
      doc = "Validation VCF file",
      required = false)
  public RodBinding<VariantContext> validation;

  @Output(doc = "File to which BEAGLE input should be written", required = true)
  protected PrintStream beagleWriter = null;

  @Hidden
  @Output(
      doc = "File to which BEAGLE markers should be written",
      shortName = "markers",
      fullName = "markers",
      required = false)
  protected PrintStream markers = null;

  // Index written next to each marker in the markers file; starts at 1 and is incremented for
  // every site written while a markers stream is present.
  int markerCounter = 1;

  @Hidden
  @Input(doc = "VQSqual calibration file", shortName = "cc", required = false)
  protected File VQSRCalibrationFile = null;

  // Loaded from VQSRCalibrationFile in initialize(); stays null when no file was supplied.
  protected VQSRCalibrationCurve VQSRCalibrator = null;

  @Hidden
  @Argument(doc = "VQSqual key", shortName = "vqskey", required = false)
  protected String VQSLOD_KEY = "VQSqual";

  @Hidden
  @Argument(
      fullName = "inserted_nocall_rate",
      shortName = "nc_rate",
      doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing",
      required = false)
  public double insertedNoCallRate = 0;

  @Hidden
  @Argument(
      fullName = "validation_genotype_ptrue",
      shortName = "valp",
      doc = "Flat probability to assign to validation genotypes. Will override GL field.",
      required = false)
  public double validationPrior = -1.0;

  @Hidden
  @Argument(
      fullName = "validation_bootstrap",
      shortName = "bs",
      doc = "Proportion of records to be used in bootstrap set",
      required = false)
  public double bootstrap = 0.0;

  @Hidden
  @Argument(
      fullName = "bootstrap_vcf",
      shortName = "bvcf",
      doc = "Output a VCF with the records used for bootstrapping filtered out",
      required = false)
  VCFWriter bootstrapVCFOutput = null;

  /**
   * If sample gender is known, this flag should be set to true to ensure that Beagle treats male
   * Chr X properly.
   */
  @Argument(
      fullName = "checkIsMaleOnChrX",
      shortName = "checkIsMaleOnChrX",
      doc =
          "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.",
      required = false)
  public boolean CHECK_IS_MALE_ON_CHR_X = false;

  @Hidden
  @Argument(
      fullName = "variant_genotype_ptrue",
      shortName = "varp",
      doc =
          "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.",
      required = false)
  public double variantPrior = 0.96;

  // Samples that get columns in the Beagle output; populated in initialize().
  private Set<String> samples = null;
  // Filter set stamped onto validation records that are selected for the bootstrap set.
  private Set<String> BOOTSTRAP_FILTER = new HashSet<String>(Arrays.asList("bootstrap"));
  // Number of good validation records placed in the bootstrap set so far.
  private int bootstrapSetSize = 0;
  // Number of good validation records left out of the bootstrap set so far.
  private int testSetSize = 0;
  // Formats each normalized likelihood as "%5.4f " and caches up to 100000 formatted values.
  private CachingFormatter formatter = new CachingFormatter("%5.4f ", 100000);
  // Incremented in goodSite() each time the calibration curve flags a certain false positive.
  private int certainFPs = 0;

  /**
   * Prepares the walker: resolves the sample list from the variants track, writes the Beagle
   * header line, and sets up the optional bootstrap VCF writer and calibration curve.
   */
  public void initialize() {
    samples =
        SampleUtils.getSampleListWithVCFHeader(
            getToolkit(), Arrays.asList(variantCollection.variants.getName()));

    // Header: the three fixed columns, then each sample name repeated three times.
    final StringBuilder headerLine = new StringBuilder("marker alleleA alleleB");
    for (final String sampleName : samples) {
      headerLine
          .append(' ')
          .append(sampleName)
          .append(' ')
          .append(sampleName)
          .append(' ')
          .append(sampleName);
    }
    beagleWriter.println(headerLine);

    if (bootstrapVCFOutput != null) {
      initializeVcfWriter();
    }

    if (VQSRCalibrationFile == null) {
      return;
    }
    VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
    logger.info("Read calibration curve");
    VQSRCalibrator.printInfo(logger);
  }

  /**
   * Processes one locus: picks either the validation or the variant record and writes it to the
   * Beagle output.
   *
   * @param tracker source of the records at this locus; a null tracker yields 0
   * @param ref reference context, passed through to {@link #useValidation}
   * @param context alignment context supplying the current location
   * @return 1 if a Beagle line was written for this locus, 0 otherwise
   */
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker == null) {
      return 0;
    }

    final GenomeLoc here = context.getLocation();
    final VariantContext variantRecord = tracker.getFirstValue(variantCollection.variants, here);
    final VariantContext validationRecord = tracker.getFirstValue(validation, here);

    // Neither record is usable: nothing to emit.
    if (!goodSite(variantRecord, validationRecord)) {
      return 0;
    }

    // A validation record selected for the bootstrap set takes precedence.
    if (useValidation(validationRecord, ref)) {
      writeBeagleOutput(validationRecord, variantRecord, true, validationPrior);
      return 1;
    }

    // todo -- if the variant site is bad, validation is good, but not in bootstrap
    // set -- what do?
    if (!goodSite(variantRecord)) {
      return 0;
    }

    writeBeagleOutput(variantRecord, validationRecord, false, variantPrior);
    return 1;
  }

  /**
   * Tells whether at least one of two records is a good site.
   *
   * @param a first record, checked first
   * @param b second record, only checked when {@code a} is not a good site
   * @return true if either record passes {@link #goodSite(VariantContext)}
   */
  public boolean goodSite(VariantContext a, VariantContext b) {
    if (goodSite(a)) {
      return true;
    }
    return goodSite(b);
  }

  /**
   * Tells whether a record can be used: it must be writable to Beagle and, when a calibration
   * curve is loaded, must not be flagged as a certain false positive.
   *
   * <p>Side effect: each record rejected by the calibration curve increments the certain
   * false-positive counter.
   *
   * @param v record to test
   * @return true if the record is usable
   */
  public boolean goodSite(VariantContext v) {
    if (!canBeOutputToBeagle(v)) {
      return false;
    }

    final boolean flaggedByCurve =
        VQSRCalibrator != null && VQSRCalibrator.certainFalsePositive(VQSLOD_KEY, v);
    if (flaggedByCurve) {
      certainFPs++;
      return false;
    }
    return true;
  }

  /**
   * Tells whether a record has the shape required for Beagle output: non-null, unfiltered,
   * biallelic, and carrying genotypes.
   *
   * @param v record to test; null is allowed and yields false
   * @return true if all four conditions hold
   */
  public static boolean canBeOutputToBeagle(VariantContext v) {
    if (v == null) {
      return false;
    }
    if (v.isFiltered()) {
      return false;
    }
    return v.isBiallelic() && v.hasGenotypes();
  }

  /**
   * Decides whether a validation record goes into the bootstrap set, and echoes the record to the
   * bootstrap VCF (when one is configured).
   *
   * <p>A good record joins the bootstrap set only if doing so keeps the bootstrap fraction at or
   * below {@code bootstrap}; it is then written with the bootstrap filter applied. All other
   * non-null records are written unchanged.
   *
   * @param validation validation record at the current locus; may be null
   * @param ref reference context (not used by this method)
   * @return true if the record was added to the bootstrap set
   */
  public boolean useValidation(VariantContext validation, ReferenceContext ref) {
    final boolean haveBootstrapVcf = bootstrapVCFOutput != null;

    if (!goodSite(validation)) {
      if (validation != null && haveBootstrapVcf) {
        bootstrapVCFOutput.add(validation);
      }
      return false;
    }

    logger.debug(
        String.format(
            "boot: %d, test: %d, total: %d",
            bootstrapSetSize, testSetSize, bootstrapSetSize + testSetSize + 1));

    // if using record keeps us below expected proportion, use it
    final double fractionIfUsed =
        (bootstrapSetSize + 1.0) / (1.0 + bootstrapSetSize + testSetSize);
    if (fractionIfUsed <= bootstrap) {
      if (haveBootstrapVcf) {
        bootstrapVCFOutput.add(
            new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make());
      }
      bootstrapSetSize++;
      return true;
    }

    if (haveBootstrapVcf) {
      bootstrapVCFOutput.add(validation);
    }
    testSetSize++;
    return false;
  }

  // Uninformative likelihoods in (AA, AB, BB) order, passed through MathUtils.toLog10
  // (presumably an element-wise log10 -- confirm against MathUtils).
  // Haploid variant: the heterozygous slot gets probability 0; used for male samples on chrX.
  private static final double[] HAPLOID_FLAT_LOG10_LIKELIHOODS =
      MathUtils.toLog10(new double[] {0.5, 0.0, 0.5});
  // Diploid variant: equal weight for all three genotypes.
  private static final double[] DIPLOID_FLAT_LOG10_LIKELIHOODS =
      MathUtils.toLog10(new double[] {0.33, 0.33, 0.33});

  /**
   * Writes one Beagle input line for a site, plus the matching line in the markers file when a
   * markers stream is configured.
   *
   * <p>The line holds the marker name ("contig:start"), the alleles of {@code preferredVC}, and
   * three likelihood values per sample. Each sample's genotype is taken from {@code preferredVC}
   * when present there, otherwise from {@code otherVC} if that record is a good site.
   *
   * @param preferredVC record whose location, alleles and genotypes are used first
   * @param otherVC fallback record for samples missing from {@code preferredVC}; may be null
   * @param isValidationSite true if {@code preferredVC} comes from the validation track
   * @param prior flat probability given to a called genotype that has no likelihoods; a negative
   *     value on a validation genotype forces use of the genotype's own likelihoods
   * @throws StingException if a sample has a genotype in neither record
   */
  public void writeBeagleOutput(
      VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc =
        VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();

    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null) {
      markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
    }
    for (Allele allele : preferredVC.getAlleles()) {
      String bglPrintString;
      if (allele.isNoCall() || allele.isNull()) {
        bglPrintString = "-";
      } else {
        bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
      }

      beagleOut.append(String.format("%s ", bglPrintString));
      if (markers != null) {
        markers.append(bglPrintString).append("\t");
      }
    }
    if (markers != null) {
      markers.append("\n");
    }

    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
      boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

      Genotype genotype;
      boolean isValidation;
      // use sample as key into genotypes structure
      if (preferredGenotypes.containsSample(sample)) {
        genotype = preferredGenotypes.get(sample);
        isValidation = isValidationSite;
      } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
        genotype = otherGenotypes.get(sample);
        isValidation = !isValidationSite;
      } else {
        // there is magically no genotype for this sample.
        throw new StingException(
            "Sample "
                + sample
                + " arose with no genotype in variant or validation VCF. This should never happen.");
      }

      /*
       * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
       */
      double[] log10Likelihoods = null;
      if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
        // NOTE(review): if a validation genotype has no likelihoods and prior < 0, the call
        // below presumably fails on a null likelihoods object -- confirm against Genotype.
        log10Likelihoods = genotype.getLikelihoods().getAsVector();

        // see if we need to randomly mask out genotype in this position.
        if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
          // we are masking out this genotype
          log10Likelihoods =
              isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }

        if (isMaleOnChrX) {
          // Work on a copy: at this point the vector may be the shared static flat array or the
          // array handed out by the genotype, and neither should be edited in place.
          log10Likelihoods = log10Likelihoods.clone();
          log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
        }
      } else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
        // otherwise, use the prior uniformly
        // hack to deal with input VCFs with no genotype likelihoods.  Just assume the called
        // genotype is confident.  This is useful for Hapmap and 1KG release VCFs.
        double AA = (1.0 - prior) / 2.0;
        double AB = (1.0 - prior) / 2.0;
        double BB = (1.0 - prior) / 2.0;

        if (genotype.isHomRef()) {
          AA = prior;
        } else if (genotype.isHet()) {
          AB = prior;
        } else if (genotype.isHomVar()) {
          BB = prior;
        }

        log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB});
      } else {
        // No usable information for this sample: fall back to the flat likelihoods.
        log10Likelihoods =
            isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
      }

      writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }

    beagleWriter.println(beagleOut.toString());
  }

  /**
   * Appends one sample's likelihoods to the output line.
   *
   * <p>When a calibration curve is loaded, the log10 likelihoods are first passed through it. The
   * result is normalized out of log10 space and each value is appended using the shared
   * formatter.
   *
   * @param out buffer holding the Beagle line being built
   * @param vc record the likelihoods belong to, handed to the calibration curve
   * @param log10Likelihoods the sample's log10 likelihoods
   */
  private void writeSampleLikelihoods(
      StringBuffer out, VariantContext vc, double[] log10Likelihoods) {
    final double[] adjusted =
        VQSRCalibrator == null
            ? log10Likelihoods
            : VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);

    final double[] probabilities = MathUtils.normalizeFromLog10(adjusted);
    for (int i = 0; i < probabilities.length; i++) {
      out.append(formatter.format(probabilities[i]));
    }
  }

  /**
   * Supplies the starting value for the reduction.
   *
   * @return 0, the initial count of included sites
   */
  public Integer reduceInit() {
    final Integer initialCount = 0;
    return initialCount;
  }

  /**
   * Adds one map result to the running total of included sites.
   *
   * @param value result of a single {@code map} call
   * @param sum running total so far
   * @return the updated total
   */
  public Integer reduce(Integer value, Integer sum) {
    final int updatedTotal = value + sum;
    return updatedTotal;
  }

  /**
   * Logs the end-of-run summary: the number of sites written, and the number (and percentage) of
   * records rejected as certain false positives.
   *
   * @param includedSites total number of sites written to the Beagle file
   */
  public void onTraversalDone(Integer includedSites) {
    logger.info("Sites included in beagle likelihoods file             : " + includedSites);

    // Guard the denominator so an empty run reports 0% rather than dividing by zero.
    final int sitesSeen = Math.max(certainFPs + includedSites, 1);
    final double percentCertainFPs = (100.0 * certainFPs) / sitesSeen;
    final String summary =
        String.format(
            "Certain false positive found from recalibration curve : %d (%.2f%%)",
            certainFPs, percentCertainFPs);
    logger.info(summary);
  }

  /**
   * Writes the header of the bootstrap VCF output.
   *
   * <p>The header lines are those reported for the validation track plus a "bootstrap" FILTER
   * line; the sample list is taken from the same validation track.
   */
  private void initializeVcfWriter() {
    final List<String> validationTrack = Arrays.asList(validation.getName());

    // Start from the validation track's own header lines, then add the bootstrap filter.
    final Set<VCFHeaderLine> headerLines =
        new HashSet<VCFHeaderLine>(VCFUtils.getHeaderFields(getToolkit(), validationTrack));
    final VCFFilterHeaderLine bootstrapFilter =
        new VCFFilterHeaderLine(
            "bootstrap",
            "This site used for genotype bootstrapping with ProduceBeagleInputWalker");
    headerLines.add(bootstrapFilter);

    final VCFHeader header =
        new VCFHeader(
            headerLines, SampleUtils.getUniqueSamplesFromRods(getToolkit(), validationTrack));
    bootstrapVCFOutput.writeHeader(header);
  }

  /**
   * Formats doubles with a fixed format string, remembering recent results in an LRU cache so
   * that repeated values are not re-formatted.
   *
   * <p>Formatting uses {@code Locale.ROOT} so the output does not depend on the JVM's default
   * locale (for example, the decimal separator is always '.').
   */
  public static class CachingFormatter {
    private final int maxCacheSize;
    private final String format;
    private final LRUCache<Double, String> cache;

    /**
     * Creates a formatter.
     *
     * @param format format string applied to each value, e.g. {@code "%5.4f "}
     * @param maxCacheSize maximum number of formatted values kept in the cache
     */
    public CachingFormatter(String format, int maxCacheSize) {
      this.maxCacheSize = maxCacheSize;
      this.format = format;
      this.cache = new LRUCache<Double, String>(maxCacheSize);
    }

    /**
     * Returns the format string this formatter applies.
     *
     * @return the format string given at construction
     */
    public String getFormat() {
      return format;
    }

    /**
     * Formats a value, reusing the cached string when this value was formatted recently.
     *
     * @param value the number to format
     * @return the formatted text
     */
    public String format(double value) {
      String formatted = cache.get(value);
      if (formatted == null) {
        // Fully qualified to avoid relying on an import this block cannot add.
        formatted = String.format(java.util.Locale.ROOT, format, value);
        cache.put(value, formatted);
      }
      return formatted;
    }
  }

  /**
   * A bounded least-recently-used cache backed by an access-ordered <code>LinkedHashMap</code>.
   *
   * <p>The cache holds at most <code>cacheSize</code> entries. Adding an entry to a full cache
   * evicts the entry that was read or written longest ago.
   *
   * <p>Every public method is synchronized on the cache instance.
   *
   * <p>Author: Christian d'Heureuse, Inventec Informatik AG, Zurich, Switzerland<br>
   * Multi-licensed: EPL / LGPL / GPL / AL / BSD.
   */
  public static class LRUCache<K, V> {

    private static final float LOAD_FACTOR = 0.75f;

    private LinkedHashMap<K, V> map;
    private int cacheSize;

    /**
     * Builds an empty cache.
     *
     * @param cacheSize upper bound on the number of entries retained.
     */
    public LRUCache(int cacheSize) {
      this.cacheSize = cacheSize;
      // Size the table so it never needs to grow while holding cacheSize entries.
      final int initialCapacity = (int) Math.ceil(cacheSize / LOAD_FACTOR) + 1;
      final boolean orderByAccess = true;
      this.map =
          new LinkedHashMap<K, V>(initialCapacity, LOAD_FACTOR, orderByAccess) {
            private static final long serialVersionUID = 1;

            @Override
            protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
              final int limit = LRUCache.this.cacheSize;
              return size() > limit;
            }
          };
    }

    /**
     * Looks up a key. A hit makes that entry the most recently used one.
     *
     * @param key the key to look up.
     * @return the cached value, or null when the key is not present.
     */
    public synchronized V get(K key) {
      final V cached = map.get(key);
      return cached;
    }

    /**
     * Stores a value under a key, replacing any value already stored for it. The entry becomes
     * the most recently used one; if the cache was full, the least recently used entry is dropped.
     *
     * @param key the key to store under.
     * @param value the value to store.
     */
    public synchronized void put(K key, V value) {
      map.put(key, value);
    }

    /** Removes every entry. */
    public synchronized void clear() {
      map.clear();
    }

    /**
     * Reports how many entries the cache currently holds.
     *
     * @return the current entry count.
     */
    public synchronized int usedEntries() {
      final int count = map.size();
      return count;
    }

    /**
     * Returns the current entries in a new <code>Collection</code> that is independent of later
     * cache updates.
     *
     * @return a new <code>Collection</code> holding the cache's entries.
     */
    public synchronized Collection<Map.Entry<K, V>> getAll() {
      final ArrayList<Map.Entry<K, V>> snapshot = new ArrayList<Map.Entry<K, V>>();
      snapshot.addAll(map.entrySet());
      return snapshot;
    }
  } // end class LRUCache
}
Example #4
0
  /**
   * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher.
   *
   * <p>In order: resolves the set of samples to output (requested minus excluded), chooses the
   * variant types to keep, writes the output VCF header, builds the JEXL select expressions, and
   * sets the discordance/concordance, Mendelian-violation and random-selection flags.
   *
   * @throws UserException if every requested sample was also excluded, or if the family structure
   *     file cannot be read or the Mendelian-violation output file cannot be created
   */
  public void initialize() {
    // Get list of samples to include in the output
    List<String> rodNames = Arrays.asList(variantCollection.variants.getName());

    Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
    // TreeSet: the samples found in the input headers are kept in sorted order.
    TreeSet<String> vcfSamples =
        new TreeSet<String>(
            SampleUtils.getSampleList(
                vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));

    Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
    Collection<String> samplesFromExpressions =
        SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);

    // first, add any requested samples
    samples.addAll(samplesFromFile);
    samples.addAll(samplesFromExpressions);
    samples.addAll(sampleNames);

    // if none were requested, we want all of them
    if (samples.isEmpty()) {
      samples.addAll(vcfSamples);
      NO_SAMPLES_SPECIFIED = true;
    }

    // now, exclude any requested samples
    Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
    samples.removeAll(XLsamplesFromFile);
    samples.removeAll(XLsampleNames);

    // An empty result is only an error when the user explicitly asked for samples.
    if (samples.size() == 0 && !NO_SAMPLES_SPECIFIED)
      throw new UserException(
          "All samples requested to be included were also requested to be excluded.");

    for (String sample : samples) logger.info("Including sample '" + sample + "'");

    // if user specified types to include, add these, otherwise, add all possible variant context
    // types to list of vc types to include
    if (TYPES_TO_INCLUDE.isEmpty()) {

      for (VariantContext.Type t : VariantContext.Type.values()) selectedTypes.add(t);

    } else {
      for (VariantContext.Type t : TYPES_TO_INCLUDE) selectedTypes.add(t);
    }
    // Initialize VCF header
    Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
    headerLines.add(new VCFHeaderLine("source", "SelectVariants"));

    // Declare the *_Orig fields only when the original counts are being kept.
    if (KEEP_ORIGINAL_CHR_COUNTS) {
      headerLines.add(
          new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
      headerLines.add(
          new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
      headerLines.add(
          new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
    }
    vcfWriter.writeHeader(new VCFHeader(headerLines, samples));

    for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {
      // It's not necessary that the user supply select names for the JEXL expressions, since those
      // expressions will only be needed for omitting records.  Make up the select names here.
      selectNames.add(String.format("select-%d", i));
    }

    jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS);

    // Look at the parameters to decide which analysis to perform
    DISCORDANCE_ONLY = discordanceTrack.isBound();
    if (DISCORDANCE_ONLY)
      logger.info(
          "Selecting only variants discordant with the track: " + discordanceTrack.getName());

    CONCORDANCE_ONLY = concordanceTrack.isBound();
    if (CONCORDANCE_ONLY)
      logger.info(
          "Selecting only variants concordant with the track: " + concordanceTrack.getName());

    if (MENDELIAN_VIOLATIONS) {
      if (FAMILY_STRUCTURE_FILE != null) {
        try {
          // One MendelianViolation per line of the file; a family is kept only if its child, dad
          // and mom are all among the selected samples.
          for (final String line : new XReadLines(FAMILY_STRUCTURE_FILE)) {
            MendelianViolation mv =
                new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
            if (samples.contains(mv.getSampleChild())
                && samples.contains(mv.getSampleDad())
                && samples.contains(mv.getSampleMom())) mvSet.add(mv);
          }
        } catch (FileNotFoundException e) {
          throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e);
        }
        // NOTE(review): the output stream is only opened on the family-structure-file path;
        // confirm that outMVFile is not expected to work with FAMILY_STRUCTURE alone.
        if (outMVFile != null)
          try {
            outMVFileStream = new PrintStream(outMVFile);
          } catch (FileNotFoundException e) {
            throw new UserException.CouldNotCreateOutputFile(
                outMVFile, "Can't open output file", e);
          }
      } else
        mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
    } else if (!FAMILY_STRUCTURE.isEmpty()) {
      // A family structure given without the flag switches Mendelian-violation selection on.
      mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
      MENDELIAN_VIOLATIONS = true;
    }

    SELECT_RANDOM_NUMBER = numRandom > 0;
    if (SELECT_RANDOM_NUMBER) {
      logger.info("Selecting " + numRandom + " variants at random from the variant track");
      variantArray = new RandomVariantStructure[numRandom];
    }

    SELECT_RANDOM_FRACTION = fractionRandom > 0;
    if (SELECT_RANDOM_FRACTION)
      logger.info(
          "Selecting approximately "
              + 100.0 * fractionRandom
              + "% of the variants at random from the variant track");
  }