Exemplo n.º 1
0
  private boolean isConcordant(VariantContext vc, Collection<VariantContext> compVCs) {
    if (vc == null || compVCs == null || compVCs.isEmpty()) return false;

    // if we're not looking for specific samples then the fact that we have both VCs is enough to
    // call it concordant.
    if (NO_SAMPLES_SPECIFIED) return true;

    // make a list of all samples contained in this variant VC that are being tracked by the user
    // command line arguments.
    Set<String> variantSamples = vc.getSampleNames();
    variantSamples.retainAll(samples);

    // check if we can find all samples from the variant rod in the comp rod.
    for (String sample : variantSamples) {
      boolean foundSample = false;
      for (VariantContext compVC : compVCs) {
        Genotype varG = vc.getGenotype(sample);
        Genotype compG = compVC.getGenotype(sample);
        if (haveSameGenotypes(varG, compG)) {
          foundSample = true;
          break;
        }
      }
      // if at least one sample doesn't have the same genotype, we don't have concordance
      if (!foundSample) {
        return false;
      }
    }
    return true;
  }
Exemplo n.º 2
0
 public int getNumberOfSamplesForEvaluation() {
   if (sampleNamesForEvaluation != null && !sampleNamesForEvaluation.isEmpty())
     return sampleNamesForEvaluation.size();
   else {
     return numSamplesFromArgument;
   }
 }
Exemplo n.º 3
0
  private void initializeVcfWriter() {
    final List<String> inputNames = Arrays.asList(validation.getName());

    // setup the header fields
    Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
    hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
    hInfo.add(
        new VCFFilterHeaderLine(
            "bootstrap",
            "This site used for genotype bootstrapping with ProduceBeagleInputWalker"));

    bootstrapVCFOutput.writeHeader(
        new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
  }
Exemplo n.º 4
0
  /**
   * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN,
   * AC, AF).
   *
   * @param vc the VariantContext record to subset
   * @param samples the samples to extract
   * @return the subsetted VariantContext
   */
  private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
    if (samples == null || samples.isEmpty()) return vc;

    ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
    for (Map.Entry<String, Genotype> genotypePair : vc.getGenotypes().entrySet()) {
      if (samples.contains(genotypePair.getKey())) genotypes.add(genotypePair.getValue());
    }

    VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles());

    // if we have fewer alternate alleles in the selected VC than in the original VC, we need to
    // strip out the GL/PLs (because they are no longer accurate)
    if (vc.getAlleles().size() != sub.getAlleles().size())
      sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes()));

    HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes());

    int depth = 0;
    for (String sample : sub.getSampleNames()) {
      Genotype g = sub.getGenotype(sample);

      if (g.isNotFiltered() && g.isCalled()) {

        String dp = (String) g.getAttribute("DP");
        if (dp != null
            && !dp.equals(VCFConstants.MISSING_DEPTH_v3)
            && !dp.equals(VCFConstants.MISSING_VALUE_v4)) {
          depth += Integer.valueOf(dp);
        }
      }
    }

    if (KEEP_ORIGINAL_CHR_COUNTS) {
      if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY))
        attributes.put("AC_Orig", attributes.get(VCFConstants.ALLELE_COUNT_KEY));
      if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY))
        attributes.put("AF_Orig", attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY));
      if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY))
        attributes.put("AN_Orig", attributes.get(VCFConstants.ALLELE_NUMBER_KEY));
    }

    VariantContextUtils.calculateChromosomeCounts(sub, attributes, false);
    attributes.put("DP", depth);

    sub = VariantContext.modifyAttributes(sub, attributes);

    return sub;
  }
  /**
   * Compares the covariate report lists.
   *
   * @param diffs map where to annotate the difference.
   * @param other the argument collection to compare against.
   * @param thisRole the name for this argument collection that makes sense to the user.
   * @param otherRole the name for the other argument collection that makes sense to the end user.
   * @return <code>true</code> if a difference was found.
   */
  @Requires("diffs != null && other != null && thisRole != null && otherRole != null")
  private boolean compareRequestedCovariates(
      final Map<String, String> diffs,
      final RecalibrationArgumentCollection other,
      final String thisRole,
      final String otherRole) {

    final Set<String> beforeNames = new HashSet<>(this.COVARIATES.length);
    final Set<String> afterNames = new HashSet<>(other.COVARIATES.length);
    Utils.addAll(beforeNames, this.COVARIATES);
    Utils.addAll(afterNames, other.COVARIATES);
    final Set<String> intersect = new HashSet<>(Math.min(beforeNames.size(), afterNames.size()));
    intersect.addAll(beforeNames);
    intersect.retainAll(afterNames);

    String diffMessage = null;
    if (intersect.size()
        == 0) { // In practice this is not possible due to required covariates but...
      diffMessage =
          String.format(
              "There are no common covariates between '%s' and '%s'"
                  + " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",
              thisRole,
              otherRole,
              thisRole,
              Utils.join(", ", this.COVARIATES),
              otherRole,
              Utils.join(",", other.COVARIATES));
    } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) {
      beforeNames.removeAll(intersect);
      afterNames.removeAll(intersect);
      diffMessage =
          String.format(
              "There are differences in the set of covariates requested in the"
                  + " '%s' and '%s' recalibrator reports. "
                  + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",
              thisRole,
              otherRole,
              thisRole,
              Utils.join(", ", beforeNames),
              otherRole,
              Utils.join(", ", afterNames));
    }
    if (diffMessage != null) {
      diffs.put("covariate", diffMessage);
      return true;
    } else {
      return false;
    }
  }
  /** Initialize the stratifications, evaluations, evaluation contexts, and reporting object */
  public void initialize() {
    // Just list the modules, and exit quickly.
    if (LIST) {
      variantEvalUtils.listModulesAndExit();
    }

    // maintain the full list of comps
    comps.addAll(compsProvided);
    if (dbsnp.dbsnp.isBound()) {
      comps.add(dbsnp.dbsnp);
      knowns.add(dbsnp.dbsnp);
    }

    // Add a dummy comp track if none exists
    if (comps.size() == 0)
      comps.add(
          new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags()));

    // Set up set of additional knowns
    for (RodBinding<VariantContext> compRod : comps) {
      if (KNOWN_NAMES.contains(compRod.getName())) knowns.add(compRod);
    }

    // Now that we have all the rods categorized, determine the sample list from the eval rods.
    Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
    Set<String> vcfSamples =
        SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

    // Load the sample list
    sampleNamesForEvaluation.addAll(
        SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
    numSamples = NUM_SAMPLES > 0 ? NUM_SAMPLES : sampleNamesForEvaluation.size();

    if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
      sampleNamesForStratification.addAll(sampleNamesForEvaluation);
    }
    sampleNamesForStratification.add(ALL_SAMPLE_NAME);

    // Initialize select expressions
    for (VariantContextUtils.JexlVCMatchExp jexl :
        VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
      SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
      jexlExpressions.add(sjexl);
    }

    // Initialize the set of stratifications and evaluations to use
    stratificationObjects =
        variantEvalUtils.initializeStratificationObjects(
            this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
    Set<Class<? extends VariantEvaluator>> evaluationObjects =
        variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
    for (VariantStratifier vs : getStratificationObjects()) {
      if (vs.getName().equals("Filter")) byFilterIsEnabled = true;
      else if (vs.getName().equals("Sample")) perSampleIsEnabled = true;
    }

    if (intervalsFile != null) {
      boolean fail = true;
      for (final VariantStratifier vs : stratificationObjects) {
        if (vs.getClass().equals(IntervalStratification.class)) fail = false;
      }
      if (fail)
        throw new UserException.BadArgumentValue(
            "ST", "stratIntervals argument provided but -ST IntervalStratification not provided");
    }

    // Initialize the evaluation contexts
    evaluationContexts =
        variantEvalUtils.initializeEvaluationContexts(
            stratificationObjects, evaluationObjects, null, null);

    // Initialize report table
    report = variantEvalUtils.initializeGATKReport(stratificationObjects, evaluationObjects);

    // Load ancestral alignments
    if (ancestralAlignmentsFile != null) {
      try {
        ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile);
      } catch (FileNotFoundException e) {
        throw new ReviewedStingException(
            String.format(
                "The ancestral alignments file, '%s', could not be found",
                ancestralAlignmentsFile.getAbsolutePath()));
      }
    }

    // initialize CNVs
    if (knownCNVsFile != null) {
      knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile);
    }
  }
Exemplo n.º 7
0
  /** Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */
  public void initialize() {
    // Get list of samples to include in the output
    List<String> rodNames = Arrays.asList(variantCollection.variants.getName());

    Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
    TreeSet<String> vcfSamples =
        new TreeSet<String>(
            SampleUtils.getSampleList(
                vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));

    Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
    Collection<String> samplesFromExpressions =
        SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);

    // first, add any requested samples
    samples.addAll(samplesFromFile);
    samples.addAll(samplesFromExpressions);
    samples.addAll(sampleNames);

    // if none were requested, we want all of them
    if (samples.isEmpty()) {
      samples.addAll(vcfSamples);
      NO_SAMPLES_SPECIFIED = true;
    }

    // now, exclude any requested samples
    Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
    samples.removeAll(XLsamplesFromFile);
    samples.removeAll(XLsampleNames);

    if (samples.size() == 0 && !NO_SAMPLES_SPECIFIED)
      throw new UserException(
          "All samples requested to be included were also requested to be excluded.");

    for (String sample : samples) logger.info("Including sample '" + sample + "'");

    // if user specified types to include, add these, otherwise, add all possible variant context
    // types to list of vc types to include
    if (TYPES_TO_INCLUDE.isEmpty()) {

      for (VariantContext.Type t : VariantContext.Type.values()) selectedTypes.add(t);

    } else {
      for (VariantContext.Type t : TYPES_TO_INCLUDE) selectedTypes.add(t);
    }
    // Initialize VCF header
    Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
    headerLines.add(new VCFHeaderLine("source", "SelectVariants"));

    if (KEEP_ORIGINAL_CHR_COUNTS) {
      headerLines.add(
          new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
      headerLines.add(
          new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
      headerLines.add(
          new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
    }
    vcfWriter.writeHeader(new VCFHeader(headerLines, samples));

    for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {
      // It's not necessary that the user supply select names for the JEXL expressions, since those
      // expressions will only be needed for omitting records.  Make up the select names here.
      selectNames.add(String.format("select-%d", i));
    }

    jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS);

    // Look at the parameters to decide which analysis to perform
    DISCORDANCE_ONLY = discordanceTrack.isBound();
    if (DISCORDANCE_ONLY)
      logger.info(
          "Selecting only variants discordant with the track: " + discordanceTrack.getName());

    CONCORDANCE_ONLY = concordanceTrack.isBound();
    if (CONCORDANCE_ONLY)
      logger.info(
          "Selecting only variants concordant with the track: " + concordanceTrack.getName());

    if (MENDELIAN_VIOLATIONS) {
      if (FAMILY_STRUCTURE_FILE != null) {
        try {
          for (final String line : new XReadLines(FAMILY_STRUCTURE_FILE)) {
            MendelianViolation mv =
                new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
            if (samples.contains(mv.getSampleChild())
                && samples.contains(mv.getSampleDad())
                && samples.contains(mv.getSampleMom())) mvSet.add(mv);
          }
        } catch (FileNotFoundException e) {
          throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e);
        }
        if (outMVFile != null)
          try {
            outMVFileStream = new PrintStream(outMVFile);
          } catch (FileNotFoundException e) {
            throw new UserException.CouldNotCreateOutputFile(
                outMVFile, "Can't open output file", e);
          }
      } else
        mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
    } else if (!FAMILY_STRUCTURE.isEmpty()) {
      mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
      MENDELIAN_VIOLATIONS = true;
    }

    SELECT_RANDOM_NUMBER = numRandom > 0;
    if (SELECT_RANDOM_NUMBER) {
      logger.info("Selecting " + numRandom + " variants at random from the variant track");
      variantArray = new RandomVariantStructure[numRandom];
    }

    SELECT_RANDOM_FRACTION = fractionRandom > 0;
    if (SELECT_RANDOM_FRACTION)
      logger.info(
          "Selecting approximately "
              + 100.0 * fractionRandom
              + "% of the variants at random from the variant track");
  }