// decide whether we are currently processing SNPs, indels, neither, or both
  private List<GenotypeLikelihoodsCalculationModel.Model> getGLModelsToUse(
      final RefMetaDataTracker tracker,
      final ReferenceContext refContext,
      final AlignmentContext rawContext) {

    final List<GenotypeLikelihoodsCalculationModel.Model> models =
        new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(2);
    String modelPrefix = "";
    if (UAC.GLmodel.name().toUpperCase().contains("BOTH"))
      modelPrefix = UAC.GLmodel.name().toUpperCase().replaceAll("BOTH", "");

    if (!UAC.GLmodel.name().contains(GPSTRING)
        && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
      modelPrefix = GPSTRING + modelPrefix;

    // if we're genotyping given alleles and we have a requested SNP at this position, do SNP
    if (UAC.GenotypingMode
        == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
      final VariantContext vcInput =
          getVCFromAllelesRod(
              tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
      if (vcInput == null) return models;

      if (vcInput.isSNP()) {
        // ignore SNPs if the user chose INDEL mode only
        if (UAC.GLmodel.name().toUpperCase().contains("BOTH")
            || UAC.GLmodel.name().toUpperCase().contains("SNP"))
          models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix + "SNP"));
      } else if (vcInput.isIndel() || vcInput.isMixed()) {
        // ignore INDELs if the user chose SNP mode only
        if (UAC.GLmodel.name().toUpperCase().contains("BOTH")
            || UAC.GLmodel.name().toUpperCase().contains("INDEL"))
          models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix + "INDEL"));
      }
      // No support for other types yet
    } else {
      if (UAC.GLmodel.name().toUpperCase().contains("BOTH")) {
        models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix + "SNP"));
        models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix + "INDEL"));
      } else {
        models.add(
            GenotypeLikelihoodsCalculationModel.Model.valueOf(
                modelPrefix + UAC.GLmodel.name().toUpperCase()));
      }
    }

    return models;
  }
Esempio n. 2
0
  static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) {
    Map<String, Object> mergedAttribs = new HashMap<String, Object>();

    List<VariantContext> vcList = new LinkedList<VariantContext>();
    vcList.add(vc1);
    vcList.add(vc2);

    String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY};
    for (String orAttrib : MERGE_OR_ATTRIBS) {
      boolean attribVal = false;
      for (VariantContext vc : vcList) {
        attribVal = vc.getAttributeAsBoolean(orAttrib, false);
        if (attribVal) // already true, so no reason to continue:
        break;
      }
      mergedAttribs.put(orAttrib, attribVal);
    }

    return mergedAttribs;
  }
Esempio n. 3
0
  /**
   * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep
   * or all sites (if this is empty)
   *
   * @param reader a just-opened reader sitting at the start of the file
   * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should
   *     be kept
   * @param parser a genome loc parser to create genome locs
   * @return a list of ExactCall objects in reader
   * @throws IOException
   */
  public static List<ExactCall> readExactLog(
      final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser)
      throws IOException {
    if (reader == null) throw new IllegalArgumentException("reader cannot be null");
    if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null");
    if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null");

    List<ExactCall> calls = new LinkedList<ExactCall>();

    // skip the header line
    reader.readLine();

    // skip the first "type" line
    reader.readLine();

    while (true) {
      final VariantContextBuilder builder = new VariantContextBuilder();
      final List<Allele> alleles = new ArrayList<Allele>();
      final List<Genotype> genotypes = new ArrayList<Genotype>();
      final double[] posteriors = new double[2];
      final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true);
      final List<Integer> mle = new ArrayList<Integer>();
      final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>();
      long runtimeNano = -1;

      GenomeLoc currentLoc = null;
      while (true) {
        final String line = reader.readLine();
        if (line == null) return calls;

        final String[] parts = line.split("\t");
        final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]);
        final String variable = parts[1];
        final String key = parts[2];
        final String value = parts[3];

        if (currentLoc == null) currentLoc = lineLoc;

        if (variable.equals("type")) {
          if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) {
            builder.alleles(alleles);
            final int stop = currentLoc.getStart() + alleles.get(0).length() - 1;
            builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop);
            builder.genotypes(genotypes);
            final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {}));
            final AFCalcResult result =
                new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele);
            calls.add(new ExactCall(builder.make(), runtimeNano, result));
          }
          break;
        } else if (variable.equals("allele")) {
          final boolean isRef = key.equals("0");
          alleles.add(Allele.create(value, isRef));
        } else if (variable.equals("PL")) {
          final GenotypeBuilder gb = new GenotypeBuilder(key);
          gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs());
          genotypes.add(gb.make());
        } else if (variable.equals("log10PosteriorOfAFEq0")) {
          posteriors[0] = Double.valueOf(value);
        } else if (variable.equals("log10PosteriorOfAFGt0")) {
          posteriors[1] = Double.valueOf(value);
        } else if (variable.equals("MLE")) {
          mle.add(Integer.valueOf(value));
        } else if (variable.equals("pNonRefByAllele")) {
          final Allele a = Allele.create(key);
          log10pNonRefByAllele.put(a, Double.valueOf(value));
        } else if (variable.equals("runtime.nano")) {
          runtimeNano = Long.valueOf(value);
        } else {
          // nothing to do
        }
      }
    }
  }
Esempio n. 4
0
  static VariantContext reallyMergeIntoMNP(
      VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
    int startInter = vc1.getEnd() + 1;
    int endInter = vc2.getStart() - 1;
    byte[] intermediateBases = null;
    if (startInter <= endInter) {
      intermediateBases =
          referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
      StringUtil.toUpperCase(intermediateBases);
    }
    MergedAllelesData mergeData =
        new MergedAllelesData(
            intermediateBases, vc1, vc2); // ensures that the reference allele is added

    GenotypesContext mergedGenotypes = GenotypesContext.create();
    for (final Genotype gt1 : vc1.getGenotypes()) {
      Genotype gt2 = vc2.getGenotype(gt1.getSampleName());

      List<Allele> site1Alleles = gt1.getAlleles();
      List<Allele> site2Alleles = gt2.getAlleles();

      List<Allele> mergedAllelesForSample = new LinkedList<Allele>();

      /* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records,
        we preserve phase information (if any) relative to whatever precedes vc1:
      */
      Iterator<Allele> all2It = site2Alleles.iterator();
      for (Allele all1 : site1Alleles) {
        Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()

        Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
        mergedAllelesForSample.add(mergedAllele);
      }

      double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
      Set<String> mergedGtFilters =
          new HashSet<
              String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered

      Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
      PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
      if (phaseQual.PQ != null) mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);

      Genotype mergedGt =
          new Genotype(
              gt1.getSampleName(),
              mergedAllelesForSample,
              mergedGQ,
              mergedGtFilters,
              mergedGtAttribs,
              phaseQual.isPhased);
      mergedGenotypes.add(mergedGt);
    }

    String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource());
    double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError());
    Set<String> mergedFilters =
        new HashSet<
            String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
    Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2);

    // ids
    List<String> mergedIDs = new ArrayList<String>();
    if (vc1.hasID()) mergedIDs.add(vc1.getID());
    if (vc2.hasID()) mergedIDs.add(vc2.getID());
    String mergedID =
        mergedIDs.isEmpty()
            ? VCFConstants.EMPTY_ID_FIELD
            : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);

    VariantContextBuilder mergedBuilder =
        new VariantContextBuilder(
                mergedName,
                vc1.getChr(),
                vc1.getStart(),
                vc2.getEnd(),
                mergeData.getAllMergedAlleles())
            .id(mergedID)
            .genotypes(mergedGenotypes)
            .log10PError(mergedLog10PError)
            .filters(mergedFilters)
            .attributes(mergedAttribs);
    VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true);
    return mergedBuilder.make();
  }
  /**
   * Main entry function to calculate genotypes of a given VC with corresponding GL's
   *
   * @param tracker Tracker
   * @param refContext Reference context
   * @param rawContext Raw context
   * @param stratifiedContexts Stratified alignment contexts
   * @param vc Input VC
   * @param model GL calculation model
   * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc
   * @return VC with assigned genotypes
   */
  public VariantCallContext calculateGenotypes(
      final RefMetaDataTracker tracker,
      final ReferenceContext refContext,
      final AlignmentContext rawContext,
      Map<String, AlignmentContext> stratifiedContexts,
      final VariantContext vc,
      final GenotypeLikelihoodsCalculationModel.Model model,
      final boolean inheritAttributesFromInputVC,
      final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap>
          perReadAlleleLikelihoodMap) {

    boolean limitedContext =
        tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;

    // initialize the data for this thread if that hasn't been done yet
    if (afcm.get() == null) {
      afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
    }

    // estimate our confidence in a reference call and return
    if (vc.getNSamples() == 0) {
      if (limitedContext) return null;
      return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
          ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0)
          : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
    }

    AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));

    // is the most likely frequency conformation AC=0 for all alternate alleles?
    boolean bestGuessIsRef = true;

    // determine which alternate alleles have AF>0
    final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
    final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size());
    myAlleles.add(vc.getReference());
    for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) {
      final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
      if (alternateAllele.isReference()) continue;

      // we are non-ref if the probability of being non-ref > the emit confidence.
      // the emit confidence is phred-scaled, say 30 => 10^-3.
      // the posterior AF > 0 is log10: -5 => 10^-5
      // we are non-ref if 10^-5 < 10^-3 => -5 < -3
      final boolean isNonRef =
          AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0);

      // if the most likely AC is not 0, then this is a good alternate allele to use
      if (isNonRef) {
        myAlleles.add(alternateAllele);
        alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
        bestGuessIsRef = false;
      }
      // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
      else if (UAC.GenotypingMode
          == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
        myAlleles.add(alternateAllele);
        alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
      }
    }

    final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());

    // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
    final double phredScaledConfidence =
        Math.abs(
            !bestGuessIsRef
                    || UAC.GenotypingMode
                        == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
                            .GENOTYPE_GIVEN_ALLELES
                ? -10 * AFresult.getLog10PosteriorOfAFEq0()
                : -10 * AFresult.getLog10PosteriorOfAFGT0());

    // return a null call if we don't pass the confidence cutoff or the most likely allele frequency
    // is zero
    if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
        && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) {
      // technically, at this point our confidence in a reference call isn't accurately estimated
      //  because it didn't take into account samples with no data, so let's get a better estimate
      return limitedContext
          ? null
          : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
    }

    // start constructing the resulting VC
    final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
    final VariantContextBuilder builder =
        new VariantContextBuilder(
            "UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
    builder.log10PError(phredScaledConfidence / -10.0);
    if (!passesCallThreshold(phredScaledConfidence)) builder.filters(filter);

    // create the genotypes
    final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy);
    builder.genotypes(genotypes);

    // print out stats if we have a writer
    if (verboseWriter != null && !limitedContext)
      printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);

    // *** note that calculating strand bias involves overwriting data structures, so we do that
    // last
    final HashMap<String, Object> attributes = new HashMap<String, Object>();

    // inherit attributed from input vc if requested
    if (inheritAttributesFromInputVC) attributes.putAll(vc.getAttributes());
    // if the site was downsampled, record that fact
    if (!limitedContext && rawContext.hasPileupBeenDownsampled())
      attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);

    if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED)
      attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());

    // add the MLE AC and AF annotations
    if (alleleCountsofMLE.size() > 0) {
      attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
      final int AN = builder.make().getCalledChrCount();
      final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
      // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT
      // is ./. but the exact model may arbitrarily choose an AC>1)
      for (int AC : alleleCountsofMLE) MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN));
      attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
    }

    if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) {
      // final boolean DEBUG_SLOD = false;

      // the overall lod
      // double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
      double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);

      List<Allele> allAllelesToUse = builder.make().getAlleles();

      // the forward lod
      VariantContext vcForward =
          calculateLikelihoods(
              tracker,
              refContext,
              stratifiedContexts,
              AlignmentContextUtils.ReadOrientation.FORWARD,
              allAllelesToUse,
              false,
              model,
              perReadAlleleLikelihoodMap);
      AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
      // double[] normalizedLog10Posteriors =
      // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
      double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
      double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ",
      // forwardLog10PofF=" + forwardLog10PofF);

      // the reverse lod
      VariantContext vcReverse =
          calculateLikelihoods(
              tracker,
              refContext,
              stratifiedContexts,
              AlignmentContextUtils.ReadOrientation.REVERSE,
              allAllelesToUse,
              false,
              model,
              perReadAlleleLikelihoodMap);
      AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
      // normalizedLog10Posteriors =
      // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
      double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
      double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ",
      // reverseLog10PofF=" + reverseLog10PofF);

      double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
      double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
      // if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" +
      // reverseLod);

      // strand score is max bias between forward and reverse strands
      double strandScore = Math.max(forwardLod, reverseLod);
      // rescale by a factor of 10
      strandScore *= 10.0;
      // logger.debug(String.format("SLOD=%f", strandScore));

      if (!Double.isNaN(strandScore)) attributes.put("SB", strandScore);
    }

    // finish constructing the resulting VC
    builder.attributes(attributes);
    VariantContext vcCall = builder.make();

    // if we are subsetting alleles (either because there were too many or because some were not
    // polymorphic)
    // then we may need to trim the alleles (because the original VariantContext may have had to pad
    // at the end).
    if (myAlleles.size() != vc.getAlleles().size()
        && !limitedContext) // limitedContext callers need to handle allele trimming on their own to
                            // keep their perReadAlleleLikelihoodMap alleles in sync
    vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);

    if (annotationEngine != null
        && !limitedContext) { // limitedContext callers need to handle annotations on their own by
                              // calling their own annotationEngine
      // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
      final ReadBackedPileup pileup = rawContext.getBasePileup();
      stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

      vcCall =
          annotationEngine.annotateContext(
              tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
    }

    return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
  }
  /**
   * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper.
   *
   * <p>If allSamples != null, then the output variantCallContext is guarenteed to contain a
   * genotype for every sample in allSamples. If it's null there's no such guarentee. Providing this
   * argument is critical when the resulting calls will be written to a VCF file.
   *
   * @param tracker the meta data tracker
   * @param refContext the reference base
   * @param rawContext contextual information around the locus
   * @param allSamples set of all sample names that we might call (i.e., those in the VCF header)
   * @return the VariantCallContext object
   */
  public List<VariantCallContext> calculateLikelihoodsAndGenotypes(
      final RefMetaDataTracker tracker,
      final ReferenceContext refContext,
      final AlignmentContext rawContext,
      final Set<String> allSamples) {
    final List<VariantCallContext> results = new ArrayList<VariantCallContext>(2);

    final List<GenotypeLikelihoodsCalculationModel.Model> models =
        getGLModelsToUse(tracker, refContext, rawContext);

    final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap>
        perReadAlleleLikelihoodMap =
            new HashMap<
                String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap>();

    if (models.isEmpty()) {
      results.add(
          UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES
                  && UAC.GenotypingMode
                      == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
              ? generateEmptyContext(tracker, refContext, null, rawContext)
              : null);
    } else {
      for (final GenotypeLikelihoodsCalculationModel.Model model : models) {
        perReadAlleleLikelihoodMap.clear();
        final Map<String, AlignmentContext> stratifiedContexts =
            getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model);
        if (stratifiedContexts == null) {
          results.add(
              UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES
                      && UAC.GenotypingMode
                          == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
                              .GENOTYPE_GIVEN_ALLELES
                  ? generateEmptyContext(tracker, refContext, null, rawContext)
                  : null);
        } else {
          final VariantContext vc =
              calculateLikelihoods(
                  tracker,
                  refContext,
                  stratifiedContexts,
                  AlignmentContextUtils.ReadOrientation.COMPLETE,
                  null,
                  true,
                  model,
                  perReadAlleleLikelihoodMap);
          if (vc != null)
            results.add(
                calculateGenotypes(
                    tracker,
                    refContext,
                    rawContext,
                    stratifiedContexts,
                    vc,
                    model,
                    true,
                    perReadAlleleLikelihoodMap));
        }
      }
    }

    return results;
  }