Ejemplo n.º 1
0
 /**
  * return a deep copy of this collection.
  *
  * @return a new GenomeLocSortedSet, identical to the current GenomeLocSortedSet.
  */
 public GenomeLocSortedSet clone() {
   GenomeLocSortedSet ret = new GenomeLocSortedSet(genomeLocParser);
   for (GenomeLoc loc : this.mArray) {
     // ensure a deep copy
     ret.mArray.add(
         genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop()));
   }
   return ret;
 }
Ejemplo n.º 2
0
  public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
    LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
    Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
    Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();

    // initialize the stacks
    toProcess.addAll(mArray);
    Collections.reverse(toProcess);
    toExclude.addAll(toRemoveSet.mArray);
    Collections.reverse(toExclude);

    int i = 0;
    while (!toProcess.empty()) { // while there's still stuff to process
      if (toExclude.empty()) {
        good.addAll(toProcess); // no more excludes, all the processing stuff is good
        break;
      }

      GenomeLoc p = toProcess.peek();
      GenomeLoc e = toExclude.peek();

      if (p.overlapsP(e)) {
        toProcess.pop();
        for (GenomeLoc newP : p.subtract(e)) toProcess.push(newP);
      } else if (p.compareContigs(e) < 0) {
        good.add(toProcess.pop()); // p is now good
      } else if (p.compareContigs(e) > 0) {
        toExclude.pop(); // e can't effect anything
      } else if (p.getStop() < e.getStart()) {
        good.add(toProcess.pop()); // p stops before e starts, p is good
      } else if (e.getStop() < p.getStart()) {
        toExclude.pop(); // p starts after e stops, e is done
      } else {
        throw new ReviewedStingException("BUG: unexpected condition: p=" + p + ", e=" + e);
      }

      if (i++ % 10000 == 0) logger.debug("removeRegions operation: i = " + i);
    }

    return createSetFromList(genomeLocParser, good);
  }
Ejemplo n.º 3
0
  @Test
  public void deleteSuperRegion() {
    GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 10, 20);
    GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 70, 100);
    mSortedSet.add(g);
    mSortedSet.addRegion(e);
    assertTrue(mSortedSet.size() == 2);
    // now delete a region
    GenomeLoc d = genomeLocParser.createGenomeLoc(contigOneName, 15, 75);
    mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, d));
    Iterator<GenomeLoc> iter = mSortedSet.iterator();
    GenomeLoc loc = iter.next();
    assertTrue(loc.getStart() == 10);
    assertTrue(loc.getStop() == 14);
    assertTrue(loc.getContigIndex() == 1);

    loc = iter.next();
    assertTrue(loc.getStart() == 76);
    assertTrue(loc.getStop() == 100);
    assertTrue(loc.getContigIndex() == 1);
  }
Ejemplo n.º 4
0
 @Test
 public void deleteSomeByRegion() {
   GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 1, 100);
   mSortedSet.add(e);
   for (int x = 1; x < 50; x++) {
     GenomeLoc del = genomeLocParser.createGenomeLoc(contigOneName, x, x);
     mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, del));
   }
   assertTrue(!mSortedSet.isEmpty());
   assertTrue(mSortedSet.size() == 1);
   GenomeLoc loc = mSortedSet.iterator().next();
   assertTrue(loc.getStop() == 100);
   assertTrue(loc.getStart() == 50);
 }
Ejemplo n.º 5
0
 @Test
 public void fromSequenceDictionary() {
   mSortedSet =
       GenomeLocSortedSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary());
   // we should have sequence
   assertTrue(mSortedSet.size() == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES);
   int seqNumber = 0;
   for (GenomeLoc loc : mSortedSet) {
     assertTrue(loc.getStart() == 1);
     assertTrue(loc.getStop() == GenomeLocSortedSetUnitTest.CHROMOSOME_SIZE);
     assertTrue(loc.getContigIndex() == seqNumber);
     ++seqNumber;
   }
   assertTrue(seqNumber == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES);
 }
Ejemplo n.º 6
0
 @Test
 public void mergingOverlappingAbove() {
   GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50);
   GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 49, 100);
   assertTrue(mSortedSet.size() == 0);
   mSortedSet.add(g);
   assertTrue(mSortedSet.size() == 1);
   mSortedSet.addRegion(e);
   assertTrue(mSortedSet.size() == 1);
   Iterator<GenomeLoc> iter = mSortedSet.iterator();
   GenomeLoc loc = iter.next();
   assertEquals(loc.getStart(), 0);
   assertEquals(loc.getStop(), 100);
   assertEquals(loc.getContigIndex(), 1);
 }
Ejemplo n.º 7
0
  @DataProvider(name = "GetOverlapping")
  public Object[][] makeGetOverlappingTest() throws Exception {
    final GenomeLocParser genomeLocParser =
        new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference)));

    List<Object[]> tests = new ArrayList<Object[]>();

    final GenomeLoc prev1 = genomeLocParser.createGenomeLoc("19", 1, 10);
    final GenomeLoc prev2 = genomeLocParser.createGenomeLoc("19", 20, 50);
    final GenomeLoc post1 = genomeLocParser.createGenomeLoc("21", 1, 10);
    final GenomeLoc post2 = genomeLocParser.createGenomeLoc("21", 20, 50);

    final int chr20Length = genomeLocParser.getContigs().getSequence("20").getSequenceLength();
    for (final int regionStart : Arrays.asList(1, 10, chr20Length - 10, chr20Length)) {
      for (final int regionSize : Arrays.asList(1, 10, 100)) {
        final GenomeLoc region =
            genomeLocParser.createGenomeLocOnContig("20", regionStart, regionStart + regionSize);
        final GenomeLoc spanning =
            genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, region.getStop() + 10);
        final GenomeLoc before_into =
            genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, regionStart + 1);
        final GenomeLoc middle =
            genomeLocParser.createGenomeLocOnContig("20", regionStart + 1, regionStart + 2);
        final GenomeLoc middle_past =
            genomeLocParser.createGenomeLocOnContig(
                "20", region.getStop() - 1, region.getStop() + 10);

        final List<GenomeLoc> potentials = new LinkedList<GenomeLoc>();
        potentials.add(region);
        if (spanning != null) potentials.add(spanning);
        if (before_into != null) potentials.add(before_into);
        if (middle != null) potentials.add(middle);
        if (middle_past != null) potentials.add(middle_past);

        for (final int n : Arrays.asList(1, 2, 3)) {
          for (final List<GenomeLoc> regions : Utils.makePermutations(potentials, n, false)) {
            tests.add(new Object[] {new GenomeLocSortedSet(genomeLocParser, regions), region});
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1)), region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, prev2)),
                  region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1)), region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1, post2)),
                  region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, post1)),
                  region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(
                      genomeLocParser, Utils.append(regions, prev1, prev2, post1, post2)),
                  region
                });
          }
        }
      }
    }

    return tests.toArray(new Object[][] {});
  }
Ejemplo n.º 8
0
  /**
   * Main entry function to calculate genotypes of a given VC with corresponding GL's
   *
   * @param tracker Tracker
   * @param refContext Reference context
   * @param rawContext Raw context
   * @param stratifiedContexts Stratified alignment contexts
   * @param vc Input VC
   * @param model GL calculation model
   * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc
   * @return VC with assigned genotypes
   */
  public VariantCallContext calculateGenotypes(
      final RefMetaDataTracker tracker,
      final ReferenceContext refContext,
      final AlignmentContext rawContext,
      Map<String, AlignmentContext> stratifiedContexts,
      final VariantContext vc,
      final GenotypeLikelihoodsCalculationModel.Model model,
      final boolean inheritAttributesFromInputVC,
      final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap>
          perReadAlleleLikelihoodMap) {

    boolean limitedContext =
        tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;

    // initialize the data for this thread if that hasn't been done yet
    if (afcm.get() == null) {
      afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
    }

    // estimate our confidence in a reference call and return
    if (vc.getNSamples() == 0) {
      if (limitedContext) return null;
      return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
          ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0)
          : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
    }

    AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));

    // is the most likely frequency conformation AC=0 for all alternate alleles?
    boolean bestGuessIsRef = true;

    // determine which alternate alleles have AF>0
    final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
    final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size());
    myAlleles.add(vc.getReference());
    for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) {
      final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
      if (alternateAllele.isReference()) continue;

      // we are non-ref if the probability of being non-ref > the emit confidence.
      // the emit confidence is phred-scaled, say 30 => 10^-3.
      // the posterior AF > 0 is log10: -5 => 10^-5
      // we are non-ref if 10^-5 < 10^-3 => -5 < -3
      final boolean isNonRef =
          AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0);

      // if the most likely AC is not 0, then this is a good alternate allele to use
      if (isNonRef) {
        myAlleles.add(alternateAllele);
        alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
        bestGuessIsRef = false;
      }
      // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
      else if (UAC.GenotypingMode
          == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
        myAlleles.add(alternateAllele);
        alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
      }
    }

    final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());

    // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
    final double phredScaledConfidence =
        Math.abs(
            !bestGuessIsRef
                    || UAC.GenotypingMode
                        == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
                            .GENOTYPE_GIVEN_ALLELES
                ? -10 * AFresult.getLog10PosteriorOfAFEq0()
                : -10 * AFresult.getLog10PosteriorOfAFGT0());

    // return a null call if we don't pass the confidence cutoff or the most likely allele frequency
    // is zero
    if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
        && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) {
      // technically, at this point our confidence in a reference call isn't accurately estimated
      //  because it didn't take into account samples with no data, so let's get a better estimate
      return limitedContext
          ? null
          : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
    }

    // start constructing the resulting VC
    final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
    final VariantContextBuilder builder =
        new VariantContextBuilder(
            "UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
    builder.log10PError(phredScaledConfidence / -10.0);
    if (!passesCallThreshold(phredScaledConfidence)) builder.filters(filter);

    // create the genotypes
    final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy);
    builder.genotypes(genotypes);

    // print out stats if we have a writer
    if (verboseWriter != null && !limitedContext)
      printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);

    // *** note that calculating strand bias involves overwriting data structures, so we do that
    // last
    final HashMap<String, Object> attributes = new HashMap<String, Object>();

    // inherit attributed from input vc if requested
    if (inheritAttributesFromInputVC) attributes.putAll(vc.getAttributes());
    // if the site was downsampled, record that fact
    if (!limitedContext && rawContext.hasPileupBeenDownsampled())
      attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);

    if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED)
      attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());

    // add the MLE AC and AF annotations
    if (alleleCountsofMLE.size() > 0) {
      attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
      final int AN = builder.make().getCalledChrCount();
      final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
      // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT
      // is ./. but the exact model may arbitrarily choose an AC>1)
      for (int AC : alleleCountsofMLE) MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN));
      attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
    }

    if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) {
      // final boolean DEBUG_SLOD = false;

      // the overall lod
      // double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
      double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);

      List<Allele> allAllelesToUse = builder.make().getAlleles();

      // the forward lod
      VariantContext vcForward =
          calculateLikelihoods(
              tracker,
              refContext,
              stratifiedContexts,
              AlignmentContextUtils.ReadOrientation.FORWARD,
              allAllelesToUse,
              false,
              model,
              perReadAlleleLikelihoodMap);
      AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
      // double[] normalizedLog10Posteriors =
      // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
      double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
      double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ",
      // forwardLog10PofF=" + forwardLog10PofF);

      // the reverse lod
      VariantContext vcReverse =
          calculateLikelihoods(
              tracker,
              refContext,
              stratifiedContexts,
              AlignmentContextUtils.ReadOrientation.REVERSE,
              allAllelesToUse,
              false,
              model,
              perReadAlleleLikelihoodMap);
      AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
      // normalizedLog10Posteriors =
      // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
      double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
      double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ",
      // reverseLog10PofF=" + reverseLog10PofF);

      double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
      double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
      // if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" +
      // reverseLod);

      // strand score is max bias between forward and reverse strands
      double strandScore = Math.max(forwardLod, reverseLod);
      // rescale by a factor of 10
      strandScore *= 10.0;
      // logger.debug(String.format("SLOD=%f", strandScore));

      if (!Double.isNaN(strandScore)) attributes.put("SB", strandScore);
    }

    // finish constructing the resulting VC
    builder.attributes(attributes);
    VariantContext vcCall = builder.make();

    // if we are subsetting alleles (either because there were too many or because some were not
    // polymorphic)
    // then we may need to trim the alleles (because the original VariantContext may have had to pad
    // at the end).
    if (myAlleles.size() != vc.getAlleles().size()
        && !limitedContext) // limitedContext callers need to handle allele trimming on their own to
                            // keep their perReadAlleleLikelihoodMap alleles in sync
    vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);

    if (annotationEngine != null
        && !limitedContext) { // limitedContext callers need to handle annotations on their own by
                              // calling their own annotationEngine
      // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
      final ReadBackedPileup pileup = rawContext.getBasePileup();
      stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

      vcCall =
          annotationEngine.annotateContext(
              tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
    }

    return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
  }