Beispiel #1
0
  /**
   * Test the reads according to an independently derived context.
   *
   * @param view
   * @param range
   * @param reads
   */
  @Override
  protected void testReadsInContext(
      LocusView view, List<GenomeLoc> range, List<GATKSAMRecord> reads) {
    AllLocusView allLocusView = (AllLocusView) view;

    // TODO: Should skip over loci not in the given range.
    GenomeLoc firstLoc = range.get(0);
    GenomeLoc lastLoc = range.get(range.size() - 1);
    GenomeLoc bounds =
        genomeLocParser.createGenomeLoc(
            firstLoc.getContig(), firstLoc.getStart(), lastLoc.getStop());

    for (int i = bounds.getStart(); i <= bounds.getStop(); i++) {
      GenomeLoc site = genomeLocParser.createGenomeLoc("chr1", i);
      AlignmentContext locusContext = allLocusView.next();
      Assert.assertEquals(locusContext.getLocation(), site, "Locus context location is incorrect");
      int expectedReadsAtSite = 0;

      for (GATKSAMRecord read : reads) {
        if (genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) {
          Assert.assertTrue(
              locusContext.getReads().contains(read),
              "Target locus context does not contain reads");
          expectedReadsAtSite++;
        }
      }

      Assert.assertEquals(
          locusContext.getReads().size(),
          expectedReadsAtSite,
          "Found wrong number of reads at site");
    }
  }
 private GenomeLoc createIntervalAfter(GenomeLoc interval) {
   int contigLimit =
       getToolkit()
           .getSAMFileHeader()
           .getSequenceDictionary()
           .getSequence(interval.getContigIndex())
           .getSequenceLength();
   int start = Math.min(interval.getStop() + 1, contigLimit);
   int stop = Math.min(interval.getStop() + expandInterval, contigLimit);
   return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop);
 }
Beispiel #3
0
 /**
  * return a deep copy of this collection.
  *
  * @return a new GenomeLocSortedSet, identical to the current GenomeLocSortedSet.
  */
 public GenomeLocSortedSet clone() {
   GenomeLocSortedSet ret = new GenomeLocSortedSet(genomeLocParser);
   for (GenomeLoc loc : this.mArray) {
     // ensure a deep copy
     ret.mArray.add(
         genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop()));
   }
   return ret;
 }
Beispiel #4
0
  public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
    LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
    Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
    Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();

    // initialize the stacks
    toProcess.addAll(mArray);
    Collections.reverse(toProcess);
    toExclude.addAll(toRemoveSet.mArray);
    Collections.reverse(toExclude);

    int i = 0;
    while (!toProcess.empty()) { // while there's still stuff to process
      if (toExclude.empty()) {
        good.addAll(toProcess); // no more excludes, all the processing stuff is good
        break;
      }

      GenomeLoc p = toProcess.peek();
      GenomeLoc e = toExclude.peek();

      if (p.overlapsP(e)) {
        toProcess.pop();
        for (GenomeLoc newP : p.subtract(e)) toProcess.push(newP);
      } else if (p.compareContigs(e) < 0) {
        good.add(toProcess.pop()); // p is now good
      } else if (p.compareContigs(e) > 0) {
        toExclude.pop(); // e can't effect anything
      } else if (p.getStop() < e.getStart()) {
        good.add(toProcess.pop()); // p stops before e starts, p is good
      } else if (e.getStop() < p.getStart()) {
        toExclude.pop(); // p starts after e stops, e is done
      } else {
        throw new ReviewedStingException("BUG: unexpected condition: p=" + p + ", e=" + e);
      }

      if (i++ % 10000 == 0) logger.debug("removeRegions operation: i = " + i);
    }

    return createSetFromList(genomeLocParser, good);
  }
  @Test
  public void deleteSuperRegion() {
    GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 10, 20);
    GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 70, 100);
    mSortedSet.add(g);
    mSortedSet.addRegion(e);
    assertTrue(mSortedSet.size() == 2);
    // now delete a region
    GenomeLoc d = genomeLocParser.createGenomeLoc(contigOneName, 15, 75);
    mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, d));
    Iterator<GenomeLoc> iter = mSortedSet.iterator();
    GenomeLoc loc = iter.next();
    assertTrue(loc.getStart() == 10);
    assertTrue(loc.getStop() == 14);
    assertTrue(loc.getContigIndex() == 1);

    loc = iter.next();
    assertTrue(loc.getStart() == 76);
    assertTrue(loc.getStop() == 100);
    assertTrue(loc.getContigIndex() == 1);
  }
Beispiel #6
0
  /**
   * Determines what is the position of the read in relation to the interval. Note: This function
   * uses the UNCLIPPED ENDS of the reads for the comparison.
   *
   * @param read the read
   * @param interval the interval
   * @return the overlap type as described by ReadAndIntervalOverlap enum (see above)
   */
  public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(
      GATKSAMRecord read, GenomeLoc interval) {

    int sStart = read.getSoftStart();
    int sStop = read.getSoftEnd();
    int uStart = read.getUnclippedStart();
    int uStop = read.getUnclippedEnd();

    if (!read.getReferenceName().equals(interval.getContig()))
      return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;
    else if (uStop < interval.getStart()) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;
    else if (uStart > interval.getStop()) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;
    else if (sStop < interval.getStart()) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;
    else if (sStart > interval.getStop())
      return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;
    else if ((sStart >= interval.getStart()) && (sStop <= interval.getStop()))
      return ReadAndIntervalOverlap.OVERLAP_CONTAINED;
    else if ((sStart < interval.getStart()) && (sStop > interval.getStop()))
      return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;
    else if ((sStart < interval.getStart())) return ReadAndIntervalOverlap.OVERLAP_LEFT;
    else return ReadAndIntervalOverlap.OVERLAP_RIGHT;
  }
 @Test
 public void deleteSomeByRegion() {
   GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 1, 100);
   mSortedSet.add(e);
   for (int x = 1; x < 50; x++) {
     GenomeLoc del = genomeLocParser.createGenomeLoc(contigOneName, x, x);
     mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(genomeLocParser, del));
   }
   assertTrue(!mSortedSet.isEmpty());
   assertTrue(mSortedSet.size() == 1);
   GenomeLoc loc = mSortedSet.iterator().next();
   assertTrue(loc.getStop() == 100);
   assertTrue(loc.getStart() == 50);
 }
 @Test
 public void fromSequenceDictionary() {
   mSortedSet =
       GenomeLocSortedSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary());
   // we should have sequence
   assertTrue(mSortedSet.size() == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES);
   int seqNumber = 0;
   for (GenomeLoc loc : mSortedSet) {
     assertTrue(loc.getStart() == 1);
     assertTrue(loc.getStop() == GenomeLocSortedSetUnitTest.CHROMOSOME_SIZE);
     assertTrue(loc.getContigIndex() == seqNumber);
     ++seqNumber;
   }
   assertTrue(seqNumber == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES);
 }
 @Test
 public void mergingOverlappingAbove() {
   GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50);
   GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 49, 100);
   assertTrue(mSortedSet.size() == 0);
   mSortedSet.add(g);
   assertTrue(mSortedSet.size() == 1);
   mSortedSet.addRegion(e);
   assertTrue(mSortedSet.size() == 1);
   Iterator<GenomeLoc> iter = mSortedSet.iterator();
   GenomeLoc loc = iter.next();
   assertEquals(loc.getStart(), 0);
   assertEquals(loc.getStop(), 100);
   assertEquals(loc.getContigIndex(), 1);
 }
Beispiel #10
0
  private boolean overlapsKnownCNV(VariantContext cnv) {
    if (knownCNVs != null) {
      final GenomeLoc loc =
          getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true);
      IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig());

      final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt =
          intervalTree.overlappers(loc.getStart(), loc.getStop());
      while (nodeIt.hasNext()) {
        final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue());
        if (overlapP > MIN_CNV_OVERLAP) return true;
      }
    }

    return false;
  }
  public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(
      final IntervalBinding<Feature> intervals) {
    final Map<String, IntervalTree<GenomeLoc>> byContig =
        new HashMap<String, IntervalTree<GenomeLoc>>();

    final List<GenomeLoc> locs = intervals.getIntervals(getToolkit());

    // set up the map from contig -> interval tree
    for (final String contig : getContigNames())
      byContig.put(contig, new IntervalTree<GenomeLoc>());

    for (final GenomeLoc loc : locs) {
      byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc);
    }

    return byContig;
  }
Beispiel #12
0
  /**
   * Returns a new GenomeLoc that represents the region between the endpoints of this and that.
   * Requires that this and that GenomeLoc are both mapped.
   */
  @Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
  @Ensures("result != null")
  public GenomeLoc endpointSpan(GenomeLoc that) throws ReviewedStingException {
    if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
      throw new ReviewedStingException("Cannot get endpoint span for unmerged genome locs");
    }

    if (!this.getContig().equals(that.getContig())) {
      throw new ReviewedStingException(
          "Cannot get endpoint span for genome locs on different contigs");
    }

    return new GenomeLoc(
        getContig(),
        this.contigIndex,
        Math.min(getStart(), that.getStart()),
        Math.max(getStop(), that.getStop()));
  }
Beispiel #13
0
  /**
   * Returns a new GenomeLoc that represents the entire span of this and that. Requires that this
   * and that GenomeLoc are contiguous and both mapped
   */
  @Requires({"that != null", "isUnmapped(this) == isUnmapped(that)"})
  @Ensures("result != null")
  public GenomeLoc merge(GenomeLoc that) throws ReviewedStingException {
    if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
      if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
        throw new ReviewedStingException("Tried to merge a mapped and an unmapped genome loc");
      return UNMAPPED;
    }

    if (!(this.contiguousP(that))) {
      throw new ReviewedStingException("The two genome loc's need to be contigous");
    }

    return new GenomeLoc(
        getContig(),
        this.contigIndex,
        Math.min(getStart(), that.getStart()),
        Math.max(getStop(), that.getStop()));
  }
Beispiel #14
0
  @Requires("that != null")
  @Ensures("result != null")
  public GenomeLoc intersect(GenomeLoc that) throws ReviewedStingException {
    if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
      if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
        throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
      return UNMAPPED;
    }

    if (!(this.overlapsP(that))) {
      throw new ReviewedStingException(
          "GenomeLoc::intersect(): The two genome loc's need to overlap");
    }

    return new GenomeLoc(
        getContig(),
        this.contigIndex,
        Math.max(getStart(), that.getStart()),
        Math.min(getStop(), that.getStop()));
  }
  private T callWalkerMapOnActiveRegions(
      final ActiveRegionWalker<M, T> walker,
      T sum,
      final int minStart,
      final String currentContig) {
    // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can
    // unload those regions and process them
    // TODO can implement parallel traversal here
    while (workQueue.peek() != null) {
      final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
      if (extendedLoc.getStop() < minStart
          || (currentContig != null
              && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
        final ActiveRegion activeRegion = workQueue.remove();
        sum = processActiveRegion(activeRegion, myReads, workQueue, sum, walker);
      } else {
        break;
      }
    }

    return sum;
  }
Beispiel #16
0
  public Iterable<Shard> createShardsOverIntervals(
      final SAMDataSource readsDataSource,
      final GenomeLocSortedSet intervals,
      final int maxShardSize) {
    List<Shard> shards = new ArrayList<Shard>();

    for (GenomeLoc interval : intervals) {
      while (interval.size() > maxShardSize) {
        shards.add(
            new LocusShard(
                intervals.getGenomeLocParser(),
                readsDataSource,
                Collections.singletonList(
                    intervals
                        .getGenomeLocParser()
                        .createGenomeLoc(
                            interval.getContig(),
                            interval.getStart(),
                            interval.getStart() + maxShardSize - 1)),
                null));
        interval =
            intervals
                .getGenomeLocParser()
                .createGenomeLoc(
                    interval.getContig(), interval.getStart() + maxShardSize, interval.getStop());
      }
      shards.add(
          new LocusShard(
              intervals.getGenomeLocParser(),
              readsDataSource,
              Collections.singletonList(interval),
              null));
    }

    return shards;
  }
Beispiel #17
0
 @Requires("that != null")
 public final boolean containsP(GenomeLoc that) {
   return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop();
 }
  @DataProvider(name = "GetOverlapping")
  public Object[][] makeGetOverlappingTest() throws Exception {
    final GenomeLocParser genomeLocParser =
        new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference)));

    List<Object[]> tests = new ArrayList<Object[]>();

    final GenomeLoc prev1 = genomeLocParser.createGenomeLoc("19", 1, 10);
    final GenomeLoc prev2 = genomeLocParser.createGenomeLoc("19", 20, 50);
    final GenomeLoc post1 = genomeLocParser.createGenomeLoc("21", 1, 10);
    final GenomeLoc post2 = genomeLocParser.createGenomeLoc("21", 20, 50);

    final int chr20Length = genomeLocParser.getContigs().getSequence("20").getSequenceLength();
    for (final int regionStart : Arrays.asList(1, 10, chr20Length - 10, chr20Length)) {
      for (final int regionSize : Arrays.asList(1, 10, 100)) {
        final GenomeLoc region =
            genomeLocParser.createGenomeLocOnContig("20", regionStart, regionStart + regionSize);
        final GenomeLoc spanning =
            genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, region.getStop() + 10);
        final GenomeLoc before_into =
            genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, regionStart + 1);
        final GenomeLoc middle =
            genomeLocParser.createGenomeLocOnContig("20", regionStart + 1, regionStart + 2);
        final GenomeLoc middle_past =
            genomeLocParser.createGenomeLocOnContig(
                "20", region.getStop() - 1, region.getStop() + 10);

        final List<GenomeLoc> potentials = new LinkedList<GenomeLoc>();
        potentials.add(region);
        if (spanning != null) potentials.add(spanning);
        if (before_into != null) potentials.add(before_into);
        if (middle != null) potentials.add(middle);
        if (middle_past != null) potentials.add(middle_past);

        for (final int n : Arrays.asList(1, 2, 3)) {
          for (final List<GenomeLoc> regions : Utils.makePermutations(potentials, n, false)) {
            tests.add(new Object[] {new GenomeLocSortedSet(genomeLocParser, regions), region});
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1)), region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, prev2)),
                  region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1)), region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1, post2)),
                  region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, post1)),
                  region
                });
            tests.add(
                new Object[] {
                  new GenomeLocSortedSet(
                      genomeLocParser, Utils.append(regions, prev1, prev2, post1, post2)),
                  region
                });
          }
        }
      }
    }

    return tests.toArray(new Object[][] {});
  }
  @Override
  public T traverse(
      final ActiveRegionWalker<M, T> walker, final LocusShardDataProvider dataProvider, T sum) {
    logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

    final LocusView locusView = getLocusView(walker, dataProvider);
    final GenomeLocSortedSet initialIntervals = engine.getIntervals();

    final LocusReferenceView referenceView = new LocusReferenceView(walker, dataProvider);
    final int activeRegionExtension =
        walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
    final int maxRegionSize =
        walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();

    if (locusView
        .hasNext()) { // trivial optimization to avoid unnecessary processing when there's nothing
                      // here at all
      int minStart = Integer.MAX_VALUE;
      ActivityProfile profile =
          new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions());

      ReferenceOrderedView referenceOrderedDataView =
          getReferenceOrderedView(walker, dataProvider, locusView);

      // We keep processing while the next reference location is within the interval
      GenomeLoc prevLoc = null;
      while (locusView.hasNext()) {
        final AlignmentContext locus = locusView.next();
        GenomeLoc location = locus.getLocation();

        if (prevLoc != null) {
          // fill in the active / inactive labels from the stop of the previous location to the
          // start of this location
          // TODO refactor to separate function
          for (int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++) {
            final GenomeLoc fakeLoc =
                engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
            if (initialIntervals == null || initialIntervals.overlaps(fakeLoc)) {
              profile.add(
                  fakeLoc,
                  new ActivityProfileResult(
                      walker.hasPresetActiveRegions()
                              && walker.presetActiveRegions.overlaps(fakeLoc)
                          ? 1.0
                          : 0.0));
            }
          }
        }

        dataProvider.getShard().getReadMetrics().incrementNumIterations();

        // create reference context. Note that if we have a pileup of "extended events", the context
        // will
        // hold the (longest) stretch of deleted reference bases (if deletions are present in the
        // pileup).
        final ReferenceContext refContext = referenceView.getReferenceContext(location);

        // Iterate forward to get all reference ordered data covering this location
        final RefMetaDataTracker tracker =
            referenceOrderedDataView.getReferenceOrderedDataAtLocus(
                locus.getLocation(), refContext);

        // Call the walkers isActive function for this locus and add them to the list to be
        // integrated later
        if (initialIntervals == null || initialIntervals.overlaps(location)) {
          profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
        }

        // Grab all the previously unseen reads from this pileup and add them to the massive read
        // list
        for (final PileupElement p : locus.getBasePileup()) {
          final GATKSAMRecord read = p.getRead();
          if (!myReads.contains(read)) {
            myReads.add(read);
          }

          // If this is the last pileup for this shard calculate the minimum alignment start so that
          // we know
          // which active regions in the work queue are now safe to process
          minStart = Math.min(minStart, read.getAlignmentStart());
        }

        prevLoc = location;

        printProgress(locus.getLocation());
      }

      updateCumulativeMetrics(dataProvider.getShard());

      // Take the individual isActive calls and integrate them into contiguous active regions and
      // add these blocks of work to the work queue
      // band-pass filter the list of isActive probabilities and turn into active regions
      final ActivityProfile bandPassFiltered = profile.bandPassFilter();
      final List<ActiveRegion> activeRegions =
          bandPassFiltered.createActiveRegions(activeRegionExtension, maxRegionSize);

      // add active regions to queue of regions to process
      // first check if can merge active regions over shard boundaries
      if (!activeRegions.isEmpty()) {
        if (!workQueue.isEmpty()) {
          final ActiveRegion last = workQueue.getLast();
          final ActiveRegion first = activeRegions.get(0);
          if (last.isActive == first.isActive
              && last.getLocation().contiguousP(first.getLocation())
              && last.getLocation().size() + first.getLocation().size() <= maxRegionSize) {
            workQueue.removeLast();
            activeRegions.remove(first);
            workQueue.add(
                new ActiveRegion(
                    last.getLocation().union(first.getLocation()),
                    first.isActive,
                    this.engine.getGenomeLocParser(),
                    activeRegionExtension));
          }
        }
        workQueue.addAll(activeRegions);
      }

      logger.debug(
          "Integrated "
              + profile.size()
              + " isActive calls into "
              + activeRegions.size()
              + " regions.");

      // now go and process all of the active regions
      sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
    }

    return sum;
  }
Beispiel #20
0
  public Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    GenomeLoc cur = context.getLocation();

    if (verbose && showSkipped) {
      for (long i = context.getSkippedBases(); i >= 0; i--) {
        SAMSequenceDictionary dictionary =
            getToolkit().getReferenceDataSource().getReference().getSequenceDictionary();
        SAMSequenceRecord contig = dictionary.getSequence(cur.getContig());
        if (cur.getStop() < contig.getSequenceLength())
          cur = getToolkit().getGenomeLocParser().incPos(cur, 1);
        else
          cur =
              getToolkit()
                  .getGenomeLocParser()
                  .createGenomeLoc(
                      dictionary.getSequence(contig.getSequenceIndex() + 1).getSequenceName(),
                      1,
                      1);
        out.printf("%s: skipped%n", cur);
      }
    }

    long nRodsHere = 0;
    long nTotalBases = 0;

    if (ref == null) {
      // we're getting the last skipped update
      if (verbose)
        out.printf(
            "Last position was %s: skipping %d bases%n",
            context.getLocation(), context.getSkippedBases());
      nRodsHere = -1; // don't update this
      nTotalBases = context.getSkippedBases();
    } else {
      Collection<RODRecordList> rods = new LinkedList<RODRecordList>();
      for (RODRecordList rod : tracker.getBoundRodTracks()) {
        // System.out.printf("Considering rod %s%n", rod);
        if (rod.getLocation().getStart() == context.getLocation().getStart()
            && !rod.getName().equals("interval")) {
          // only consider the first element
          // System.out.printf("adding it%n");
          rods.add(rod);
        }
      }

      nRodsHere = rods.size();

      if (nRodsHere > 0) {
        if (verbose) {
          List<String> names = new ArrayList<String>();
          for (RODRecordList rod : rods) {
            names.add(rod.getName());
          }

          // System.out.printf("context is %s", context.getSkippedBases());
          out.printf(
              "At %s: found %d rod(s) [%s] after skipping %d bases%n",
              context.getLocation(), nRodsHere, Utils.join(",", names), context.getSkippedBases());
        }
      }

      nTotalBases = context.getSkippedBases() + 1;
    }

    return new Datum(nRodsHere, context.getSkippedBases(), nTotalBases);
  }
  /**
   * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority
   * order, if provided. If uniqifySamples is true, the priority order is ignored and names are
   * created by concatenating the VC name with the sample name
   *
   * @param genomeLocParser loc parser
   * @param unsortedVCs collection of unsorted VCs
   * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
   * @param filteredRecordMergeType merge type for filtered records
   * @param genotypeMergeOptions merge option for genotypes
   * @param annotateOrigin should we annotate the set it came from?
   * @param printMessages should we print messages?
   * @param setKey the key name of the set
   * @param filteredAreUncalled are filtered records uncalled?
   * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
   * @return new VariantContext representing the merge of unsortedVCs
   */
  public static VariantContext simpleMerge(
      final GenomeLocParser genomeLocParser,
      final Collection<VariantContext> unsortedVCs,
      final List<String> priorityListOfVCs,
      final FilteredRecordMergeType filteredRecordMergeType,
      final GenotypeMergeType genotypeMergeOptions,
      final boolean annotateOrigin,
      final boolean printMessages,
      final String setKey,
      final boolean filteredAreUncalled,
      final boolean mergeInfoWithMaxAC) {
    if (unsortedVCs == null || unsortedVCs.size() == 0) return null;

    if (annotateOrigin && priorityListOfVCs == null)
      throw new IllegalArgumentException(
          "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts");

    if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE)
      verifyUniqueSampleNames(unsortedVCs);

    List<VariantContext> prepaddedVCs =
        sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
    // Make sure all variant contexts are padded with reference base in case of indels if necessary
    List<VariantContext> VCs = new ArrayList<VariantContext>();

    for (VariantContext vc : prepaddedVCs) {
      // also a reasonable place to remove filtered calls, if needed
      if (!filteredAreUncalled || vc.isNotFiltered())
        VCs.add(createVariantContextWithPaddedAlleles(vc, false));
    }
    if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled
    return null;

    // establish the baseline info from the first VC
    final VariantContext first = VCs.get(0);
    final String name = first.getSource();
    final Allele refAllele = determineReferenceAllele(VCs);

    final Set<Allele> alleles = new LinkedHashSet<Allele>();
    final Set<String> filters = new TreeSet<String>();
    final Map<String, Object> attributes = new TreeMap<String, Object>();
    final Set<String> inconsistentAttributes = new HashSet<String>();
    final Set<String> variantSources =
        new HashSet<
            String>(); // contains the set of sources we found in our set of VCs that are variant
    final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id

    GenomeLoc loc = getLocation(genomeLocParser, first);
    int depth = 0;
    int maxAC = -1;
    final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>();
    double log10PError = 1;
    VariantContext vcWithMaxAC = null;
    GenotypesContext genotypes = GenotypesContext.create();

    // counting the number of filtered and variant VCs
    int nFiltered = 0;

    boolean remapped = false;

    // cycle through and add info from the other VCs, making sure the loc/reference matches

    for (VariantContext vc : VCs) {
      if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) )
      throw new ReviewedStingException(
            "BUG: attempting to merge VariantContexts with different start sites: first="
                + first.toString()
                + " second="
                + vc.toString());

      if (getLocation(genomeLocParser, vc).size() > loc.size())
        loc = getLocation(genomeLocParser, vc); // get the longest location

      nFiltered += vc.isFiltered() ? 1 : 0;
      if (vc.isVariant()) variantSources.add(vc.getSource());

      AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
      remapped = remapped || alleleMapping.needsRemapping();

      alleles.addAll(alleleMapping.values());

      mergeGenotypes(
          genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);

      log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1);

      filters.addAll(vc.getFilters());

      //
      // add attributes
      //
      // special case DP (add it up) and ID (just preserve it)
      //
      if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
        depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
      if (vc.hasID()) rsIDs.add(vc.getID());
      if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
        String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
        // lets see if the string contains a , separator
        if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
          List<String> alleleCountArray =
              Arrays.asList(
                  rawAlleleCounts
                      .substring(1, rawAlleleCounts.length() - 1)
                      .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
          for (String alleleCount : alleleCountArray) {
            final int ac = Integer.valueOf(alleleCount.trim());
            if (ac > maxAC) {
              maxAC = ac;
              vcWithMaxAC = vc;
            }
          }
        } else {
          final int ac = Integer.valueOf(rawAlleleCounts);
          if (ac > maxAC) {
            maxAC = ac;
            vcWithMaxAC = vc;
          }
        }
      }

      for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
        String key = p.getKey();
        // if we don't like the key already, don't go anywhere
        if (!inconsistentAttributes.contains(key)) {
          boolean alreadyFound = attributes.containsKey(key);
          Object boundValue = attributes.get(key);
          boolean boundIsMissingValue =
              alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);

          if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) {
            // we found the value but we're inconsistent, put it in the exclude list
            // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue,
            // p.getValue());
            inconsistentAttributes.add(key);
            attributes.remove(key);
          } else if (!alreadyFound || boundIsMissingValue) { // no value
            // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(),
            // p.getValue());
            attributes.put(key, p.getValue());
          }
        }
      }
    }

    // if we have more alternate alleles in the merged VC than in one or more of the
    // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well
    // as allele-dependent attributes like AC,AF
    for (VariantContext vc : VCs) {
      if (vc.alleles.size() == 1) continue;
      if (hasPLIncompatibleAlleles(alleles, vc.alleles)) {
        if (!genotypes.isEmpty())
          logger.warn(
              String.format(
                  "Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
                  genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
        genotypes = stripPLs(genotypes);
        // this will remove stale AC,AF attributed from vc
        calculateChromosomeCounts(vc, attributes, true);
        break;
      }
    }

    // take the VC with the maxAC and pull the attributes into a modifiable map
    if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
      attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
    }

    // if at least one record was unfiltered and we want a union, clear all of the filters
    if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
            && nFiltered != VCs.size())
        || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear();

    if (annotateOrigin) { // we care about where the call came from
      String setValue;
      if (nFiltered == 0
          && variantSources.size() == priorityListOfVCs.size()) // nothing was unfiltered
      setValue = MERGE_INTERSECTION;
      else if (nFiltered == VCs.size()) // everything was filtered out
      setValue = MERGE_FILTER_IN_ALL;
      else if (variantSources.isEmpty()) // everyone was reference
      setValue = MERGE_REF_IN_ALL;
      else {
        LinkedHashSet<String> s = new LinkedHashSet<String>();
        for (VariantContext vc : VCs)
          if (vc.isVariant())
            s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource());
        setValue = Utils.join("-", s);
      }

      if (setKey != null) {
        attributes.put(setKey, setValue);
        if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
          attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource());
        }
      }
    }

    if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));

    final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

    final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
    builder.loc(loc.getContig(), loc.getStart(), loc.getStop());
    builder.alleles(alleles);
    builder.genotypes(genotypes);
    builder.log10PError(log10PError);
    builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);

    // Trim the padded bases of all alleles if necessary
    VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
    if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged);
    return merged;
  }
Beispiel #22
0
 @Requires("that != null")
 public boolean endsAt(GenomeLoc that) {
   return (this.compareContigs(that) == 0) && (this.getStop() == that.getStop());
 }
Beispiel #23
0
 @Requires({"locFirst != null", "locSecond != null", "locSecond.isPast(locFirst)"})
 @Ensures("result >= 0")
 private static int distanceFirstStopToSecondStart(GenomeLoc locFirst, GenomeLoc locSecond) {
   return locSecond.getStart() - locFirst.getStop();
 }
Beispiel #24
0
 /**
  * Tests whether this contig is completely after contig 'that'.
  *
  * @param that Contig to test against.
  * @return true if this contig starts after 'that' ends; false if this is completely before or
  *     overlaps 'that'.
  */
 @Requires("that != null")
 public final boolean isPast(GenomeLoc that) {
   int comparison = this.compareContigs(that);
   return (comparison == 1 || (comparison == 0 && this.getStart() > that.getStop()));
 }
  /**
   * Main entry function to calculate genotypes of a given VC with corresponding GL's
   *
   * @param tracker Tracker
   * @param refContext Reference context
   * @param rawContext Raw context
   * @param stratifiedContexts Stratified alignment contexts
   * @param vc Input VC
   * @param model GL calculation model
   * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc
   * @return VC with assigned genotypes
   */
  public VariantCallContext calculateGenotypes(
      final RefMetaDataTracker tracker,
      final ReferenceContext refContext,
      final AlignmentContext rawContext,
      Map<String, AlignmentContext> stratifiedContexts,
      final VariantContext vc,
      final GenotypeLikelihoodsCalculationModel.Model model,
      final boolean inheritAttributesFromInputVC,
      final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap>
          perReadAlleleLikelihoodMap) {

    boolean limitedContext =
        tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;

    // initialize the data for this thread if that hasn't been done yet
    if (afcm.get() == null) {
      afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
    }

    // estimate our confidence in a reference call and return
    if (vc.getNSamples() == 0) {
      if (limitedContext) return null;
      return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
          ? estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0)
          : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
    }

    AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));

    // is the most likely frequency conformation AC=0 for all alternate alleles?
    boolean bestGuessIsRef = true;

    // determine which alternate alleles have AF>0
    final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
    final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size());
    myAlleles.add(vc.getReference());
    for (int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++) {
      final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
      if (alternateAllele.isReference()) continue;

      // we are non-ref if the probability of being non-ref > the emit confidence.
      // the emit confidence is phred-scaled, say 30 => 10^-3.
      // the posterior AF > 0 is log10: -5 => 10^-5
      // we are non-ref if 10^-5 < 10^-3 => -5 < -3
      final boolean isNonRef =
          AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0);

      // if the most likely AC is not 0, then this is a good alternate allele to use
      if (isNonRef) {
        myAlleles.add(alternateAllele);
        alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
        bestGuessIsRef = false;
      }
      // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
      else if (UAC.GenotypingMode
          == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
        myAlleles.add(alternateAllele);
        alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
      }
    }

    final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());

    // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
    final double phredScaledConfidence =
        Math.abs(
            !bestGuessIsRef
                    || UAC.GenotypingMode
                        == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE
                            .GENOTYPE_GIVEN_ALLELES
                ? -10 * AFresult.getLog10PosteriorOfAFEq0()
                : -10 * AFresult.getLog10PosteriorOfAFGT0());

    // return a null call if we don't pass the confidence cutoff or the most likely allele frequency
    // is zero
    if (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES
        && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef)) {
      // technically, at this point our confidence in a reference call isn't accurately estimated
      //  because it didn't take into account samples with no data, so let's get a better estimate
      return limitedContext
          ? null
          : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
    }

    // start constructing the resulting VC
    final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
    final VariantContextBuilder builder =
        new VariantContextBuilder(
            "UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
    builder.log10PError(phredScaledConfidence / -10.0);
    if (!passesCallThreshold(phredScaledConfidence)) builder.filters(filter);

    // create the genotypes
    final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true, ploidy);
    builder.genotypes(genotypes);

    // print out stats if we have a writer
    if (verboseWriter != null && !limitedContext)
      printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);

    // *** note that calculating strand bias involves overwriting data structures, so we do that
    // last
    final HashMap<String, Object> attributes = new HashMap<String, Object>();

    // inherit attributed from input vc if requested
    if (inheritAttributesFromInputVC) attributes.putAll(vc.getAttributes());
    // if the site was downsampled, record that fact
    if (!limitedContext && rawContext.hasPileupBeenDownsampled())
      attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);

    if (UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED)
      attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());

    // add the MLE AC and AF annotations
    if (alleleCountsofMLE.size() > 0) {
      attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
      final int AN = builder.make().getCalledChrCount();
      final ArrayList<Double> MLEfrequencies = new ArrayList<Double>(alleleCountsofMLE.size());
      // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT
      // is ./. but the exact model may arbitrarily choose an AC>1)
      for (int AC : alleleCountsofMLE) MLEfrequencies.add(Math.min(1.0, (double) AC / (double) AN));
      attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
    }

    if (UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef) {
      // final boolean DEBUG_SLOD = false;

      // the overall lod
      // double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
      double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);

      List<Allele> allAllelesToUse = builder.make().getAlleles();

      // the forward lod
      VariantContext vcForward =
          calculateLikelihoods(
              tracker,
              refContext,
              stratifiedContexts,
              AlignmentContextUtils.ReadOrientation.FORWARD,
              allAllelesToUse,
              false,
              model,
              perReadAlleleLikelihoodMap);
      AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
      // double[] normalizedLog10Posteriors =
      // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
      double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
      double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ",
      // forwardLog10PofF=" + forwardLog10PofF);

      // the reverse lod
      VariantContext vcReverse =
          calculateLikelihoods(
              tracker,
              refContext,
              stratifiedContexts,
              AlignmentContextUtils.ReadOrientation.REVERSE,
              allAllelesToUse,
              false,
              model,
              perReadAlleleLikelihoodMap);
      AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
      // normalizedLog10Posteriors =
      // MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
      double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
      double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
      // if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ",
      // reverseLog10PofF=" + reverseLog10PofF);

      double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
      double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
      // if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" +
      // reverseLod);

      // strand score is max bias between forward and reverse strands
      double strandScore = Math.max(forwardLod, reverseLod);
      // rescale by a factor of 10
      strandScore *= 10.0;
      // logger.debug(String.format("SLOD=%f", strandScore));

      if (!Double.isNaN(strandScore)) attributes.put("SB", strandScore);
    }

    // finish constructing the resulting VC
    builder.attributes(attributes);
    VariantContext vcCall = builder.make();

    // if we are subsetting alleles (either because there were too many or because some were not
    // polymorphic)
    // then we may need to trim the alleles (because the original VariantContext may have had to pad
    // at the end).
    if (myAlleles.size() != vc.getAlleles().size()
        && !limitedContext) // limitedContext callers need to handle allele trimming on their own to
                            // keep their perReadAlleleLikelihoodMap alleles in sync
    vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);

    if (annotationEngine != null
        && !limitedContext) { // limitedContext callers need to handle annotations on their own by
                              // calling their own annotationEngine
      // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
      final ReadBackedPileup pileup = rawContext.getBasePileup();
      stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

      vcCall =
          annotationEngine.annotateContext(
              tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
    }

    return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
  }
  public Allele getLikelihoods(
      RefMetaDataTracker tracker,
      ReferenceContext ref,
      Map<String, AlignmentContext> contexts,
      AlignmentContextUtils.ReadOrientation contextType,
      GenotypePriors priors,
      Map<String, MultiallelicGenotypeLikelihoods> GLs,
      Allele alternateAlleleToUse,
      boolean useBAQedPileup) {

    if (tracker == null) return null;

    GenomeLoc loc = ref.getLocus();
    Allele refAllele, altAllele;
    VariantContext vc = null;

    if (!ref.getLocus().equals(lastSiteVisited)) {
      // starting a new site: clear allele list
      alleleList.clear();
      lastSiteVisited = ref.getLocus();
      indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
      haplotypeMap.clear();

      if (getAlleleListFromVCF) {
        for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) {
          if (vc_input != null
              && allowableTypes.contains(vc_input.getType())
              && ref.getLocus().getStart() == vc_input.getStart()) {
            vc = vc_input;
            break;
          }
        }
        // ignore places where we don't have a variant
        if (vc == null) return null;

        alleleList.clear();
        if (ignoreSNPAllelesWhenGenotypingIndels) {
          // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore
          // it and don't genotype it
          for (Allele a : vc.getAlleles())
            if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
              continue;
            else alleleList.add(a);

        } else {
          for (Allele a : vc.getAlleles()) alleleList.add(a);
        }

      } else {
        alleleList = computeConsensusAlleles(ref, contexts, contextType);
        if (alleleList.isEmpty()) return null;
      }
    }
    // protect against having an indel too close to the edge of a contig
    if (loc.getStart() <= HAPLOTYPE_SIZE) return null;

    // check if there is enough reference window to create haplotypes (can be an issue at end of
    // contigs)
    if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) return null;
    if (!(priors instanceof DiploidIndelGenotypePriors))
      throw new StingException(
          "Only diploid-based Indel priors are supported in the DINDEL GL model");

    if (alleleList.isEmpty()) return null;

    refAllele = alleleList.get(0);
    altAllele = alleleList.get(1);

    // look for alt allele that has biggest length distance to ref allele
    int maxLenDiff = 0;
    for (Allele a : alleleList) {
      if (a.isNonReference()) {
        int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
        if (lenDiff > maxLenDiff) {
          maxLenDiff = lenDiff;
          altAllele = a;
        }
      }
    }

    final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
    final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1;
    final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;

    haplotypeMap =
        Haplotype.makeHaplotypeListFromAlleles(
            alleleList, loc.getStart(), ref, hsize, numPrefBases);

    // For each sample, get genotype likelihoods based on pileup
    // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with
    // them.
    // initialize the GenotypeLikelihoods
    GLs.clear();

    for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
      AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

      ReadBackedPileup pileup = null;
      if (context.hasExtendedEventPileup()) pileup = context.getExtendedEventPileup();
      else if (context.hasBasePileup()) pileup = context.getBasePileup();

      if (pileup != null) {
        final double[] genotypeLikelihoods =
            pairModel.computeReadHaplotypeLikelihoods(
                pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());

        GLs.put(
            sample.getKey(),
            new MultiallelicGenotypeLikelihoods(
                sample.getKey(), alleleList, genotypeLikelihoods, getFilteredDepth(pileup)));

        if (DEBUG) {
          System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
          for (int k = 0; k < genotypeLikelihoods.length; k++)
            System.out.format("%1.4f ", genotypeLikelihoods[k]);
          System.out.println();
        }
      }
    }

    return refAllele;
  }
Beispiel #27
0
  @Requires("that != null")
  public final List<GenomeLoc> subtract(final GenomeLoc that) {
    if (GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
      if (!GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
        throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
      return Arrays.asList(UNMAPPED);
    }

    if (!(this.overlapsP(that))) {
      throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap");
    }

    if (equals(that)) {
      return Collections.emptyList();
    } else if (containsP(that)) {
      List<GenomeLoc> l = new ArrayList<GenomeLoc>(2);

      /**
       * we have to create two new region, one for the before part, one for the after The old
       * region: |----------------- old region (g) -------------| |----- to delete (e) ------|
       *
       * <p>product (two new regions): |------| + |--------|
       */
      int afterStop = this.getStop(), afterStart = that.getStop() + 1;
      int beforeStop = that.getStart() - 1, beforeStart = this.getStart();
      if (afterStop - afterStart >= 0) {
        GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop);
        l.add(after);
      }
      if (beforeStop - beforeStart >= 0) {
        GenomeLoc before =
            new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop);
        l.add(before);
      }

      return l;
    } else if (that.containsP(this)) {
      /**
       * e completely contains g, delete g, but keep looking, there may be more regions i.e.:
       * |--------------------- e --------------------| |--- g ---| |---- others ----|
       */
      return Collections.emptyList(); // don't need to do anything
    } else {
      /**
       * otherwise e overlaps some part of g
       *
       * <p>figure out which region occurs first on the genome. I.e., is it: |------------- g
       * ----------| |------------- e ----------|
       *
       * <p>or: |------------- g ----------| |------------ e -----------|
       */
      GenomeLoc n;
      if (that.getStart() < this.getStart()) {
        n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop());
      } else {
        n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1);
      }

      // replace g with the new region
      return Arrays.asList(n);
    }
  }