Beispiel #1
0
  // protected basic manipulation routines
  /**
   * Copies the supplied alleles into a new list in which the reference allele, wherever it
   * appeared in the input, is moved to index 0 and the remaining alleles keep their input order.
   *
   * @param alleles the alleles to arrange
   * @return a new list with the reference allele first
   * @throws IllegalArgumentException if two alleles are equal when compared with {@code
   *     equals(other, true)}, if more than one reference allele is present, if the collection is
   *     empty, or if no reference allele is present
   */
  private static List<Allele> makeAlleles(Collection<Allele> alleles) {
    final List<Allele> ordered = new ArrayList<Allele>(alleles.size());
    boolean referenceSeen = false;

    for (final Allele candidate : alleles) {
      // reject anything already collected
      for (final Allele existing : ordered) {
        if (candidate.equals(existing, true)) {
          throw new IllegalArgumentException(
              "Duplicate allele added to VariantContext: " + candidate);
        }
      }

      if (!candidate.isReference()) {
        ordered.add(candidate);
        continue;
      }

      // the reference need not come first in the input, so it is moved to the front here
      if (referenceSeen) {
        throw new IllegalArgumentException(
            "Alleles for a VariantContext must contain at most one reference allele: " + alleles);
      }
      ordered.add(0, candidate);
      referenceSeen = true;
    }

    if (ordered.isEmpty()) {
      throw new IllegalArgumentException(
          "Cannot create a VariantContext with an empty allele list");
    }

    if (ordered.get(0).isNonReference()) {
      throw new IllegalArgumentException(
          "Alleles for a VariantContext must contain at least one reference allele: " + alleles);
    }

    return ordered;
  }
Beispiel #2
0
  /**
   * Classifies a single alternate allele against the reference allele.
   *
   * <p>A symbolic alternate yields SYMBOLIC; equal lengths yield SNP (length 1) or MNP; every
   * other case yields INDEL. MIXED is never returned here because this is a pairwise comparison
   * of one alternate against the reference, and MIXED is reserved for multiple alternate alleles
   * of different types. An earlier version required one allele to be a prefix of the other before
   * calling it an INDEL, which mislabelled cases such as REF = CTTA with ALT = C,CT,CA.
   *
   * @param ref the reference allele
   * @param allele the alternate allele
   * @return the variant type of the pair
   * @throws IllegalStateException if the reference allele is symbolic
   */
  private static Type typeOfBiallelicVariant(Allele ref, Allele allele) {
    if (ref.isSymbolic()) {
      throw new IllegalStateException(
          "Unexpected error: encountered a record with a symbolic reference allele");
    }

    if (allele.isSymbolic()) {
      return Type.SYMBOLIC;
    }

    final int refLength = ref.length();
    final int altLength = allele.length();

    // not symbolic and lengths differ: the only remaining possibility is an indel
    if (refLength != altLength) {
      return Type.INDEL;
    }

    return altLength == 1 ? Type.SNP : Type.MNP;
  }
  /**
   * Writes one "AFINFO" line per allele count from 0 to N (inclusive) to the verbose writer,
   * followed by the P(f>0) and Qscore summary lines and a blank line.
   *
   * <p>The reference column is whichever allele of {@code vc} reports itself as reference; the
   * alternate column is the last non-reference allele encountered, or "N/A" if there is none.
   *
   * @param pos position label printed in every line
   * @param vc variant context supplying the alleles
   * @param PofF value printed as P(f>0)
   * @param phredScaledConfidence value printed as Qscore
   * @param model model passed to {@code getAlleleFrequencyPriors}
   */
  protected void printVerboseData(
      String pos,
      VariantContext vc,
      double PofF,
      double phredScaledConfidence,
      final GenotypeLikelihoodsCalculationModel.Model model) {
    Allele reference = null;
    Allele alternate = null;
    for (final Allele candidate : vc.getAlleles()) {
      if (candidate.isReference()) {
        reference = candidate;
      } else {
        alternate = candidate;
      }
    }

    for (int count = 0; count <= N; count++) {
      final StringBuilder row = new StringBuilder("AFINFO\t");
      row.append(pos).append("\t");
      row.append(reference).append("\t");
      if (alternate == null) {
        row.append("N/A");
      } else {
        row.append(alternate);
      }
      row.append("\t");
      row.append(count + "/" + N + "\t");
      row.append(String.format("%.2f\t", ((float) count) / N));
      row.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[count]));
      verboseWriter.println(row.toString());
    }

    verboseWriter.println("P(f>0) = " + PofF);
    verboseWriter.println("Qscore = " + phredScaledConfidence);
    verboseWriter.println();
  }
Beispiel #4
0
  @Test
  public void testFixReverseComplementedGenotypes() {
    // input genotypes over the original A/C alleles
    final Allele originalRef = Allele.create("A", true);
    final Allele originalAlt = Allele.create("C", false);
    final GenotypesContext input = GenotypesContext.create(3);
    input.add(
        new GenotypeBuilder("homref").alleles(Arrays.asList(originalRef, originalRef)).make());
    input.add(new GenotypeBuilder("het").alleles(Arrays.asList(originalRef, originalAlt)).make());
    input.add(
        new GenotypeBuilder("homvar").alleles(Arrays.asList(originalAlt, originalAlt)).make());

    // the same genotypes expressed over the reverse-complemented T/G alleles
    final Allele flippedRef = Allele.create("T", true);
    final Allele flippedAlt = Allele.create("G", false);
    final GenotypesContext wanted = GenotypesContext.create(3);
    wanted.add(
        new GenotypeBuilder("homref").alleles(Arrays.asList(flippedRef, flippedRef)).make());
    wanted.add(new GenotypeBuilder("het").alleles(Arrays.asList(flippedRef, flippedAlt)).make());
    wanted.add(
        new GenotypeBuilder("homvar").alleles(Arrays.asList(flippedAlt, flippedAlt)).make());

    final Map<Allele, Allele> originalToFlipped = new HashMap<Allele, Allele>(2);
    originalToFlipped.put(originalRef, flippedRef);
    originalToFlipped.put(originalAlt, flippedAlt);

    final GenotypesContext produced = LiftoverVcf.fixGenotypes(input, originalToFlipped);

    // both alleles of every sample must have been remapped
    for (final String sampleName : Arrays.asList("homref", "het", "homvar")) {
      final List<Allele> wantedAlleles = wanted.get(sampleName).getAlleles();
      final List<Allele> producedAlleles = produced.get(sampleName).getAlleles();
      Assert.assertEquals(wantedAlleles.get(0), producedAlleles.get(0));
      Assert.assertEquals(wantedAlleles.get(1), producedAlleles.get(1));
    }
  }
Beispiel #5
0
  /**
   * Emits one call-report element per piece of information about a call, then flushes the report.
   *
   * <p>Elements are written in this order: the variant type, each allele keyed by its index, each
   * genotype's likelihood string keyed by sample name, each prior keyed by its index, the runtime,
   * the two posteriors, and finally the MLE count and per-allele posterior for every non-reference
   * allele used in genotyping. Elements that have no natural key use the literal key "ignore".
   *
   * @param vc the variant context being reported
   * @param log10AlleleFrequencyPriors priors written as "priorI" elements
   * @param runtimeNano value written as the "runtime.nano" element
   * @param result calculation result supplying posteriors and per-allele values
   */
  protected final void printCallInfo(
      final VariantContext vc,
      final double[] log10AlleleFrequencyPriors,
      final long runtimeNano,
      final AFCalcResult result) {
    printCallElement(vc, "type", "ignore", vc.getType());

    int alleleIndex = 0;
    for (final Allele current : vc.getAlleles()) {
      printCallElement(vc, "allele", alleleIndex, current.getDisplayString());
      alleleIndex++;
    }

    for (final Genotype genotype : vc.getGenotypes()) {
      printCallElement(vc, "PL", genotype.getSampleName(), genotype.getLikelihoodsString());
    }

    final int priorCount = log10AlleleFrequencyPriors.length;
    for (int idx = 0; idx < priorCount; idx++) {
      printCallElement(vc, "priorI", idx, log10AlleleFrequencyPriors[idx]);
    }

    printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
    printCallElement(vc, "log10PosteriorOfAFEq0", "ignore", result.getLog10PosteriorOfAFEq0());
    printCallElement(vc, "log10PosteriorOfAFGt0", "ignore", result.getLog10PosteriorOfAFGT0());

    for (final Allele used : result.getAllelesUsedInGenotyping()) {
      if (!used.isNonReference()) {
        continue;
      }
      printCallElement(vc, "MLE", used, result.getAlleleCountAtMLE(used));
      printCallElement(
          vc, "pNonRefByAllele", used, result.getLog10PosteriorOfAFGt0ForAllele(used));
    }

    callReport.flush();
  }
Beispiel #6
0
 /**
  * Reports whether any allele returned by {@code getAlleles()} is symbolic.
  *
  * @return true if at least one allele is symbolic, false otherwise
  */
 public boolean hasSymbolicAlleles() {
   boolean symbolicSeen = false;
   for (final Allele candidate : getAlleles()) {
     if (candidate.isSymbolic()) {
       symbolicSeen = true;
       break;
     }
   }
   return symbolicSeen;
 }
Beispiel #7
0
  /**
   * Counts, across all genotypes, the alleles that are not no-calls.
   *
   * @return the number of called chromosomes
   */
  public int getCalledChrCount() {
    int called = 0;

    for (final Genotype genotype : getGenotypes()) {
      for (final Allele allele : genotype.getAlleles()) {
        if (!allele.isNoCall()) {
          called++;
        }
      }
    }

    return called;
  }
Beispiel #8
0
  /**
   * Returns the PL indices for the genotypes formed from the reference allele (index 0) and the
   * given alternate allele.
   *
   * <p>The alternate allele's index is its 1-based position among {@code getAlternateAlleles()}.
   * Previously, a target that was not among the alternate alleles silently produced an index one
   * past the last alternate allele; that case is now rejected.
   *
   * @param targetAllele the alternate allele to look up
   * @return the PL indices as returned by {@code GenotypeLikelihoods.getPLIndecesOfAlleles(0,
   *     index)}
   * @throws IllegalArgumentException if {@code targetAllele} is not one of the alternate alleles
   */
  public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {

    int index = 1;
    boolean found = false;
    for (Allele allele : getAlternateAlleles()) {
      if (allele.equals(targetAllele)) {
        found = true;
        break;
      }
      index++;
    }

    if (!found) {
      throw new IllegalArgumentException(
          "Allele " + targetAllele + " is not an alternate allele of this VariantContext");
    }

    return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
  }
Beispiel #9
0
  /**
   * Reports whether the given allele is part of this context.
   *
   * @param allele the allele to look for
   * @param ignoreRefState passed through to {@code Allele.equals(other, ignoreRefState)}
   * @return true if the allele is the very same object as the cached REF or ALT, or equals one of
   *     the alleles returned by {@code getAlleles()}
   */
  public boolean hasAllele(Allele allele, boolean ignoreRefState) {
    // optimization for cached cases: identity match against the cached alleles
    if (allele == REF) {
      return true;
    }
    if (allele == ALT) {
      return true;
    }

    boolean found = false;
    for (final Allele candidate : getAlleles()) {
      if (candidate.equals(allele, ignoreRefState)) {
        found = true;
        break;
      }
    }

    return found;
  }
  /**
   * Converts the features bound to {@code variants} at the current locus into variant contexts.
   *
   * <p>Records that cannot be converted are skipped. HapMap records are special-cased because
   * indels aren't handled correctly: when either HapMap allele is the null-allele string, the
   * matching dbSNP record is used to decide which allele is the reference and to reposition the
   * record. Such a record is skipped when there is no dbSNP record, when the dbSNP record is
   * mixed, or when the repositioned start lies before the reference window.
   *
   * @param tracker source of the features
   * @param ref reference context for the current locus
   * @return the variant contexts that could be built (possibly empty)
   */
  private Collection<VariantContext> getVariantContexts(
      RefMetaDataTracker tracker, ReferenceContext ref) {

    final List<Feature> features = tracker.getValues(variants, ref.getLocus());
    final List<VariantContext> converted = new ArrayList<VariantContext>(features.size());

    for (final Feature record : features) {
      if (!VariantContextAdaptors.canBeConvertedToVariantContext(record)) {
        continue;
      }

      if (record instanceof RawHapMapFeature) {
        final RawHapMapFeature hapmap = (RawHapMapFeature) record;

        final boolean looksLikeIndel =
            hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING)
                || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING);

        if (looksLikeIndel) {
          // the dbSNP record is what distinguishes an insertion from a deletion
          final VariantContext dbsnpVC = getDbsnp(hapmap.getName());
          if (dbsnpVC == null || dbsnpVC.isMixed()) {
            continue;
          }

          final boolean isInsertion = dbsnpVC.isSimpleInsertion();

          final Map<String, Allele> actualAlleles = new HashMap<String, Allele>(2);
          actualAlleles.put(
              RawHapMapFeature.DELETION, Allele.create(ref.getBase(), isInsertion));
          actualAlleles.put(
              RawHapMapFeature.INSERTION,
              Allele.create((char) ref.getBase() + hapmap.getAlleles()[1], !isInsertion));
          hapmap.setActualAlleles(actualAlleles);

          // also, use the correct positioning for insertions
          hapmap.updatePosition(dbsnpVC.getStart());

          if (hapmap.getStart() < ref.getWindow().getStart()) {
            logger.warn(
                "Hapmap record at "
                    + ref.getLocus()
                    + " represents an indel too large to be converted; skipping...");
            continue;
          }
        }
      }

      // the record may still fail to convert: sometimes the track has odd stuff in it
      final VariantContext vc =
          VariantContextAdaptors.toVariantContext(variants.getName(), record, ref);
      if (vc != null) {
        converted.add(vc);
      }
    }

    return converted;
  }
Beispiel #11
0
  /**
   * Verifies the allele set and its consistency with the start/stop span of this context.
   *
   * @throws IllegalArgumentException if more than one allele is tagged as reference, if any allele
   *     is a no-call, if more than one allele is null, or if no reference allele is present
   * @throws IllegalStateException if the span ({@code stop - start + 1}) is not 1 for a null
   *     reference allele, or exceeds the reference allele length by more than 1 for a non-null
   *     reference allele
   */
  private void validateAlleles() {
    boolean refSeen = false;
    boolean nullSeen = false;

    for (final Allele current : alleles) {
      // at most one allele may be tagged as reference
      if (current.isReference()) {
        if (refSeen) {
          throw new IllegalArgumentException(
              "BUG: Received two reference tagged alleles in VariantContext "
                  + alleles
                  + " this="
                  + this);
        }
        refSeen = true;
      }

      if (current.isNoCall()) {
        throw new IllegalArgumentException(
            "BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this);
      }

      // at most one allele may be null
      if (current.isNull()) {
        if (nullSeen) {
          throw new IllegalArgumentException(
              "BUG: Received two null alleles in VariantContext " + alleles + " this=" + this);
        }
        nullSeen = true;
      }
    }

    // exactly one reference allele is required
    if (!refSeen) {
      throw new IllegalArgumentException("No reference allele found in VariantContext");
    }

    // the span of the location must agree with the reference allele
    final long span = (stop - start) + 1;
    final Allele reference = getReference();
    if ((reference.isNull() && span != 1)
        || (reference.isNonNull() && (span - reference.length() > 1))) {
      throw new IllegalStateException(
          "BUG: GenomeLoc "
              + contig
              + ":"
              + start
              + "-"
              + stop
              + " has a size == "
              + span
              + " but the variation reference allele has length "
              + reference.length()
              + " this = "
              + this);
    }
  }
Beispiel #12
0
  /**
   * Computes, for every alternate allele, its length minus the reference allele's length.
   *
   * @return one length difference per alternate allele, or null if this context's type is neither
   *     INDEL nor MIXED
   */
  public List<Integer> getIndelLengths() {
    final boolean indelOrMixed = getType() == Type.INDEL || getType() == Type.MIXED;
    if (!indelOrMixed) {
      return null;
    }

    final List<Integer> differences = new ArrayList<Integer>();
    for (final Allele alternate : getAlternateAlleles()) {
      final int delta = alternate.length() - getReference().length();
      differences.add(delta);
    }

    return differences;
  }
Beispiel #13
0
  /**
   * Verifies that every called allele of every available genotype belongs to this context.
   *
   * @throws IllegalStateException if the genotypes are null, or if an available genotype carries a
   *     called allele for which {@code hasAllele} returns false
   */
  private void validateGenotypes() {
    if (this.genotypes == null) {
      throw new IllegalStateException("Genotypes is null");
    }

    for (final Genotype genotype : this.genotypes) {
      if (!genotype.isAvailable()) {
        continue;
      }

      for (final Allele carried : genotype.getAlleles()) {
        final boolean unknownToContext = !hasAllele(carried);
        if (unknownToContext && carried.isCalled()) {
          throw new IllegalStateException(
              "Allele in genotype " + carried + " not in the variant context " + alleles);
        }
      }
    }
  }
Beispiel #14
0
  /**
   * Helper for building sub-contexts: gathers the called alleles carried by the given genotypes.
   * If none of the genotype alleles is a reference allele, this context's reference allele is
   * added to the result.
   *
   * @param genotypes the genotypes to scan
   * @return the set of called alleles, always including a reference allele
   */
  private final Set<Allele> allelesOfGenotypes(Collection<Genotype> genotypes) {
    final Set<Allele> collected = new HashSet<Allele>();
    boolean referenceSeen = false;

    for (final Genotype genotype : genotypes) {
      for (final Allele carried : genotype.getAlleles()) {
        if (!referenceSeen && carried.isReference()) {
          referenceSeen = true;
        }
        if (carried.isCalled()) {
          collected.add(carried);
        }
      }
    }

    if (!referenceSeen) {
      collected.add(getReference());
    }

    return collected;
  }
 /**
  * Creates a test case from a reference string and allele strings.
  *
  * @param expectedClip the clip position the test expects
  * @param ref the reference string
  * @param alleles the allele strings; each is turned into an allele via {@code Allele.create}
  */
 private ReverseClippingPositionTestProvider(
     final int expectedClip, final String ref, final String... alleles) {
   super(ReverseClippingPositionTestProvider.class);
   this.ref = ref;
   for (int i = 0; i < alleles.length; i++) {
     this.alleles.add(Allele.create(alleles[i]));
   }
   this.expectedClip = expectedClip;
 }
  /**
   * Builds a call context that carries no confident call for the current site.
   *
   * <p>In GENOTYPE_GIVEN_ALLELES mode the context spans and carries the alleles of the input
   * record obtained from the alleles rod. Otherwise it is a single-base context holding only the
   * reference allele. If an annotation engine is configured, the context is annotated using the
   * raw pileup split by sample name.
   *
   * @param tracker meta-data tracker for the site
   * @param ref reference context for the site
   * @param stratifiedContexts replaced by the raw per-sample contexts when annotating
   * @param rawContext the raw alignment context
   * @return the new call context, or null if there is no input record in given-alleles mode or
   *     the reference base is not an acceptable allele base
   */
  private VariantCallContext generateEmptyContext(
      RefMetaDataTracker tracker,
      ReferenceContext ref,
      Map<String, AlignmentContext> stratifiedContexts,
      AlignmentContext rawContext) {
    final boolean givenAllelesMode =
        UAC.GenotypingMode
            == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;

    VariantContext vc;
    if (!givenAllelesMode) {
      // deal with bad/non-standard reference bases
      final byte refBase = ref.getBase();
      if (!Allele.acceptableAlleleBases(new byte[] {refBase})) {
        return null;
      }

      final Set<Allele> refOnly = new HashSet<Allele>();
      refOnly.add(Allele.create(refBase, true));
      vc =
          new VariantContextBuilder(
                  "UG_call",
                  ref.getLocus().getContig(),
                  ref.getLocus().getStart(),
                  ref.getLocus().getStart(),
                  refOnly)
              .make();
    } else {
      final VariantContext given =
          UnifiedGenotyperEngine.getVCFromAllelesRod(
              tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
      if (given == null) {
        return null;
      }
      vc =
          new VariantContextBuilder(
                  "UG_call",
                  ref.getLocus().getContig(),
                  given.getStart(),
                  given.getEnd(),
                  given.getAlleles())
              .make();
    }

    if (annotationEngine != null) {
      // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
      final ReadBackedPileup rawPileup = rawContext.getBasePileup();
      stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawPileup);

      vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc);
    }

    return new VariantCallContext(vc, false);
  }
  /**
   * Compares two allele collections position by position in iteration order.
   *
   * @param alleleSet1 first allele collection
   * @param alleleSet2 second allele collection
   * @return true if the collections differ at some position or have different sizes; false if they
   *     hold equal alleles in the same order
   */
  private static final boolean hasPLIncompatibleAlleles(
      final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
    final Iterator<Allele> first = alleleSet1.iterator();
    final Iterator<Allele> second = alleleSet2.iterator();

    for (; ; ) {
      final boolean firstHasMore = first.hasNext();
      final boolean secondHasMore = second.hasNext();

      if (!firstHasMore || !secondHasMore) {
        // everything compared so far matched; leftovers on either side mean the sizes differ
        return firstHasMore || secondHasMore;
      }

      final Allele left = first.next();
      final Allele right = second.next();
      if (!left.equals(right)) {
        return true;
      }
    }
  }
Beispiel #18
0
  /**
   * Reports whether some sample carries, at the same allele position in both contexts, a
   * non-reference allele at each site.
   *
   * <p>The alleles of the two genotypes are walked in parallel; the original code notes this is
   * safe because of a prior allSamplesAreMergeable() check.
   *
   * @param vc1 first variant context
   * @param vc2 second variant context, queried by the sample names of {@code vc1}
   * @return true if any pair of corresponding alleles is non-reference at both sites
   */
  static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
    for (final Genotype firstGenotype : vc1.getGenotypes()) {
      final Genotype secondGenotype = vc2.getGenotype(firstGenotype.getSampleName());

      final List<Allele> firstAlleles = firstGenotype.getAlleles();
      final List<Allele> secondAlleles = secondGenotype.getAlleles();

      final Iterator<Allele> firstIt = firstAlleles.iterator();
      final Iterator<Allele> secondIt = secondAlleles.iterator();
      while (firstIt.hasNext()) {
        final Allele atFirstSite = firstIt.next();
        final Allele atSecondSite = secondIt.next();

        // corresponding alleles are both alternate
        if (atFirstSite.isNonReference() && atSecondSite.isNonReference()) {
          return true;
        }
      }
    }

    return false;
  }
    /**
     * Creates a repeat-detection test case.
     *
     * @param isTrueRepeat whether the case is expected to be a true repeat
     * @param ref reference string; a dummy "N" base is prepended for the event
     * @param refAlleleString bases of the reference allele
     * @param altAlleleStrings bases of each alternate allele
     */
    private RepeatDetectorTest(
        boolean isTrueRepeat, String ref, String refAlleleString, String... altAlleleStrings) {
      super(RepeatDetectorTest.class);
      this.ref = "N" + ref; // add a dummy base for the event here
      this.isTrueRepeat = isTrueRepeat;

      final Allele reference = Allele.create(refAlleleString, true);
      final List<Allele> allAlleles = new LinkedList<Allele>();
      allAlleles.add(reference);
      for (int i = 0; i < altAlleleStrings.length; i++) {
        allAlleles.add(Allele.create(altAlleleStrings[i], false));
      }

      final int stop = 1 + reference.length();
      this.vc = new VariantContextBuilder("test", "chr1", 1, stop, allAlleles).make();
    }
Beispiel #20
0
    /**
     * Returns the merged allele for a pair of alleles, creating and caching it on first use.
     *
     * <p>The merged bases are the bases of {@code all1}, then {@code intermediateLength} bases
     * (copied from {@code intermediateBases} when that array is non-null, otherwise left at their
     * zero default), then the bases of {@code all2}.
     *
     * @param all1 allele at the first site
     * @param all2 allele at the second site
     * @param creatingReferenceForFirstTime reference flag given to a newly created merged allele
     * @return the cached or newly created merged allele
     */
    private Allele ensureMergedAllele(
        Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
      final AlleleOneAndTwo pair = new AlleleOneAndTwo(all1, all2);

      final Allele cached = mergedAlleles.get(pair);
      if (cached != null) {
        return cached;
      }

      final byte[] left = all1.getBases();
      final byte[] right = all2.getBases();
      final int rightOffset = left.length + intermediateLength;

      final byte[] joined = new byte[rightOffset + right.length];
      System.arraycopy(left, 0, joined, 0, left.length);
      if (intermediateBases != null) {
        System.arraycopy(intermediateBases, 0, joined, left.length, intermediateLength);
      }
      System.arraycopy(right, 0, joined, rightOffset, right.length);

      final Allele created = Allele.create(joined, creatingReferenceForFirstTime);
      mergedAlleles.put(pair, created);
      return created;
    }
  /**
   * Returns a context identical to this with the REF and ALT alleles reverse complemented.
   *
   * <p>No-call and null alleles are kept as they are. Genotype alleles are remapped through the
   * same original-to-new mapping; a genotype allele with no mapping becomes {@code
   * Allele.NO_CALL}.
   *
   * <p>The new alleles are handed to the builder in the same order as {@code vc.getAlleles()}.
   * Earlier code passed {@code HashMap.values()}, whose iteration order is unspecified, so the
   * alleles could reach the builder in a different order than in the input context.
   *
   * @param vc variant context
   * @return new vc
   */
  public static VariantContext reverseComplement(VariantContext vc) {
    // create a mapping from original allele to reverse complemented allele, and remember the new
    // alleles in their original order
    HashMap<Allele, Allele> alleleMap = new HashMap<Allele, Allele>(vc.getAlleles().size());
    List<Allele> orderedNewAlleles = new ArrayList<Allele>(vc.getAlleles().size());
    for (Allele originalAllele : vc.getAlleles()) {
      Allele newAllele;
      if (originalAllele.isNoCall() || originalAllele.isNull()) {
        newAllele = originalAllele;
      } else {
        newAllele =
            Allele.create(
                BaseUtils.simpleReverseComplement(originalAllele.getBases()),
                originalAllele.isReference());
      }
      alleleMap.put(originalAllele, newAllele);
      orderedNewAlleles.add(newAllele);
    }

    // create new Genotype objects
    GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples());
    for (final Genotype genotype : vc.getGenotypes()) {
      List<Allele> newAlleles = new ArrayList<Allele>();
      for (Allele allele : genotype.getAlleles()) {
        Allele newAllele = alleleMap.get(allele);
        if (newAllele == null) newAllele = Allele.NO_CALL;
        newAlleles.add(newAllele);
      }
      newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles));
    }

    return new VariantContextBuilder(vc).alleles(orderedNewAlleles).genotypes(newGenotypes).make();
  }
Beispiel #22
0
  /**
   * Checks this record's reference allele, and its indel padding base if it has one, against the
   * bases observed in the reference.
   *
   * <p>Nothing is validated when {@code reference} is null. The REF comparison is skipped for
   * complex indels and for a null reference allele. The padding-base check applies whenever this
   * record has a reference base for indels.
   *
   * <p>A null {@code paddedRefBase} is reported as a mismatch through the same exception as any
   * other mismatch; previously it caused a NullPointerException while the message was being
   * formatted.
   *
   * @param reference the reference allele observed in the reference, or null to skip validation
   * @param paddedRefBase the padding base observed in the reference; may be null
   * @throws TribbleException.InternalCodecException if the REF allele or the padding base does not
   *     match
   */
  public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
    if (reference == null) return;

    // don't validate if we're a complex event
    if (!isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference())) {
      throw new TribbleException.InternalCodecException(
          String.format(
              "the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s",
              getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
    }

    // we also need to validate the padding base for simple indels
    if (hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase)) {
      // format both bases null-safely so the mismatch is always reported as a codec exception
      final String fastaBase =
          paddedRefBase == null ? "null" : String.valueOf((char) paddedRefBase.byteValue());
      final String vcfBase = String.valueOf((char) getReferenceBaseForIndel().byteValue());
      throw new TribbleException.InternalCodecException(
          String.format(
              "the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s",
              getChr(), getStart(), fastaBase, vcfBase));
    }
  }
Beispiel #23
0
  /**
   * Checks that the alleles at the two sites always occur together: every allele at site 1 is
   * paired with exactly one allele at site 2 and vice versa, across the reference alleles and the
   * corresponding genotype alleles of all samples of {@code vc1}.
   *
   * @param vc1 first variant context
   * @param vc2 second variant context, queried by the sample names of {@code vc1}
   * @return false as soon as an allele is seen paired with two different partners; true otherwise
   */
  static boolean doubleAllelesSegregatePerfectlyAmongSamples(
      VariantContext vc1, VariantContext vc2) {
    final Map<Allele, Allele> partnerAtSiteTwo = new HashMap<Allele, Allele>();
    final Map<Allele, Allele> partnerAtSiteOne = new HashMap<Allele, Allele>();

    // the two reference alleles segregate together by definition
    partnerAtSiteTwo.put(vc1.getReference(), vc2.getReference());
    partnerAtSiteOne.put(vc2.getReference(), vc1.getReference());

    // record each sample's pairings, checking consistency with everything seen so far
    for (final Genotype firstGenotype : vc1.getGenotypes()) {
      final Genotype secondGenotype = vc2.getGenotype(firstGenotype.getSampleName());

      final List<Allele> firstAlleles = firstGenotype.getAlleles();
      final List<Allele> secondAlleles = secondGenotype.getAlleles();

      final Iterator<Allele> secondIt = secondAlleles.iterator();
      for (final Allele atSiteOne : firstAlleles) {
        final Allele atSiteTwo = secondIt.next();

        final Allele knownSiteTwoPartner = partnerAtSiteTwo.get(atSiteOne);
        if (knownSiteTwoPartner == null) {
          partnerAtSiteTwo.put(atSiteOne, atSiteTwo);
        } else if (!knownSiteTwoPartner.equals(atSiteTwo)) {
          // the site-1 allele segregates with two different alleles at site 2
          return false;
        }

        final Allele knownSiteOnePartner = partnerAtSiteOne.get(atSiteTwo);
        if (knownSiteOnePartner == null) {
          partnerAtSiteOne.put(atSiteTwo, atSiteOne);
        } else if (!knownSiteOnePartner.equals(atSiteOne)) {
          // the site-2 allele segregates with two different alleles at site 1
          return false;
        }
      }
    }

    return true;
  }
 /**
  * Writes out and then discards every interval in the interval map that lies before the current
  * reference locus. For each such interval the statistics are written to the VCF and, when the
  * interval has missing loci, the missing interval is written as well.
  *
  * @param refLocus the current reference locus
  * @param refBase the reference base used to create the reference allele for the output
  */
 private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
   // removal is deferred so the map is not modified while its key set is being iterated
   final List<GenomeLoc> finished = new LinkedList<>();

   for (final GenomeLoc interval : intervalMap.keySet()) {
     if (!interval.isBefore(refLocus)) {
       continue;
     }

     final IntervalStratification stats = intervalMap.get(interval);
     outputStatsToVCF(stats, Allele.create(refBase, true));
     if (hasMissingLoci(stats)) {
       outputMissingInterval(stats);
     }
     finished.add(interval);
   }

   for (final GenomeLoc interval : finished) {
     intervalMap.remove(interval);
   }
 }
  /**
   * Picks the longest reference allele among the given contexts; on equal length the first one
   * seen is kept.
   *
   * @param VCs the variant contexts to inspect
   * @return the longest reference allele, or null if {@code VCs} is empty
   * @throws UserException.BadInput if two reference alleles have the same length but are not
   *     equal
   */
  private static Allele determineReferenceAllele(List<VariantContext> VCs) {
    Allele longest = null;

    for (final VariantContext vc : VCs) {
      final Allele candidate = vc.getReference();

      if (longest == null) {
        longest = candidate;
        continue;
      }

      final int knownLength = longest.length();
      final int candidateLength = candidate.length();

      if (knownLength < candidateLength) {
        longest = candidate;
      } else if (knownLength == candidateLength && !longest.equals(candidate)) {
        throw new UserException.BadInput(
            String.format(
                "The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s",
                vc.getChr(), vc.getStart(), longest, candidate));
      }
    }

    return longest;
  }
  /**
   * Suite-wide fixture: builds the genome loc parser from the b37 reference and creates the
   * alleles shared by the tests.
   */
  @BeforeSuite
  public void setup() {
    final File fastaFile = new File(b37KGReference);
    try {
      final IndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(fastaFile);
      genomeLocParser = new GenomeLocParser(fasta);
    } catch (FileNotFoundException missing) {
      throw new UserException.CouldNotReadInputFile(fastaFile, missing);
    }

    // reference-tagged alleles
    Aref = Allele.create("A", true);
    Cref = Allele.create("C", true);

    // alleles created without a reference flag
    T = Allele.create("T");
    C = Allele.create("C");
    ATC = Allele.create("ATC");
    ATCATC = Allele.create("ATCATC");
  }
Beispiel #27
0
  /**
   * Read in a list of ExactCall objects from reader, keeping only those with starts in startsToKeep
   * or all sites (if this is empty)
   *
   * <p>The first two lines (the header and the first "type" line) are skipped. Each later line is
   * split on tabs into location, variable, key and value. Lines are accumulated into the current
   * record until the next "type" line, which terminates that record. The final record in the file
   * has no following "type" line, so it is emitted when end of input is reached; previously it
   * was silently dropped.
   *
   * @param reader a just-opened reader sitting at the start of the file
   * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should
   *     be kept
   * @param parser a genome loc parser to create genome locs
   * @return a list of ExactCall objects in reader
   * @throws IOException if reading from {@code reader} fails
   * @throws IllegalArgumentException if any argument is null
   */
  public static List<ExactCall> readExactLog(
      final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser)
      throws IOException {
    if (reader == null) throw new IllegalArgumentException("reader cannot be null");
    if (startsToKeep == null) throw new IllegalArgumentException("startsToKeep cannot be null");
    if (parser == null) throw new IllegalArgumentException("GenomeLocParser cannot be null");

    List<ExactCall> calls = new LinkedList<ExactCall>();

    // skip the header line
    reader.readLine();

    // skip the first "type" line
    reader.readLine();

    while (true) {
      // fresh accumulators for the record that is about to be read
      final VariantContextBuilder builder = new VariantContextBuilder();
      final List<Allele> alleles = new ArrayList<Allele>();
      final List<Genotype> genotypes = new ArrayList<Genotype>();
      final double[] posteriors = new double[2];
      final double[] priors = MathUtils.normalizeFromLog10(new double[] {0.5, 0.5}, true);
      final List<Integer> mle = new ArrayList<Integer>();
      final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>();
      long runtimeNano = -1;

      GenomeLoc currentLoc = null;
      while (true) {
        final String line = reader.readLine();
        if (line == null) {
          // end of input: flush the record accumulated so far, if there is one and it is wanted
          if (currentLoc != null
              && !alleles.isEmpty()
              && (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart()))) {
            calls.add(
                buildExactCall(
                    builder,
                    currentLoc,
                    alleles,
                    genotypes,
                    mle,
                    posteriors,
                    priors,
                    log10pNonRefByAllele,
                    runtimeNano));
          }
          return calls;
        }

        final String[] parts = line.split("\t");
        final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]);
        final String variable = parts[1];
        final String key = parts[2];
        final String value = parts[3];

        // the location of a record is taken from its first line
        if (currentLoc == null) currentLoc = lineLoc;

        if (variable.equals("type")) {
          // a "type" line starts the next record, so the current one is complete
          if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) {
            calls.add(
                buildExactCall(
                    builder,
                    currentLoc,
                    alleles,
                    genotypes,
                    mle,
                    posteriors,
                    priors,
                    log10pNonRefByAllele,
                    runtimeNano));
          }
          break;
        } else if (variable.equals("allele")) {
          // the allele with key "0" is the reference allele
          final boolean isRef = key.equals("0");
          alleles.add(Allele.create(value, isRef));
        } else if (variable.equals("PL")) {
          final GenotypeBuilder gb = new GenotypeBuilder(key);
          gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs());
          genotypes.add(gb.make());
        } else if (variable.equals("log10PosteriorOfAFEq0")) {
          posteriors[0] = Double.valueOf(value);
        } else if (variable.equals("log10PosteriorOfAFGt0")) {
          posteriors[1] = Double.valueOf(value);
        } else if (variable.equals("MLE")) {
          mle.add(Integer.valueOf(value));
        } else if (variable.equals("pNonRefByAllele")) {
          final Allele a = Allele.create(key);
          log10pNonRefByAllele.put(a, Double.valueOf(value));
        } else if (variable.equals("runtime.nano")) {
          runtimeNano = Long.valueOf(value);
        } else {
          // nothing to do
        }
      }
    }
  }

  /** Assembles one ExactCall from the values accumulated for a single record of the log. */
  private static ExactCall buildExactCall(
      final VariantContextBuilder builder,
      final GenomeLoc loc,
      final List<Allele> alleles,
      final List<Genotype> genotypes,
      final List<Integer> mle,
      final double[] posteriors,
      final double[] priors,
      final Map<Allele, Double> log10pNonRefByAllele,
      final long runtimeNano) {
    builder.alleles(alleles);
    // the record spans the length of its first allele
    final int stop = loc.getStart() + alleles.get(0).length() - 1;
    builder.chr(loc.getContig()).start(loc.getStart()).stop(stop);
    builder.genotypes(genotypes);
    final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[] {}));
    final AFCalcResult result =
        new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele);
    return new ExactCall(builder.make(), runtimeNano, result);
  }
Beispiel #28
0
  /**
   * Writes one line of Beagle input for a site: the marker, the alleles of the preferred context,
   * and one set of log10 likelihoods per sample. When a markers writer is configured, the marker,
   * a running marker counter and the allele strings are also appended to it.
   *
   * <p>For each sample the genotype is taken from {@code preferredVC} if it contains the sample,
   * otherwise from {@code otherVC} (only if {@code goodSite(otherVC)} holds).
   *
   * @param preferredVC the context whose alleles are printed and whose genotypes are preferred
   * @param otherVC fallback context for samples missing from {@code preferredVC}
   * @param isValidationSite whether {@code preferredVC} is the validation context; genotypes taken
   *     from {@code otherVC} get the opposite flag
   * @param prior probability assigned to the called genotype when a genotype has no likelihoods
   * @throws StingException if a sample has a genotype in neither context
   */
  public void writeBeagleOutput(
      VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc =
        VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();

    // the marker is "contig:start" followed by a space
    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null)
      markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
    for (Allele allele : preferredVC.getAlleles()) {
      String bglPrintString;
      // no-call and null alleles are printed as "-"
      if (allele.isNoCall() || allele.isNull()) bglPrintString = "-";
      else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele

      beagleOut.append(String.format("%s ", bglPrintString));
      if (markers != null) markers.append(bglPrintString).append("\t");
    }
    if (markers != null) markers.append("\n");

    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
      boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

      Genotype genotype;
      boolean isValidation;
      // use sample as key into genotypes structure
      if (preferredGenotypes.containsSample(sample)) {
        genotype = preferredGenotypes.get(sample);
        isValidation = isValidationSite;
      } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
        genotype = otherGenotypes.get(sample);
        isValidation = !isValidationSite;
      } else {
        // there is magically no genotype for this sample.
        throw new StingException(
            "Sample "
                + sample
                + " arose with no genotype in variant or validation VCF. This should never happen.");
      }

      /*
       * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
       */
      // NOTE(review): when (isValidation && prior < 0.0) holds for a genotype without likelihoods,
      // genotype.getLikelihoods() is dereferenced without a check -- confirm it cannot be null.
      double[] log10Likelihoods = null;
      if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
        log10Likelihoods = genotype.getLikelihoods().getAsVector();

        // see if we need to randomly mask out genotype in this position.
        if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
          // we are masking out this genotype
          log10Likelihoods =
              isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }

        // NOTE(review): if the masking branch above ran, log10Likelihoods refers to
        // HAPLOID_FLAT_LOG10_LIKELIHOODS itself (apparently a shared constant), so the write
        // below modifies that array for all later uses -- confirm this is intended. Whether
        // getAsVector() returns a private copy is also not visible here.
        if (isMaleOnChrX) {
          log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
        }
      }
      // otherwise, spread the prior over the three genotypes
      // NOTE(review): !genotype.hasLikelihoods() is always true when this branch is reached,
      // because the first condition already covers genotype.hasLikelihoods().
      else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
        // hack to deal with input VCFs with no genotype likelihoods.  Just assume the called
        // genotype
        // is confident.  This is useful for Hapmap and 1KG release VCFs.
        double AA = (1.0 - prior) / 2.0;
        double AB = (1.0 - prior) / 2.0;
        double BB = (1.0 - prior) / 2.0;

        // the called genotype gets the prior; the other two keep (1 - prior) / 2
        if (genotype.isHomRef()) {
          AA = prior;
        } else if (genotype.isHet()) {
          AB = prior;
        } else if (genotype.isHomVar()) {
          BB = prior;
        }

        // for a male on chrX the heterozygous probability is forced to 0.0
        log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB});
      } else {
        // no usable information for this sample: fall back to flat likelihoods
        log10Likelihoods =
            isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
      }

      writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }

    beagleWriter.println(beagleOut.toString());
  }
/**
 * Analyze coverage distribution and validate read mates per interval and per sample
 *
 * <p>This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It
 * analyzes each sample independently and aggregates results over intervals of interest.
 * Low-coverage regions can be identified by using e.g. FindCoveredIntervals with the -uncovered
 * argument.
 *
 * <h3>Input</h3>
 *
 * <ul>
 *   <li>A reference file
 *   <li>one or more input BAMs
 *   <li>One or more intervals
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <p>A modified VCF detailing each interval by sample and information for each interval according
 * to the thresholds used. Interval information includes GC Content, average interval depth,
 * callable status among others. If you use the --missing option, you can get as a second output a
 * intervals file with the loci that have missing data. This file can then be used as input to
 * QualifyMissingIntervals for full qualification and interpretation of why the data is missing.
 *
 * <h3>Usage example</h3>
 *
 * <pre>
 *    java -jar GenomeAnalysisTK.jar
 *              -T DiagnoseTargets \
 *              -R reference.fasta \
 *              -I sample1.bam \
 *              -I sample2.bam \
 *              -I sample3.bam \
 *              -L intervals.interval_list \
 *              -o output.vcf
 *  </pre>
 *
 * @author Mauricio Carneiro, Roger Zurawicki
 * @since 5/8/12
 */
@DocumentedGATKFeature(
    groupName = HelpConstants.DOCS_CAT_QC,
    extraDocs = {CommandLineGATK.class})
@By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL)
@Downsample(by = DownsampleType.NONE)
public class DiagnoseTargets extends LocusWalker<Long, Long> {

  @Output(doc = "File to which interval statistics should be written")
  private VariantContextWriter vcfWriter = null;

  @ArgumentCollection private ThresHolder thresholds = new ThresHolder();

  private Map<GenomeLoc, IntervalStratification> intervalMap =
      null; // maps each interval => statistics
  private PeekableIterator<GenomeLoc>
      intervalListIterator; // an iterator to go over all the intervals provided as we traverse the
                            // genome
  private Set<String> samples = null; // all the samples being processed
  private static final Allele SYMBOLIC_ALLELE =
      Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
  private static final Allele UNCOVERED_ALLELE =
      Allele.create(
          "A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
  private static final int INITIAL_HASH_SIZE =
      50; // enough room for potential overlapping intervals plus recently finished intervals

  @Override
  public void initialize() {
    super.initialize();

    if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
      throw new UserException(
          "This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");

    intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE);
    intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator());

    // get all of the unique sample names for the VCF Header
    samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
    vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples));

    // pre load all the statistics classes because it is costly to operate on the JVM and we only
    // want to do it once.
    loadAllPlugins(thresholds);
  }

  @Override
  public Long map(
      final RefMetaDataTracker tracker,
      final ReferenceContext ref,
      final AlignmentContext context) {
    GenomeLoc refLocus = ref.getLocus();

    // process and remove any intervals in the map that are don't overlap the current locus anymore
    // and add all new intervals that may overlap this reference locus
    addNewOverlappingIntervals(refLocus);
    outputFinishedIntervals(refLocus, ref.getBase());

    // at this point, all intervals in intervalMap overlap with this locus, so update all of them
    for (IntervalStratification intervalStratification : intervalMap.values())
      intervalStratification.addLocus(context, ref);

    return 1L;
  }

  @Override
  public Long reduceInit() {
    return 0L;
  }

  /**
   * Not sure what we are going to do here
   *
   * @param value result of the map.
   * @param sum accumulator for the reduce.
   * @return a long
   */
  @Override
  public Long reduce(Long value, Long sum) {
    return sum + value;
  }

  /**
   * Process all remaining intervals
   *
   * @param result number of loci processed by the walker
   */
  @Override
  public void onTraversalDone(final Long result) {
    for (GenomeLoc interval : intervalMap.keySet())
      outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);

    GenomeLoc interval = intervalListIterator.peek();
    while (interval != null) {
      outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE);
      intervalListIterator.next();
      interval = intervalListIterator.peek();
    }

    if (thresholds.missingTargets != null) {
      thresholds.missingTargets.close();
    }
  }

  /**
   * Outputs all intervals that are behind the current reference locus
   *
   * @param refLocus the current reference locus
   * @param refBase the reference allele
   */
  private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
    // output any intervals that were finished
    final List<GenomeLoc> toRemove = new LinkedList<>();
    for (GenomeLoc key : intervalMap.keySet()) {
      if (key.isBefore(refLocus)) {
        final IntervalStratification intervalStats = intervalMap.get(key);
        outputStatsToVCF(intervalStats, Allele.create(refBase, true));
        if (hasMissingLoci(intervalStats)) {
          outputMissingInterval(intervalStats);
        }
        toRemove.add(key);
      }
    }
    for (GenomeLoc key : toRemove) {
      intervalMap.remove(key);
    }
  }

  /**
   * Adds all intervals that overlap the current reference locus to the intervalMap
   *
   * @param refLocus the current reference locus
   */
  private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
    GenomeLoc interval = intervalListIterator.peek();
    while (interval != null && !interval.isPast(refLocus)) {
      intervalMap.put(interval, createIntervalStatistic(interval));
      intervalListIterator.next();
      interval = intervalListIterator.peek();
    }
  }

  /**
   * Takes the interval, finds it in the stash, prints it to the VCF
   *
   * @param stats The statistics of the interval
   * @param refAllele the reference allele
   */
  private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
    GenomeLoc interval = stats.getInterval();

    final List<Allele> alleles = new ArrayList<>();
    final Map<String, Object> attributes = new HashMap<>();
    final ArrayList<Genotype> genotypes = new ArrayList<>();

    for (String sample : samples) {
      final GenotypeBuilder gb = new GenotypeBuilder(sample);

      SampleStratification sampleStat = stats.getSampleStatistics(sample);
      gb.attribute(
          GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY,
          sampleStat.averageCoverage(interval.size()));
      gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
      gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
      gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));

      genotypes.add(gb.make());
    }
    alleles.add(refAllele);
    alleles.add(SYMBOLIC_ALLELE);
    VariantContextBuilder vcb =
        new VariantContextBuilder(
            "DiagnoseTargets",
            interval.getContig(),
            interval.getStart(),
            interval.getStop(),
            alleles);

    vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR);
    vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true)));

    attributes.put(VCFConstants.END_KEY, interval.getStop());
    attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
    attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent());

    vcb = vcb.attributes(attributes);
    vcb = vcb.genotypes(genotypes);

    vcfWriter.add(vcb.make());
  }

  private boolean hasMissingStatuses(AbstractStratification stats) {
    return !stats.callableStatuses().isEmpty();
  }

  private boolean hasMissingLoci(final IntervalStratification stats) {
    return thresholds.missingTargets != null && hasMissingStatuses(stats);
  }

  private void outputMissingInterval(final IntervalStratification stats) {
    final GenomeLoc interval = stats.getInterval();
    final boolean missing[] = new boolean[interval.size()];
    Arrays.fill(missing, true);
    for (AbstractStratification sample : stats.getElements()) {
      if (hasMissingStatuses(sample)) {
        int pos = 0;
        for (AbstractStratification locus : sample.getElements()) {
          if (locus.callableStatuses().isEmpty()) {
            missing[pos] = false;
          }
          pos++;
        }
      }
    }
    int start = -1;
    boolean insideMissing = false;
    for (int i = 0; i < missing.length; i++) {
      if (missing[i] && !insideMissing) {
        start = interval.getStart() + i;
        insideMissing = true;
      } else if (!missing[i] && insideMissing) {
        final int stop = interval.getStart() + i - 1;
        outputMissingInterval(interval.getContig(), start, stop);
        insideMissing = false;
      }
    }
    if (insideMissing) {
      outputMissingInterval(interval.getContig(), start, interval.getStop());
    }
  }

  private void outputMissingInterval(final String contig, final int start, final int stop) {
    final PrintStream out = thresholds.missingTargets;
    out.println(String.format("%s:%d-%d", contig, start, stop));
  }

  /**
   * Function that process a set of statuses into strings
   *
   * @param statuses the set of statuses to be converted
   * @return a matching set of strings
   */
  private List<String> statusToStrings(
      Iterable<CallableStatus> statuses, final boolean isInfoField) {
    List<String> output = new LinkedList<>();

    for (CallableStatus status : statuses)
      if (isInfoField || status != CallableStatus.PASS) output.add(status.name());

    return output;
  }

  private IntervalStratification createIntervalStatistic(GenomeLoc interval) {
    return new IntervalStratification(samples, interval, thresholds);
  }

  protected static void loadAllPlugins(final ThresHolder thresholds) {
    for (Class<?> stat : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) {
      try {
        final LocusMetric stats = (LocusMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.locusMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }

    for (Class<?> stat : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) {
      try {
        final SampleMetric stats = (SampleMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.sampleMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }

    for (Class<?> stat : new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) {
      try {
        final IntervalMetric stats = (IntervalMetric) stat.newInstance();
        stats.initialize(thresholds);
        thresholds.intervalMetricList.add(stats);
      } catch (Exception e) {
        throw new DynamicClassResolutionException(stat, e);
      }
    }
  }

  /**
   * Gets the header lines for the VCF writer
   *
   * @return A set of VCF header lines
   */
  private static Set<VCFHeaderLine> getHeaderInfo() {
    Set<VCFHeaderLine> headerLines = new HashSet<>();

    // INFO fields for overall data
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
    headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY));
    headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY));
    headerLines.add(
        new VCFInfoHeaderLine(
            "Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));

    // FORMAT fields for each genotype
    headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
    headerLines.add(
        GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY));
    headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI));
    headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI));

    // FILTER fields
    for (CallableStatus stat : CallableStatus.values())
      headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));

    return headerLines;
  }
}
Beispiel #30
0
 /**
  * Searches the alleles returned by {@code getAlleles()} for one with the given bases.
  *
  * @param allele the bases to look for
  * @return The allele sharing the same bases as this byte[], or null if no such allele is present.
  */
 public Allele getAllele(byte[] allele) {
   // the actual matching is delegated to Allele
   return Allele.getMatchingAllele(this.getAlleles(), allele);
 }