Exemplo n.º 1
0
  /**
   * Verifies that {@code VariantContextUtils.simpleMerge} stores the origin annotation under the
   * caller-supplied key when annotation is requested, and leaves that key absent otherwise.
   *
   * <p>Each combination of the annotate flag and a set-key name is exercised with two passing
   * records built by {@code makeVC}.
   */
  @Test
  public void testAnnotationSet() {
    final boolean[] annotateFlags = {true, false};
    final String[] setKeys = {"set", "combine", "x"};

    for (final boolean annotate : annotateFlags) {
      for (final String set : setKeys) {
        final List<String> priority = Arrays.asList("1", "2");
        final VariantContext firstInput =
            makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS);
        final VariantContext secondInput =
            makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS);
        final List<VariantContext> inputs = Arrays.asList(firstInput, secondInput);

        final VariantContext merged =
            VariantContextUtils.simpleMerge(
                genomeLocParser,
                inputs,
                priority,
                VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
                VariantContextUtils.GenotypeMergeType.PRIORITIZE,
                annotate,
                false,
                set,
                false,
                false);

        if (!annotate) {
          // without origin annotation the key must not show up at all
          Assert.assertFalse(merged.hasAttribute(set));
        } else {
          // the test expects the two passing inputs to be reported as the intersection
          Assert.assertEquals(merged.getAttribute(set), VariantContextUtils.MERGE_INTERSECTION);
        }
      }
    }
  }
Exemplo n.º 2
0
  /**
   * Updates the per-type counters for one eval record, both in the aggregate {@code ALL} bucket
   * and per sample.
   *
   * <p>Nothing is counted when {@code eval} is null, when the walker ignores AC=0 sites and the
   * record is monomorphic in its samples, or when {@code getType(eval)} yields null.
   *
   * @param eval the record being evaluated; may be null
   * @param comp the comparison record; a non-null value marks the site as known
   * @param tracker unused by this method
   * @param ref unused by this method
   * @param context unused by this method
   */
  public void update2(
      VariantContext eval,
      VariantContext comp,
      RefMetaDataTracker tracker,
      ReferenceContext ref,
      AlignmentContext context) {
    if (eval == null) {
      return;
    }
    if (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) {
      return;
    }

    final Type type = getType(eval);
    if (type == null) {
      return;
    }

    // site-level depth is counted only when the record carries a DP attribute
    if (eval.hasAttribute(VCFConstants.DEPTH_KEY)) {
      depthPerSample.inc(type, ALL);
    }

    allVariantCounts.inc(type, ALL);

    // transitions / transversions are tracked for biallelic SNPs only; for every other record the
    // table stays null and the per-sample update below is skipped
    final TypeSampleMap tiTvCounts;
    if (type == Type.SNP && eval.isBiallelic()) {
      if (VariantContextUtils.isTransition(eval)) {
        tiTvCounts = transitionsPerSample;
      } else {
        tiTvCounts = transversionsPerSample;
      }
      tiTvCounts.inc(type, ALL);
    } else {
      tiTvCounts = null;
    }

    // a site is known when a comp record exists, or when it is a CNV overlapping a known CNV
    final boolean isKnown = comp != null || (type == Type.CNV && overlapsKnownCNV(eval));
    if (isKnown) {
      knownVariantCounts.inc(type, ALL);
    }

    // per-sample metrics: only called, non-hom-ref genotypes contribute
    for (final Genotype genotype : eval.getGenotypes()) {
      if (genotype.isNoCall() || genotype.isHomRef()) {
        continue;
      }

      final String sample = genotype.getSampleName();
      countsPerSample.inc(type, sample);

      if (tiTvCounts != null) {
        tiTvCounts.inc(type, sample);
      }

      if (genotype.hasDP()) {
        depthPerSample.inc(type, sample);
      }
    }
  }
Exemplo n.º 3
0
  //
  // helper routines
  //
  /**
   * Classifies a record as no-call, filtered, monomorphic or polymorphic.
   *
   * <p>The checks run in a fixed order: null record, filtered, monomorphic in samples, presence of
   * genotypes, and finally the allele-count attribute when no genotypes are available.
   *
   * @param vc the record to classify; may be null
   * @return the site status derived from the first matching rule
   */
  private SiteStatus calcSiteStatus(VariantContext vc) {
    final SiteStatus status;

    if (vc == null) {
      status = SiteStatus.NO_CALL;
    } else if (vc.isFiltered()) {
      status = SiteStatus.FILTERED;
    } else if (vc.isMonomorphicInSamples()) {
      status = SiteStatus.MONO;
    } else if (vc.hasGenotypes()) {
      // not monomorphic in samples and genotypes exist, so the site must be polymorphic
      status = SiteStatus.POLY;
    } else if (vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
      // sites-only record: more than two alleles is polymorphic outright, otherwise the
      // allele count decides (it is only read in the two-allele case)
      final boolean polymorphic =
          vc.getNAlleles() > 2 || vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0) > 0;
      status = polymorphic ? SiteStatus.POLY : SiteStatus.MONO;
    } else if (TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED) {
      status = SiteStatus.POLY;
    } else {
      // neither genotypes nor an allele count: there is nothing to decide on
      status = SiteStatus.NO_CALL;
    }

    return status;
  }
Exemplo n.º 4
0
  /**
   * Tallies site-level and genotype-level counts for one variant context.
   *
   * <p>Every call increments {@code nCalledLoci}. A record that is monomorphic in its samples is
   * counted as a reference locus; any other record is classified by {@code vc1.getType()}. Each
   * genotype is then counted by its type, and homozygous genotypes additionally feed
   * {@code nHomDerived} when an {@code ANCESTRALALLELE} attribute and at least one alternate allele
   * are present.
   *
   * @param vc1 the record to count; dereferenced without a null check
   * @param tracker unused by this method
   * @param ref unused by this method
   * @param context unused by this method
   * @return always {@code null}; no interesting sites are captured
   * @throws ReviewedStingException if the record type or a genotype type is not handled below
   */
  public String update1(
      VariantContext vc1,
      RefMetaDataTracker tracker,
      ReferenceContext ref,
      AlignmentContext context) {
    nCalledLoci++;

    // Historical note (Eric): what is really wanted here is a polymorphic vs. monomorphic count
    // taken over the genotypes. To stay consistent with the earlier implementation and the
    // original author's intent, monomorphic status is used as a proxy. This also guards against a
    // record that holds only no-calls, which can happen when stratifying by sample and the sample
    // has a single no-call.
    if (!vc1.isMonomorphicInSamples()) {
      switch (vc1.getType()) {
        case NO_VARIATION:
          // not expected for a record that is not monomorphic in samples; nothing is counted
          break;
        case SNP:
          nSNPs++;
          nVariantLoci++;
          if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) {
            nSingletons++;
          }
          break;
        case MNP:
          nMNPs++;
          nVariantLoci++;
          if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) {
            nSingletons++;
          }
          break;
        case INDEL:
          nVariantLoci++;
          if (vc1.isSimpleInsertion()) {
            nInsertions++;
          } else {
            if (vc1.isSimpleDeletion()) {
              nDeletions++;
            } else {
              nComplex++;
            }
          }
          break;
        case MIXED:
          nMixed++;
          nVariantLoci++;
          break;
        case SYMBOLIC:
          // symbolic records get their own tally; nVariantLoci is left alone
          nSymbolic++;
          break;
        default:
          throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType());
      }
    } else {
      nRefLoci++;
    }

    final String referenceBases = vc1.getReference().getBaseString().toUpperCase();

    // ancestral allele, upper-cased, or null when the record does not carry the attribute
    String ancestralBases = null;
    if (vc1.hasAttribute("ANCESTRALALLELE")) {
      ancestralBases = vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase();
    }

    // Derived-allele bookkeeping (ref / ancestral / genotype allele):
    //   a hom-ref genotype is derived when the reference differs from the ancestral allele;
    //   a hom-var genotype is derived when the first alternate differs from the ancestral allele.
    for (final Genotype genotype : vc1.getGenotypes()) {
      // first alternate allele, upper-cased, or null when the record has no alternates
      String firstAltBases = null;
      if (vc1.getAlternateAlleles().size() > 0) {
        firstAltBases = vc1.getAlternateAllele(0).getBaseString().toUpperCase();
      }

      // the derived count needs both an ancestral allele and an alternate allele
      final boolean canPolarize = ancestralBases != null && firstAltBases != null;

      switch (genotype.getType()) {
        case NO_CALL:
          nNoCalls++;
          break;
        case HOM_REF:
          nHomRef++;
          if (canPolarize && !referenceBases.equalsIgnoreCase(ancestralBases)) {
            nHomDerived++;
          }
          break;
        case HET:
          nHets++;
          break;
        case HOM_VAR:
          nHomVar++;
          if (canPolarize && !firstAltBases.equalsIgnoreCase(ancestralBases)) {
            nHomDerived++;
          }
          break;
        case MIXED:
          // deliberately not counted
          break;
        default:
          throw new ReviewedStingException("BUG: Unexpected genotype type: " + genotype);
      }
    }

    // this evaluator never reports an interesting site
    return null;
  }
Exemplo n.º 5
0
  /**
   * Writes one record to the VCF output.
   *
   * <p>The alleles are first padded via {@code createVariantContextWithPaddedAlleles}; when an
   * on-the-fly indexer is configured the record is registered with it before any bytes are
   * written. The columns are then emitted in order: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
   * and, when genotype data applies, FORMAT followed by the genotype columns. The writer is
   * flushed after every record.
   *
   * @param vc the Variant Context object
   * @param refBase the ref base used for indels
   * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE
   *     EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER
   *     THE EVENT INSTEAD)
   * @throws IllegalStateException if no header has been written yet ({@code mHeader} is null)
   * @throws RuntimeException if the underlying writer fails; the {@link IOException} is attached
   *     as the cause
   */
  public void add(VariantContext vc, byte refBase, boolean refBaseShouldBeAppliedToEndOfAlleles) {
    if (mHeader == null)
      throw new IllegalStateException(
          "The VCF Header must be written before records can be added: " + locationString());

    // drop the genotypes up front when this writer is configured not to emit them
    if (doNotWriteGenotypes) vc = VariantContext.modifyGenotypes(vc, null);

    try {
      vc =
          VariantContext.createVariantContextWithPaddedAlleles(
              vc, refBase, refBaseShouldBeAppliedToEndOfAlleles);

      // if we are doing on the fly indexing, add the record ***before*** we write any bytes
      if (indexer != null) indexer.addFeature(vc, positionalStream.getPosition());

      // maps every allele of this record to the string used for it in the genotype columns
      Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
      alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup

      // CHROM
      mWriter.write(vc.getChr());
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // POS
      mWriter.write(String.valueOf(vc.getStart()));
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // ID
      String id = vc.hasID() ? vc.getID() : VCFConstants.EMPTY_ID_FIELD;
      mWriter.write(id);
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // REF (always allele index 0)
      alleleMap.put(vc.getReference(), "0");
      String refString = vc.getReference().getDisplayString();
      mWriter.write(refString);
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // ALT (comma-separated; alternate i gets allele index i + 1)
      if (vc.isVariant()) {
        Allele altAllele = vc.getAlternateAllele(0);
        alleleMap.put(altAllele, "1");
        String alt = altAllele.getDisplayString();
        mWriter.write(alt);

        for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
          altAllele = vc.getAlternateAllele(i);
          alleleMap.put(altAllele, String.valueOf(i + 1));
          alt = altAllele.getDisplayString();
          mWriter.write(",");
          mWriter.write(alt);
        }
      } else {
        mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
      }
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // QUAL
      if (!vc.hasNegLog10PError()) mWriter.write(VCFConstants.MISSING_VALUE_v4);
      else mWriter.write(getQualValue(vc.getPhredScaledQual()));
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // FILTER: the sorted filter names when filtered; otherwise PASS if filtering was applied
      // (to the whole context or to this record), else the "unfiltered" marker
      String filters =
          vc.isFiltered()
              ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()))
              : (filtersWereAppliedToContext || vc.filtersWereApplied()
                  ? VCFConstants.PASSES_FILTERS_v4
                  : VCFConstants.UNFILTERED);
      mWriter.write(filters);
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // INFO: sorted by key; internal bookkeeping attributes are never written
      Map<String, String> infoFields = new TreeMap<String, String>();
      for (Map.Entry<String, Object> field : vc.getAttributes().entrySet()) {
        String key = field.getKey();
        if (key.equals(VariantContext.ID_KEY)
            || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)
            || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)
            || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY)) continue;

        String outputValue = formatVCFField(field.getValue());
        if (outputValue != null) infoFields.put(key, outputValue);
      }
      writeInfoString(infoFields);

      // FORMAT
      if (vc.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)) {
        // genotypes were never parsed: write the stored attribute value as-is
        mWriter.write(VCFConstants.FIELD_SEPARATOR);
        mWriter.write(vc.getAttributeAsString(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, ""));
      } else {
        List<String> genotypeAttributeKeys = new ArrayList<String>();
        if (vc.hasGenotypes()) {
          genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
        } else if (mHeader.hasGenotypingData()) {
          // this needs to be done in case all samples are no-calls
          genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
        }

        if (!genotypeAttributeKeys.isEmpty()) {
          String genotypeFormatString =
              ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
          mWriter.write(VCFConstants.FIELD_SEPARATOR);
          mWriter.write(genotypeFormatString);

          addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
        }
      }

      mWriter.write("\n");
      mWriter.flush(); // necessary so that writing to an output stream will work
    } catch (IOException e) {
      // keep the original failure attached so the root cause and its stack trace are not lost
      throw new RuntimeException("Unable to write the VCF object to " + locationString(), e);
    }
  }
  /**
   * Merges VariantContexts into a single hybrid record.
   *
   * <p>The inputs are ordered by {@code sortVariantContextsByPriority}, padded, and then folded
   * together: alleles are remapped against a common reference allele, genotypes are merged,
   * filters are unioned, DP values are summed, IDs are collected, and INFO attributes are kept only
   * when all inputs that carry them agree on the value. The record name is the source of the first
   * VC after sorting, and the location is the longest location among the inputs.
   *
   * <p>Whether {@code genotypeMergeOptions} is {@code UNIQUIFY} is forwarded to
   * {@code mergeGenotypes}; presumably sample names are then made unique by combining the VC name
   * with the sample name (see that helper to confirm).
   *
   * @param genomeLocParser loc parser
   * @param unsortedVCs collection of unsorted VCs
   * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs;
   *     must be non-null when {@code annotateOrigin} is true
   * @param filteredRecordMergeType merge type for filtered records
   * @param genotypeMergeOptions merge option for genotypes
   * @param annotateOrigin should we annotate the set it came from?
   * @param printMessages should we print messages? (only a "Remapped" line is printed, and only
   *     when some allele remapping took place)
   * @param setKey the key name of the set; no set annotation is written when this is null
   * @param filteredAreUncalled are filtered records uncalled? When true, filtered inputs are
   *     dropped before merging
   * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? When
   *     true, the result carries the attributes of that VC instead of the merged attribute map
   * @return new VariantContext representing the merge of unsortedVCs, or null when
   *     {@code unsortedVCs} is null or empty, or when every input was dropped as filtered
   * @throws IllegalArgumentException if {@code annotateOrigin} is true and
   *     {@code priorityListOfVCs} is null
   * @throws ReviewedStingException if an input starts at a different position than the first VC
   */
  public static VariantContext simpleMerge(
      final GenomeLocParser genomeLocParser,
      final Collection<VariantContext> unsortedVCs,
      final List<String> priorityListOfVCs,
      final FilteredRecordMergeType filteredRecordMergeType,
      final GenotypeMergeType genotypeMergeOptions,
      final boolean annotateOrigin,
      final boolean printMessages,
      final String setKey,
      final boolean filteredAreUncalled,
      final boolean mergeInfoWithMaxAC) {
    // nothing to merge
    if (unsortedVCs == null || unsortedVCs.size() == 0) return null;

    // the origin annotation below compares against the priority list size, so it must exist
    if (annotateOrigin && priorityListOfVCs == null)
      throw new IllegalArgumentException(
          "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts");

    if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE)
      verifyUniqueSampleNames(unsortedVCs);

    List<VariantContext> prepaddedVCs =
        sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
    // Make sure all variant contexts are padded with reference base in case of indels if necessary
    List<VariantContext> VCs = new ArrayList<VariantContext>();

    for (VariantContext vc : prepaddedVCs) {
      // also a reasonable place to remove filtered calls, if needed
      if (!filteredAreUncalled || vc.isNotFiltered())
        VCs.add(createVariantContextWithPaddedAlleles(vc, false));
    }
    if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled
    return null;

    // establish the baseline info from the first VC
    final VariantContext first = VCs.get(0);
    final String name = first.getSource();
    final Allele refAllele = determineReferenceAllele(VCs);

    // accumulators for the merged record; alleles and rsIDs keep insertion order, filters and
    // attributes are kept sorted
    final Set<Allele> alleles = new LinkedHashSet<Allele>();
    final Set<String> filters = new TreeSet<String>();
    final Map<String, Object> attributes = new TreeMap<String, Object>();
    // attribute keys seen with conflicting values; once listed here a key is never re-added
    final Set<String> inconsistentAttributes = new HashSet<String>();
    final Set<String> variantSources =
        new HashSet<
            String>(); // contains the set of sources we found in our set of VCs that are variant
    final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id

    GenomeLoc loc = getLocation(genomeLocParser, first);
    int depth = 0; // running sum of the DP attribute over all inputs
    int maxAC = -1; // largest single allele count seen so far (-1: none yet)
    final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>();
    double log10PError = 1; // 1 acts as the "no variant VC seen" starting value for the min below
    VariantContext vcWithMaxAC = null; // the VC that supplied maxAC; the first one wins ties
    GenotypesContext genotypes = GenotypesContext.create();

    // counting the number of filtered and variant VCs
    int nFiltered = 0;

    // true once any input needed its alleles remapped; only used for the optional message
    boolean remapped = false;

    // cycle through and add info from the other VCs, making sure the loc/reference matches

    for (VariantContext vc : VCs) {
      if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) )
      throw new ReviewedStingException(
            "BUG: attempting to merge VariantContexts with different start sites: first="
                + first.toString()
                + " second="
                + vc.toString());

      if (getLocation(genomeLocParser, vc).size() > loc.size())
        loc = getLocation(genomeLocParser, vc); // get the longest location

      nFiltered += vc.isFiltered() ? 1 : 0;
      if (vc.isVariant()) variantSources.add(vc.getSource());

      // express this VC's alleles in terms of the common reference allele and the alleles
      // collected so far
      AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
      remapped = remapped || alleleMapping.needsRemapping();

      alleles.addAll(alleleMapping.values());

      mergeGenotypes(
          genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);

      // keep the smallest log10PError among variant VCs; non-variant VCs contribute the value 1
      log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1);

      filters.addAll(vc.getFilters());

      //
      // add attributes
      //
      // special case DP (add it up) and ID (just preserve it)
      //
      if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
        depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
      if (vc.hasID()) rsIDs.add(vc.getID());
      // track which VC carries the largest allele count (only needed for mergeInfoWithMaxAC)
      if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
        String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
        // lets see if the string contains a , separator
        if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
          // multi-valued AC: the first and last characters are dropped before splitting, so the
          // value is assumed to be wrapped in one enclosing character on each side (presumably a
          // bracketed list rendering such as "[1, 2]" -- TODO confirm against the producer)
          List<String> alleleCountArray =
              Arrays.asList(
                  rawAlleleCounts
                      .substring(1, rawAlleleCounts.length() - 1)
                      .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
          for (String alleleCount : alleleCountArray) {
            final int ac = Integer.valueOf(alleleCount.trim());
            if (ac > maxAC) {
              maxAC = ac;
              vcWithMaxAC = vc;
            }
          }
        } else {
          // single-valued AC
          final int ac = Integer.valueOf(rawAlleleCounts);
          if (ac > maxAC) {
            maxAC = ac;
            vcWithMaxAC = vc;
          }
        }
      }

      // merge the remaining attributes: a key survives only while every VC that carries it agrees
      // on the value; a bound missing-value placeholder may be replaced by a later value
      for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
        String key = p.getKey();
        // if we don't like the key already, don't go anywhere
        if (!inconsistentAttributes.contains(key)) {
          boolean alreadyFound = attributes.containsKey(key);
          Object boundValue = attributes.get(key);
          boolean boundIsMissingValue =
              alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);

          if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) {
            // we found the value but we're inconsistent, put it in the exclude list
            // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue,
            // p.getValue());
            inconsistentAttributes.add(key);
            attributes.remove(key);
          } else if (!alreadyFound || boundIsMissingValue) { // no value
            // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(),
            // p.getValue());
            attributes.put(key, p.getValue());
          }
        }
      }
    }

    // if we have more alternate alleles in the merged VC than in one or more of the
    // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well
    // as allele-dependent attributes like AC,AF
    for (VariantContext vc : VCs) {
      // inputs with a single allele are skipped
      if (vc.alleles.size() == 1) continue;
      if (hasPLIncompatibleAlleles(alleles, vc.alleles)) {
        // the warning is only logged when there are genotypes; the stripping itself is
        // unconditional
        if (!genotypes.isEmpty())
          logger.warn(
              String.format(
                  "Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
                  genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
        genotypes = stripPLs(genotypes);
        // this will remove stale AC,AF attributes from the merged attribute map
        calculateChromosomeCounts(vc, attributes, true);
        // one incompatible input is enough; the stripping is done once
        break;
      }
    }

    // take the VC with the maxAC and pull the attributes into a modifiable map
    if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
      attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
    }

    // if at least one record was unfiltered and we want a union, clear all of the filters
    if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
            && nFiltered != VCs.size())
        || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear();

    if (annotateOrigin) { // we care about where the call came from
      String setValue;
      if (nFiltered == 0
          && variantSources.size() == priorityListOfVCs.size()) // nothing was filtered and there are as many variant sources as priority entries
      setValue = MERGE_INTERSECTION;
      else if (nFiltered == VCs.size()) // everything was filtered out
      setValue = MERGE_FILTER_IN_ALL;
      else if (variantSources.isEmpty()) // everyone was reference
      setValue = MERGE_REF_IN_ALL;
      else {
        // otherwise list the variant sources in merge order, marking the filtered ones with the
        // filter prefix, joined by "-"
        LinkedHashSet<String> s = new LinkedHashSet<String>();
        for (VariantContext vc : VCs)
          if (vc.isVariant())
            s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource());
        setValue = Utils.join("-", s);
      }

      if (setKey != null) {
        attributes.put(setKey, setValue);
        // in max-AC mode the set key records the source of the max-AC VC, not setValue
        if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
          attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource());
        }
      }
    }

    // the summed depth is stored as a String, and only in the merged attribute map
    if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));

    final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

    final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
    builder.loc(loc.getContig(), loc.getStart(), loc.getStop());
    builder.alleles(alleles);
    builder.genotypes(genotypes);
    builder.log10PError(log10PError);
    // NOTE(review): with mergeInfoWithMaxAC set, attributesWithMaxAC is used even when no input
    // carried an AC attribute, in which case that map is empty and the merged attributes
    // (including the summed DP and the set annotation) are not emitted -- confirm this is intended
    builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);

    // Trim the padded bases of all alleles if necessary
    VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
    if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged);
    return merged;
  }