/**
 * Selects the record to use from the alleles ROD at the given location.
 *
 * <p>A record is usable when it is non-null, not filtered and, if {@code requireSNP} is set, a
 * SNP. The first usable record wins; a warning is logged for every further usable record found
 * at the same site.
 *
 * @param tracker the ROD tracker to query
 * @param ref reference context, used only to report the locus in the warning
 * @param loc location passed to the tracker lookup
 * @param requireSNP if true, only SNP records are considered
 * @param logger destination for the "multiple records" warning
 * @param allelesBinding binding of the alleles ROD
 * @return the first usable record, or null if there is none or if {@code tracker}, {@code ref}
 *     or {@code logger} is null
 */
public static VariantContext getVCFromAllelesRod(
      RefMetaDataTracker tracker,
      ReferenceContext ref,
      GenomeLoc loc,
      boolean requireSNP,
      Logger logger,
      final RodBinding<VariantContext> allelesBinding) {
    if (tracker == null || ref == null || logger == null) {
      return null;
    }

    VariantContext selected = null;
    for (final VariantContext candidate : tracker.getValues(allelesBinding, loc)) {
      final boolean usable =
          candidate != null && !candidate.isFiltered() && (!requireSNP || candidate.isSNP());
      if (!usable) {
        continue;
      }

      if (selected != null) {
        // a usable record was already chosen; keep it and report the extra one
        logger.warn(
            "Multiple valid VCF records detected in the alleles input file at site "
                + ref.getLocus()
                + ", only considering the first record");
        continue;
      }

      selected = candidate;
    }

    return selected;
  }
Beispiel #2
0
  /**
   * Decides whether two variant contexts pass the preconditions for being merged into an MNP.
   *
   * @param genomeLocParser parser used to derive the locations of both records
   * @param vc1 the record expected to come first
   * @param vc2 the record expected to come second
   * @return true only if neither record is filtered, both have equal sample names, and all
   *     genotypes of both records pass {@code allGenotypesAreUnfilteredAndCalled}
   * @throws ReviewedStingException if the records are on different contigs or vc1 is not before
   *     vc2
   */
  static boolean mergeIntoMNPvalidationCheck(
      GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
    final GenomeLoc firstLoc = VariantContextUtils.getLocation(genomeLocParser, vc1);
    final GenomeLoc secondLoc = VariantContextUtils.getLocation(genomeLocParser, vc2);

    // positional requirements are hard errors rather than a "false" answer
    if (!firstLoc.onSameContig(secondLoc)) {
      throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome");
    }
    if (!firstLoc.isBefore(secondLoc)) {
      throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2");
    }

    // the remaining checks short-circuit in the same order as separate early returns would
    return !vc1.isFiltered()
        && !vc2.isFiltered()
        && vc1.getSampleNames().equals(vc2.getSampleNames())
        && allGenotypesAreUnfilteredAndCalled(vc1)
        && allGenotypesAreUnfilteredAndCalled(vc2);
  }
Beispiel #3
0
  //
  // helper routines
  //
  /**
   * Classifies a site as NO_CALL, FILTERED, MONO or POLY.
   *
   * <p>Genotype information is used when present; otherwise the decision falls back to the allele
   * count attribute, and finally to {@code TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED}.
   *
   * @param vc the record to classify; may be null
   * @return the status of the site
   */
  private SiteStatus calcSiteStatus(VariantContext vc) {
    if (vc == null) {
      return SiteStatus.NO_CALL;
    }
    if (vc.isFiltered()) {
      return SiteStatus.FILTERED;
    }
    if (vc.isMonomorphicInSamples()) {
      return SiteStatus.MONO;
    }
    if (vc.hasGenotypes()) {
      // not monomorphic in the samples and genotypes exist, so it must be polymorphic
      return SiteStatus.POLY;
    }

    if (!vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
      // neither genotypes nor an allele count: we can't figure out what to do
      return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL;
    }

    if (vc.getNAlleles() > 2) {
      return SiteStatus.POLY;
    }

    final int alleleCount = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
    return alleleCount > 0 ? SiteStatus.POLY : SiteStatus.MONO;
  }
 /**
  * Tells whether a record qualifies for Beagle output.
  *
  * @param v the record to test; may be null
  * @return true if {@code v} is non-null, not filtered, biallelic and has genotypes
  */
 public static boolean canBeOutputToBeagle(VariantContext v) {
   if (v == null || v.isFiltered()) {
     return false;
   }
   return v.isBiallelic() && v.hasGenotypes();
 }
Beispiel #5
0
  /**
   * Adds a record to the file.
   *
   * <p>The record is padded via {@code createVariantContextWithPaddedAlleles}, registered with the
   * on-the-fly indexer (when one is configured) before any bytes are written, and then emitted
   * field by field: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO and, when applicable, FORMAT plus
   * genotype data. The writer is flushed after each record.
   *
   * @param vc the Variant Context object
   * @param refBase the ref base used for indels
   * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE
   *     EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER
   *     THE EVENT INSTEAD)
   * @throws IllegalStateException if no header has been set ({@code mHeader} is null)
   * @throws RuntimeException if writing fails; the underlying {@link IOException} is attached as
   *     the cause
   */
  public void add(VariantContext vc, byte refBase, boolean refBaseShouldBeAppliedToEndOfAlleles) {
    if (mHeader == null)
      throw new IllegalStateException(
          "The VCF Header must be written before records can be added: " + locationString());

    if (doNotWriteGenotypes) vc = VariantContext.modifyGenotypes(vc, null);

    try {
      vc =
          VariantContext.createVariantContextWithPaddedAlleles(
              vc, refBase, refBaseShouldBeAppliedToEndOfAlleles);

      // if we are doing on the fly indexing, add the record ***before*** we write any bytes
      if (indexer != null) indexer.addFeature(vc, positionalStream.getPosition());

      // maps each allele to the index string used when genotype data is written
      Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
      alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup

      // CHROM
      mWriter.write(vc.getChr());
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // POS
      mWriter.write(String.valueOf(vc.getStart()));
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // ID
      String ID = vc.hasID() ? vc.getID() : VCFConstants.EMPTY_ID_FIELD;
      mWriter.write(ID);
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // REF
      alleleMap.put(vc.getReference(), "0");
      String refString = vc.getReference().getDisplayString();
      mWriter.write(refString);
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // ALT: comma-separated alternate alleles, indexed from 1 in the allele map
      if (vc.isVariant()) {
        Allele altAllele = vc.getAlternateAllele(0);
        alleleMap.put(altAllele, "1");
        String alt = altAllele.getDisplayString();
        mWriter.write(alt);

        for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
          altAllele = vc.getAlternateAllele(i);
          alleleMap.put(altAllele, String.valueOf(i + 1));
          alt = altAllele.getDisplayString();
          mWriter.write(",");
          mWriter.write(alt);
        }
      } else {
        mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
      }
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // QUAL
      if (!vc.hasNegLog10PError()) mWriter.write(VCFConstants.MISSING_VALUE_v4);
      else mWriter.write(getQualValue(vc.getPhredScaledQual()));
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // FILTER: sorted filter names, or PASS / unfiltered depending on whether filters were applied
      String filters =
          vc.isFiltered()
              ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()))
              : (filtersWereAppliedToContext || vc.filtersWereApplied()
                  ? VCFConstants.PASSES_FILTERS_v4
                  : VCFConstants.UNFILTERED);
      mWriter.write(filters);
      mWriter.write(VCFConstants.FIELD_SEPARATOR);

      // INFO: sorted by key; internal bookkeeping attributes are not written
      Map<String, String> infoFields = new TreeMap<String, String>();
      for (Map.Entry<String, Object> field : vc.getAttributes().entrySet()) {
        String key = field.getKey();
        if (key.equals(VariantContext.ID_KEY)
            || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)
            || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)
            || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY)) continue;

        String outputValue = formatVCFField(field.getValue());
        if (outputValue != null) infoFields.put(key, outputValue);
      }
      writeInfoString(infoFields);

      // FORMAT
      if (vc.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY)) {
        // genotypes were never parsed: write the stored attribute value as-is
        mWriter.write(VCFConstants.FIELD_SEPARATOR);
        mWriter.write(vc.getAttributeAsString(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, ""));
      } else {
        List<String> genotypeAttributeKeys = new ArrayList<String>();
        if (vc.hasGenotypes()) {
          genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
        } else if (mHeader.hasGenotypingData()) {
          // this needs to be done in case all samples are no-calls
          genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
        }

        if (!genotypeAttributeKeys.isEmpty()) {
          String genotypeFormatString =
              ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
          mWriter.write(VCFConstants.FIELD_SEPARATOR);
          mWriter.write(genotypeFormatString);

          addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
        }
      }

      mWriter.write("\n");
      mWriter.flush(); // necessary so that writing to an output stream will work
    } catch (IOException e) {
      // keep the original exception as the cause so the real I/O failure is not lost
      throw new RuntimeException("Unable to write the VCF object to " + locationString(), e);
    }
  }
Beispiel #6
0
  /**
   * Subset VC record if necessary and emit the modified record (provided it satisfies criteria for
   * printing)
   *
   * @param tracker the ROD tracker
   * @param ref reference information
   * @param context alignment info
   * @return 0 if the tracker is null, if there are no variants at this location, or if a record
   *     fails the discordance, concordance or JEXL checks; 1 otherwise. Note that 1 is returned
   *     once the loop over records ends, whether or not any record was actually written.
   */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker == null) return 0;

    Collection<VariantContext> vcs =
        tracker.getValues(variantCollection.variants, context.getLocation());

    if (vcs == null || vcs.isEmpty()) {
      return 0;
    }

    for (VariantContext vc : vcs) {
      if (MENDELIAN_VIOLATIONS) {
        boolean foundMV = false;
        for (MendelianViolation mv : mvSet) {
          if (mv.isViolation(vc)) {
            foundMV = true;
            if (outMVFile != null) {
              // each sample's genotype is followed by that same sample's likelihoods
              outMVFileStream.format(
                  "MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, "
                      + "childG=%s childGL=%s\n",
                  vc.getChr(),
                  vc.getStart(),
                  vc.getReference().getDisplayString(),
                  vc.getAlternateAllele(0).getDisplayString(),
                  vc.getChromosomeCount(vc.getAlternateAllele(0)),
                  mv.getSampleMom(),
                  mv.getSampleDad(),
                  mv.getSampleChild(),
                  vc.getGenotype(mv.getSampleMom()).toBriefString(),
                  vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
                  vc.getGenotype(mv.getSampleDad()).toBriefString(),
                  vc.getGenotype(mv.getSampleDad()).getLikelihoods().getAsString(),
                  vc.getGenotype(mv.getSampleChild()).toBriefString(),
                  vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString());
            }
          }
        }

        // NOTE(review): break abandons all remaining records at this site, not just this one;
        // confirm that `continue` was not intended.
        if (!foundMV) break;
      }
      if (DISCORDANCE_ONLY) {
        Collection<VariantContext> compVCs =
            tracker.getValues(discordanceTrack, context.getLocation());
        if (!isDiscordant(vc, compVCs)) return 0;
      }
      if (CONCORDANCE_ONLY) {
        Collection<VariantContext> compVCs =
            tracker.getValues(concordanceTrack, context.getLocation());
        if (!isConcordant(vc, compVCs)) return 0;
      }

      if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
        continue;

      if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
        continue;

      if (!selectedTypes.contains(vc.getType())) continue;

      VariantContext sub = subsetRecord(vc, samples);
      if ((sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS)
          && (!sub.isFiltered() || !EXCLUDE_FILTERED)) {
        // every JEXL expression must match, otherwise the whole site is rejected
        for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
          if (!VariantContextUtils.match(sub, jexl)) {
            return 0;
          }
        }
        if (SELECT_RANDOM_NUMBER) {
          randomlyAddVariant(++variantNumber, sub, ref.getBase());
        } else if (!SELECT_RANDOM_FRACTION
            || (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
          vcfWriter.add(sub);
        }
      }
    }

    return 1;
  }
  /**
   * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority
   * order, if provided. If uniqifySamples is true, the priority order is ignored and names are
   * created by concatenating the VC name with the sample name
   *
   * <p>The merged record takes its source name from the first VC after priority sorting, spans the
   * longest location among the inputs, uses the minimum log10 error over the variant inputs, and
   * carries the union of all filters unless {@code filteredRecordMergeType} clears them. INFO
   * attributes whose values disagree between inputs are dropped; DP is summed and IDs are joined
   * with commas.
   *
   * @param genomeLocParser loc parser
   * @param unsortedVCs collection of unsorted VCs
   * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
   * @param filteredRecordMergeType merge type for filtered records
   * @param genotypeMergeOptions merge option for genotypes
   * @param annotateOrigin should we annotate the set it came from?
   * @param printMessages should we print messages?
   * @param setKey the key name of the set
   * @param filteredAreUncalled are filtered records uncalled?
   * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
   * @return new VariantContext representing the merge of unsortedVCs, or null if {@code
   *     unsortedVCs} is null or empty, or if {@code filteredAreUncalled} is set and every input is
   *     filtered
   * @throws IllegalArgumentException if {@code annotateOrigin} is true but {@code
   *     priorityListOfVCs} is null
   * @throws ReviewedStingException if the inputs do not all share the same start position
   */
  public static VariantContext simpleMerge(
      final GenomeLocParser genomeLocParser,
      final Collection<VariantContext> unsortedVCs,
      final List<String> priorityListOfVCs,
      final FilteredRecordMergeType filteredRecordMergeType,
      final GenotypeMergeType genotypeMergeOptions,
      final boolean annotateOrigin,
      final boolean printMessages,
      final String setKey,
      final boolean filteredAreUncalled,
      final boolean mergeInfoWithMaxAC) {
    if (unsortedVCs == null || unsortedVCs.size() == 0) return null;

    // the origin annotation below compares against priorityListOfVCs.size(), so it must exist
    if (annotateOrigin && priorityListOfVCs == null)
      throw new IllegalArgumentException(
          "Cannot merge calls and annotate their origins without a complete priority list of VariantContexts");

    if (genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE)
      verifyUniqueSampleNames(unsortedVCs);

    List<VariantContext> prepaddedVCs =
        sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
    // Make sure all variant contexts are padded with reference base in case of indels if necessary
    List<VariantContext> VCs = new ArrayList<VariantContext>();

    for (VariantContext vc : prepaddedVCs) {
      // also a reasonable place to remove filtered calls, if needed
      if (!filteredAreUncalled || vc.isNotFiltered())
        VCs.add(createVariantContextWithPaddedAlleles(vc, false));
    }
    if (VCs.size() == 0) // everything is filtered out and we're filteredAreUncalled
    return null;

    // establish the baseline info from the first VC
    final VariantContext first = VCs.get(0);
    final String name = first.getSource();
    final Allele refAllele = determineReferenceAllele(VCs);

    // accumulators for the merged record; alleles and rsIDs keep insertion order
    final Set<Allele> alleles = new LinkedHashSet<Allele>();
    final Set<String> filters = new TreeSet<String>();
    final Map<String, Object> attributes = new TreeMap<String, Object>();
    // keys seen with conflicting values; once listed here a key is never re-added
    final Set<String> inconsistentAttributes = new HashSet<String>();
    final Set<String> variantSources =
        new HashSet<
            String>(); // contains the set of sources we found in our set of VCs that are variant
    final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id

    GenomeLoc loc = getLocation(genomeLocParser, first);
    int depth = 0;
    int maxAC = -1;
    final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>();
    // starts at 1 so that any value taken from a variant VC via Math.min replaces it
    double log10PError = 1;
    VariantContext vcWithMaxAC = null;
    GenotypesContext genotypes = GenotypesContext.create();

    // counting the number of filtered and variant VCs
    int nFiltered = 0;

    boolean remapped = false;

    // cycle through and add info from the other VCs, making sure the loc/reference matches

    for (VariantContext vc : VCs) {
      if (loc.getStart() != vc.getStart()) // || !first.getReference().equals(vc.getReference()) )
      throw new ReviewedStingException(
            "BUG: attempting to merge VariantContexts with different start sites: first="
                + first.toString()
                + " second="
                + vc.toString());

      if (getLocation(genomeLocParser, vc).size() > loc.size())
        loc = getLocation(genomeLocParser, vc); // get the longest location

      nFiltered += vc.isFiltered() ? 1 : 0;
      if (vc.isVariant()) variantSources.add(vc.getSource());

      AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
      remapped = remapped || alleleMapping.needsRemapping();

      alleles.addAll(alleleMapping.values());

      mergeGenotypes(
          genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);

      // non-variant VCs contribute the neutral value 1 and so never lower the minimum
      log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1);

      filters.addAll(vc.getFilters());

      //
      // add attributes
      //
      // special case DP (add it up) and ID (just preserve it)
      //
      if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
        depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
      if (vc.hasID()) rsIDs.add(vc.getID());
      // track which VC carries the single largest allele count seen so far
      if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
        String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
        // lets see if the string contains a , separator
        if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
          // the first and last characters are dropped before splitting
          // (presumably enclosing brackets of a stringified list; verify against callers)
          List<String> alleleCountArray =
              Arrays.asList(
                  rawAlleleCounts
                      .substring(1, rawAlleleCounts.length() - 1)
                      .split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
          for (String alleleCount : alleleCountArray) {
            final int ac = Integer.valueOf(alleleCount.trim());
            if (ac > maxAC) {
              maxAC = ac;
              vcWithMaxAC = vc;
            }
          }
        } else {
          final int ac = Integer.valueOf(rawAlleleCounts);
          if (ac > maxAC) {
            maxAC = ac;
            vcWithMaxAC = vc;
          }
        }
      }

      // merge INFO attributes: keep a key only while every VC that defines it agrees on the value
      for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
        String key = p.getKey();
        // if we don't like the key already, don't go anywhere
        if (!inconsistentAttributes.contains(key)) {
          boolean alreadyFound = attributes.containsKey(key);
          Object boundValue = attributes.get(key);
          boolean boundIsMissingValue =
              alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);

          if (alreadyFound && !boundValue.equals(p.getValue()) && !boundIsMissingValue) {
            // we found the value but we're inconsistent, put it in the exclude list
            // System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue,
            // p.getValue());
            inconsistentAttributes.add(key);
            attributes.remove(key);
          } else if (!alreadyFound || boundIsMissingValue) { // no value
            // if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(),
            // p.getValue());
            attributes.put(key, p.getValue());
          }
        }
      }
    }

    // if we have more alternate alleles in the merged VC than in one or more of the
    // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well
    // as allele-dependent attributes like AC,AF
    for (VariantContext vc : VCs) {
      if (vc.alleles.size() == 1) continue;
      if (hasPLIncompatibleAlleles(alleles, vc.alleles)) {
        if (!genotypes.isEmpty())
          logger.warn(
              String.format(
                  "Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
                  genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
        genotypes = stripPLs(genotypes);
        // this will remove stale AC,AF attributes from vc
        calculateChromosomeCounts(vc, attributes, true);
        // stripping once is enough, so stop at the first incompatible VC
        break;
      }
    }

    // take the VC with the maxAC and pull the attributes into a modifiable map
    if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
      attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
    }

    // if at least one record was unfiltered and we want a union, clear all of the filters
    if ((filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED
            && nFiltered != VCs.size())
        || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL) filters.clear();

    if (annotateOrigin) { // we care about where the call came from
      String setValue;
      if (nFiltered == 0
          && variantSources.size() == priorityListOfVCs.size()) // nothing was filtered and every prioritized source was variant
      setValue = MERGE_INTERSECTION;
      else if (nFiltered == VCs.size()) // everything was filtered out
      setValue = MERGE_FILTER_IN_ALL;
      else if (variantSources.isEmpty()) // everyone was reference
      setValue = MERGE_REF_IN_ALL;
      else {
        // list the variant sources in VC order, marking the filtered ones with a prefix
        LinkedHashSet<String> s = new LinkedHashSet<String>();
        for (VariantContext vc : VCs)
          if (vc.isVariant())
            s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource());
        setValue = Utils.join("-", s);
      }

      if (setKey != null) {
        attributes.put(setKey, setValue);
        if (mergeInfoWithMaxAC && vcWithMaxAC != null) {
          attributesWithMaxAC.put(setKey, vcWithMaxAC.getSource());
        }
      }
    }

    if (depth > 0) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));

    final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

    final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
    builder.loc(loc.getContig(), loc.getStart(), loc.getStop());
    builder.alleles(alleles);
    builder.genotypes(genotypes);
    builder.log10PError(log10PError);
    // NOTE(review): with mergeInfoWithMaxAC set, only attributesWithMaxAC is used, so the merged
    // `attributes` map (including the summed DP) is discarded, and the result has no attributes
    // at all when no VC carried an allele count - confirm this is intended.
    builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);

    // Trim the padded bases of all alleles if necessary
    VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
    if (printMessages && remapped) System.out.printf("Remapped => %s%n", merged);
    return merged;
  }