Beispiel #1
0
  /**
   * Decides whether a variant record is concordant with the records of the comparison track.
   *
   * <p>When {@code NO_SAMPLES_SPECIFIED} is set, having both the variant record and at least one
   * comparison record is enough. Otherwise every sample of {@code vc} that is also in the tracked
   * {@code samples} collection must have, in at least one comparison record, a genotype that
   * {@code haveSameGenotypes} accepts as equal to its genotype in {@code vc}.
   *
   * @param vc the variant record under test; may be null
   * @param compVCs the comparison records; may be null or empty
   * @return true if the record is concordant; false otherwise, including when {@code vc} is null
   *     or there are no comparison records
   */
  private boolean isConcordant(VariantContext vc, Collection<VariantContext> compVCs) {
    if (vc == null || compVCs == null || compVCs.isEmpty()) return false;

    // if we're not looking for specific samples then the fact that we have both VCs is enough to
    // call it concordant.
    if (NO_SAMPLES_SPECIFIED) return true;

    // Restrict the record's samples to the ones tracked by the user. This is done on a private
    // copy: calling retainAll() directly on the set handed back by getSampleNames() could alter
    // the record's own sample set (if it is a live view) or fail (if it is unmodifiable).
    Set<String> variantSamples = new java.util.HashSet<String>(vc.getSampleNames());
    variantSamples.retainAll(samples);

    // check if we can find all samples from the variant rod in the comp rod.
    for (String sample : variantSamples) {
      // the variant-side genotype is the same for every comp record, so look it up once
      Genotype varG = vc.getGenotype(sample);

      boolean foundSample = false;
      for (VariantContext compVC : compVCs) {
        if (haveSameGenotypes(varG, compVC.getGenotype(sample))) {
          foundSample = true;
          break;
        }
      }

      // if at least one sample doesn't have the same genotype, we don't have concordance
      if (!foundSample) {
        return false;
      }
    }
    return true;
  }
  /**
   * Writes the header of the bootstrap VCF output.
   *
   * <p>The header lines are the ones {@code VCFUtils.getHeaderFields} returns for the validation
   * input, plus one {@code VCFFilterHeaderLine} named "bootstrap". The sample list is whatever
   * {@code SampleUtils.getUniqueSamplesFromRods} returns for the same input.
   */
  private void initializeVcfWriter() {
    final List<String> rodNames = Arrays.asList(validation.getName());

    // start from the header fields already declared by the validation input
    final Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
    headerLines.addAll(VCFUtils.getHeaderFields(getToolkit(), rodNames));

    // declare the filter this walker applies to bootstrapped sites
    final VCFFilterHeaderLine bootstrapFilter =
        new VCFFilterHeaderLine(
            "bootstrap",
            "This site used for genotype bootstrapping with ProduceBeagleInputWalker");
    headerLines.add(bootstrapFilter);

    final VCFHeader header =
        new VCFHeader(headerLines, SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodNames));
    bootstrapVCFOutput.writeHeader(header);
  }
Beispiel #3
0
  /**
   * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN,
   * AC, AF).
   *
   * <p>The returned record contains only the genotypes of the requested samples. Its chromosome
   * counts are recomputed via {@code VariantContextUtils.calculateChromosomeCounts}, and its
   * {@code DP} attribute is set to the sum of the per-genotype {@code DP} values of the called,
   * unfiltered genotypes that remain. When {@code KEEP_ORIGINAL_CHR_COUNTS} is set, the incoming
   * AC/AF/AN values are preserved under {@code AC_Orig}/{@code AF_Orig}/{@code AN_Orig}.
   *
   * @param vc the VariantContext record to subset
   * @param samples the samples to extract; if null or empty, {@code vc} is returned unchanged
   * @return the subsetted VariantContext
   * @throws NumberFormatException if a genotype carries a non-missing {@code DP} value that is not
   *     an integer
   */
  private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
    if (samples == null || samples.isEmpty()) return vc;

    ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
    for (Map.Entry<String, Genotype> genotypePair : vc.getGenotypes().entrySet()) {
      if (samples.contains(genotypePair.getKey())) genotypes.add(genotypePair.getValue());
    }

    VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles());

    // if we have fewer alternate alleles in the selected VC than in the original VC, we need to
    // strip out the GL/PLs (because they are no longer accurate). The genotypes to strip are the
    // ones of the subset record: using the original record's genotypes here would put the
    // samples we just removed back into the result.
    if (vc.getAlleles().size() != sub.getAlleles().size())
      sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(sub.getGenotypes()));

    Map<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes());

    // total depth over the called, unfiltered genotypes that survived the subsetting
    int depth = 0;
    for (String sample : sub.getSampleNames()) {
      Genotype g = sub.getGenotype(sample);

      if (g.isNotFiltered() && g.isCalled()) {
        // The attribute is not guaranteed to be stored as a String (it could e.g. be an Integer),
        // so go through toString() rather than casting.
        Object dpValue = g.getAttribute("DP");
        String dp = (dpValue == null) ? null : dpValue.toString();
        if (dp != null
            && !dp.equals(VCFConstants.MISSING_DEPTH_v3)
            && !dp.equals(VCFConstants.MISSING_VALUE_v4)) {
          depth += Integer.parseInt(dp);
        }
      }
    }

    // remember the pre-subset counts before they are recomputed below
    if (KEEP_ORIGINAL_CHR_COUNTS) {
      if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY))
        attributes.put("AC_Orig", attributes.get(VCFConstants.ALLELE_COUNT_KEY));
      if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY))
        attributes.put("AF_Orig", attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY));
      if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY))
        attributes.put("AN_Orig", attributes.get(VCFConstants.ALLELE_NUMBER_KEY));
    }

    VariantContextUtils.calculateChromosomeCounts(sub, attributes, false);
    attributes.put("DP", depth);

    sub = VariantContext.modifyAttributes(sub, attributes);

    return sub;
  }
Beispiel #4
0
  /**
   * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher.
   *
   * <p>The work is done in phases, in this order: resolve the set of samples to output, resolve
   * the variant types to keep, write the output VCF header, build the JEXL select expressions,
   * read the discordance/concordance track bindings, set up Mendelian-violation checking, and
   * set up random sub-selection.
   */
  public void initialize() {
    // Get list of samples to include in the output
    List<String> rodNames = Arrays.asList(variantCollection.variants.getName());

    Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
    TreeSet<String> vcfSamples =
        new TreeSet<String>(
            SampleUtils.getSampleList(
                vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));

    selectSamples(vcfSamples);
    selectVariantTypes();
    writeOutputHeader(vcfRods);
    initializeSelectExpressions();
    initializeTrackComparisons();
    initializeMendelianViolations();
    initializeRandomSelection();
  }

  /**
   * Fills {@code samples} with the requested samples (from files, expressions and names), falls
   * back to all samples of the input when none were requested, then removes the excluded ones.
   *
   * @param vcfSamples all sample names found in the input VCF headers
   * @throws UserException if samples were explicitly requested but all of them were also excluded
   */
  private void selectSamples(TreeSet<String> vcfSamples) {
    Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
    Collection<String> samplesFromExpressions =
        SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);

    // first, add any requested samples
    samples.addAll(samplesFromFile);
    samples.addAll(samplesFromExpressions);
    samples.addAll(sampleNames);

    // if none were requested, we want all of them
    if (samples.isEmpty()) {
      samples.addAll(vcfSamples);
      NO_SAMPLES_SPECIFIED = true;
    }

    // now, exclude any requested samples
    Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
    samples.removeAll(XLsamplesFromFile);
    samples.removeAll(XLsampleNames);

    if (samples.isEmpty() && !NO_SAMPLES_SPECIFIED) {
      throw new UserException(
          "All samples requested to be included were also requested to be excluded.");
    }

    for (String sample : samples) {
      logger.info("Including sample '" + sample + "'");
    }
  }

  /**
   * Fills {@code selectedTypes} with the user-requested variant types, or with every
   * {@code VariantContext.Type} when the user requested none.
   */
  private void selectVariantTypes() {
    if (TYPES_TO_INCLUDE.isEmpty()) {
      for (VariantContext.Type t : VariantContext.Type.values()) {
        selectedTypes.add(t);
      }
    } else {
      for (VariantContext.Type t : TYPES_TO_INCLUDE) {
        selectedTypes.add(t);
      }
    }
  }

  /**
   * Merges the input headers, adds the lines specific to this tool and writes the output header
   * for the selected samples.
   *
   * @param vcfRods the VCF headers of the input, keyed as returned by
   *     {@code VCFUtils.getVCFHeadersFromRods}
   */
  private void writeOutputHeader(Map<String, VCFHeader> vcfRods) {
    Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
    headerLines.add(new VCFHeaderLine("source", "SelectVariants"));

    if (KEEP_ORIGINAL_CHR_COUNTS) {
      // NOTE(review): these keys are declared as FORMAT header lines, but subsetRecord stores
      // AC_Orig/AF_Orig/AN_Orig as record-level attributes. Confirm whether they should be
      // declared as INFO header lines instead.
      headerLines.add(
          new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
      headerLines.add(
          new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
      headerLines.add(
          new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
    }
    vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
  }

  /** Generates one name per select expression and builds the JEXL matchers from them. */
  private void initializeSelectExpressions() {
    for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {
      // It's not necessary that the user supply select names for the JEXL expressions, since those
      // expressions will only be needed for omitting records.  Make up the select names here.
      selectNames.add(String.format("select-%d", i));
    }

    jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS);
  }

  /** Enables discordance/concordance filtering according to which tracks are bound. */
  private void initializeTrackComparisons() {
    DISCORDANCE_ONLY = discordanceTrack.isBound();
    if (DISCORDANCE_ONLY) {
      logger.info(
          "Selecting only variants discordant with the track: " + discordanceTrack.getName());
    }

    CONCORDANCE_ONLY = concordanceTrack.isBound();
    if (CONCORDANCE_ONLY) {
      logger.info(
          "Selecting only variants concordant with the track: " + concordanceTrack.getName());
    }
  }

  /**
   * Populates {@code mvSet} with the family definitions used for Mendelian-violation checking.
   *
   * <p>When Mendelian-violation selection is on and a family structure file is given, one entry
   * is read per line of that file and kept only if its child, dad and mom samples are all among
   * the selected samples; the violations output file is opened here if one was given. Without a
   * file, the single {@code FAMILY_STRUCTURE} definition is used. When Mendelian-violation
   * selection is off but a non-empty {@code FAMILY_STRUCTURE} is given, it is used and the
   * selection is switched on.
   *
   * @throws UserException if the family structure file cannot be read or the violations output
   *     file cannot be created
   */
  private void initializeMendelianViolations() {
    if (MENDELIAN_VIOLATIONS) {
      if (FAMILY_STRUCTURE_FILE != null) {
        try {
          // NOTE(review): this reader is never explicitly closed here; confirm whether
          // XReadLines releases the file on its own once iteration finishes.
          for (final String line : new XReadLines(FAMILY_STRUCTURE_FILE)) {
            MendelianViolation mv =
                new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
            if (samples.contains(mv.getSampleChild())
                && samples.contains(mv.getSampleDad())
                && samples.contains(mv.getSampleMom())) {
              mvSet.add(mv);
            }
          }
        } catch (FileNotFoundException e) {
          throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e);
        }
        if (outMVFile != null) {
          try {
            outMVFileStream = new PrintStream(outMVFile);
          } catch (FileNotFoundException e) {
            throw new UserException.CouldNotCreateOutputFile(
                outMVFile, "Can't open output file", e);
          }
        }
      } else {
        mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
      }
    } else if (FAMILY_STRUCTURE != null && !FAMILY_STRUCTURE.isEmpty()) {
      // a family structure alone implies the user wants Mendelian-violation selection; the null
      // check keeps an unset value from failing here when the feature was not requested
      mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
      MENDELIAN_VIOLATIONS = true;
    }
  }

  /** Enables random selection by fixed count and/or by fraction, based on the arguments. */
  private void initializeRandomSelection() {
    SELECT_RANDOM_NUMBER = numRandom > 0;
    if (SELECT_RANDOM_NUMBER) {
      logger.info("Selecting " + numRandom + " variants at random from the variant track");
      variantArray = new RandomVariantStructure[numRandom];
    }

    SELECT_RANDOM_FRACTION = fractionRandom > 0;
    if (SELECT_RANDOM_FRACTION) {
      logger.info(
          "Selecting approximately "
              + 100.0 * fractionRandom
              + "% of the variants at random from the variant track");
    }
  }