/**
   * Provides the next record from the underlying iterator after applying filter strings generated
   * by the set of filters in use by the iterator.
   */
  @Override
  public VariantContext next() {
    final VariantContext ctx = this.iterator.next();
    final Set<String> filterStrings = new HashSet<String>();

    // Collect variant level filters
    for (final VariantFilter filter : this.filters) {
      final String val = filter.filter(ctx);
      if (val != null) filterStrings.add(val);
    }

    // Collect genotype level filters in a Map of Sample -> List<filter string>
    final ListMap<String, String> gtFilterStrings = new ListMap<String, String>();
    final Set<String> variantSamples = new HashSet<String>();
    for (final Genotype gt : ctx.getGenotypes()) {
      if (gt.isCalled() && !gt.isHomRef()) variantSamples.add(gt.getSampleName());

      for (final GenotypeFilter filter : gtFilters) {
        final String filterString = filter.filter(ctx, gt);
        if (filterString != null) gtFilterStrings.add(gt.getSampleName(), filterString);
      }
    }

    // If all genotypes are filtered apply a site level filter
    if (gtFilterStrings.keySet().containsAll(variantSamples)) {
      filterStrings.add(ALL_GTS_FILTERED);
    }

    // Make a builder and set the site level filter appropriately
    final VariantContextBuilder builder = new VariantContextBuilder(ctx);
    if (filterStrings.isEmpty()) {
      builder.passFilters();
    } else {
      builder.filters(filterStrings);
    }

    // Apply filters to the necessary genotypes
    builder.noGenotypes();
    final List<Genotype> newGenotypes = new ArrayList<Genotype>(ctx.getNSamples());
    for (final Genotype gt : ctx.getGenotypes()) {
      final GenotypeBuilder gtBuilder = new GenotypeBuilder(gt);
      final List<String> filtersLocal = gtFilterStrings.get(gt.getSampleName());

      if (filtersLocal == null || filtersLocal.isEmpty()) {
        gtBuilder.filter(PASS_FILTER);
      } else {
        gtBuilder.filters(filtersLocal);
      }
      newGenotypes.add(gtBuilder.make());
    }
    builder.genotypes(newGenotypes);

    return builder.make();
  }
  @Override
  protected AFCalculationResult computeLog10PNonRef(
      final VariantContext vc,
      final int defaultPloidy,
      final double[] log10AlleleFrequencyPriors,
      final StateTracker stateTracker) {
    Utils.nonNull(vc, "vc is null");
    Utils.nonNull(log10AlleleFrequencyPriors, "log10AlleleFrequencyPriors is null");
    Utils.nonNull(stateTracker, "stateTracker is null");
    final int numAlternateAlleles = vc.getNAlleles() - 1;

    final List<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
    final int numSamples = genotypeLikelihoods.size() - 1;
    final int numChr = 2 * numSamples;

    // queue of AC conformations to process
    final Deque<ExactACset> ACqueue = new LinkedList<>();

    // mapping of ExactACset indexes to the objects
    final Map<ExactACcounts, ExactACset> indexesToACset = new HashMap<>(numChr + 1);

    // add AC=0 to the queue
    final int[] zeroCounts = new int[numAlternateAlleles];
    final ExactACset zeroSet = new ExactACset(numSamples + 1, new ExactACcounts(zeroCounts));
    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.getACcounts(), zeroSet);

    while (!ACqueue.isEmpty()) {

      // compute log10Likelihoods
      final ExactACset set = ACqueue.remove();

      calculateAlleleCountConformation(
          set,
          genotypeLikelihoods,
          numChr,
          ACqueue,
          indexesToACset,
          log10AlleleFrequencyPriors,
          stateTracker);

      // clean up memory
      indexesToACset.remove(set.getACcounts());
    }

    return getResultFromFinalState(vc, log10AlleleFrequencyPriors, stateTracker);
  }
예제 #3
0
  @Override
  public Map<String, Object> annotate(
      final RefMetaDataTracker tracker,
      final AnnotatorCompatible walker,
      final ReferenceContext ref,
      final Map<String, AlignmentContext> stratifiedContexts,
      final VariantContext vc,
      final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {

    final GenotypesContext genotypes = vc.getGenotypes();
    if (genotypes == null || genotypes.size() < MIN_SAMPLES) {
      if (!warningLogged) {
        logger.warn("Too few genotypes");
        warningLogged = true;
      }
      return null;
    }

    int refCount = 0;
    int hetCount = 0;
    int homCount = 0;
    for (final Genotype g : genotypes) {
      if (g.isNoCall()) continue;

      // TODO - fix me:
      // Right now we just ignore genotypes that are not confident, but this throws off
      //  our HW ratios.  More analysis is needed to determine the right thing to do when
      //  the genotyper cannot decide whether a given sample is het or hom var.
      if (g.getLog10PError() > MIN_LOG10_PERROR) continue;

      if (g.isHomRef()) refCount++;
      else if (g.isHet()) hetCount++;
      else homCount++;
    }

    if (refCount + hetCount + homCount == 0) return null;

    double pvalue = HardyWeinbergCalculation.hwCalculate(refCount, hetCount, homCount);
    // System.out.println(refCount + " " + hetCount + " " + homCount + " " + pvalue);
    Map<String, Object> map = new HashMap<>();
    map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue)));
    return map;
  }
  /**
   * Returns a list of attribute values from a VCF file
   *
   * @param vcfFile VCF file
   * @param attributeName attribute name
   * @throws IOException if the file does not exist or can not be opened
   * @return list of attribute values
   */
  private List<String> getAttributeValues(final File vcfFile, final String attributeName)
      throws IOException {
    final VCFCodec codec = new VCFCodec();
    final FileInputStream s = new FileInputStream(vcfFile);
    final LineIterator lineIteratorVCF =
        codec.makeSourceFromStream(new PositionalBufferedStream(s));
    codec.readHeader(lineIteratorVCF);

    List<String> attributeValues = new ArrayList<String>();
    while (lineIteratorVCF.hasNext()) {
      final String line = lineIteratorVCF.next();
      Assert.assertFalse(line == null);
      final VariantContext vc = codec.decode(line);

      for (final Genotype g : vc.getGenotypes()) {
        if (g.hasExtendedAttribute(attributeName)) {
          attributeValues.add((String) g.getExtendedAttribute(attributeName));
        }
      }
    }

    return attributeValues;
  }
  @Override
  public void accumulate(final VariantContext ctx) {
    logger.record(ctx.getContig(), ctx.getStart());

    final String variantChrom = ctx.getContig();
    final int variantPos = ctx.getStart();

    // Skip anything a little too funky
    if (ctx.isFiltered()) return;
    if (!ctx.isVariant()) return;
    if (SKIP_CHROMS.contains(variantChrom)) return;

    for (final MendelianViolationMetrics trio : trios) {
      final Genotype momGt = ctx.getGenotype(trio.MOTHER);
      final Genotype dadGt = ctx.getGenotype(trio.FATHER);
      final Genotype kidGt = ctx.getGenotype(trio.OFFSPRING);

      // if any genotype:
      // - has a non-snp allele; or
      // - lacks a reference allele
      //
      // then ignore this trio
      if (CollectionUtil.makeList(momGt, dadGt, kidGt)
          .stream()
          .anyMatch(
              gt ->
                  gt.isHetNonRef()
                      || Stream.concat(Stream.of(ctx.getReference()), gt.getAlleles().stream())
                          .anyMatch(a -> a.length() != 1 || a.isSymbolic()))) {
        continue;
      }

      // if between the trio there are more than 2 alleles including the reference, continue
      if (Stream.concat(
                  Collections.singleton(ctx.getReference()).stream(),
                  CollectionUtil.makeList(momGt, dadGt, kidGt)
                      .stream()
                      .flatMap(gt -> gt.getAlleles().stream()))
              .collect(Collectors.toSet())
              .size()
          > 2) continue;

      // Test to make sure:
      //   1) That the site is in fact variant in the trio
      //   2) that the offspring doesn't have a really wacky het allele balance
      if (!isVariant(momGt, dadGt, kidGt)) continue;
      if (kidGt.isHet()) {
        final int[] ad = kidGt.getAD();
        if (ad == null) continue;

        final List<Integer> adOfAlleles =
            kidGt
                .getAlleles()
                .stream()
                .map(a -> ad[ctx.getAlleleIndex(a)])
                .collect(Collectors.toList());
        final double minAlleleFraction =
            Math.min(adOfAlleles.get(0), adOfAlleles.get(1))
                / (double) (adOfAlleles.get(0) + adOfAlleles.get(1));
        if (minAlleleFraction < MIN_HET_FRACTION) continue;
      }

      ///////////////////////////////////////////////////////////////
      // Determine whether the offspring should be haploid at this
      // locus and which is the parental donor of the haploid genotype
      ///////////////////////////////////////////////////////////////
      boolean haploid = false;
      Genotype haploidParentalGenotype = null;

      if (FEMALE_CHROMS.contains(variantChrom) && trio.OFFSPRING_SEX != Sex.Unknown) {
        if (trio.OFFSPRING_SEX == Sex.Female) {
          // famale
          haploid = false;
        } else if (isInPseudoAutosomalRegion(variantChrom, variantPos)) {
          // male but in PAR on X, so diploid
          haploid = false;
        } else {
          // male, out of PAR on X, haploid
          haploid = true;
          haploidParentalGenotype = momGt;
        }
      }

      // the PAR on the male chromosome should be masked so that reads
      // align to the female chromosomes instead, so there's no point
      // of worrying about that here.

      if (MALE_CHROMS.contains(variantChrom)) {
        if (trio.OFFSPRING_SEX == Sex.Male) {
          haploid = true;
          haploidParentalGenotype = dadGt;
        } else {
          continue;
        }
      }

      // We only want to look at sites where we have high enough confidence that the genotypes we
      // are looking at are
      // interesting.  We want to ensure that parents are always GQ>=MIN_GQ, and that the kid is
      // either GQ>=MIN_GQ or in the
      // case where kid is het that the phred-scaled-likelihood of being reference is >=MIN_GQ.
      if (haploid
          && (haploidParentalGenotype.isNoCall() || haploidParentalGenotype.getGQ() < MIN_GQ))
        continue;
      if (!haploid
          && (momGt.isNoCall()
              || momGt.getGQ() < MIN_GQ
              || dadGt.isNoCall()
              || dadGt.getGQ() < MIN_GQ)) continue;
      if (kidGt.isNoCall()) continue;
      if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
        if (kidGt.getPL()[0] < MIN_GQ) continue;
      } else if (kidGt.getGQ() < MIN_GQ) continue;

      // Also filter on the DP for each of the samples - it's possible to miss hets when DP is too
      // low
      if (haploid && (kidGt.getDP() < MIN_DP || haploidParentalGenotype.getDP() < MIN_DP)) continue;
      if (!haploid && (kidGt.getDP() < MIN_DP || momGt.getDP() < MIN_DP || dadGt.getDP() < MIN_DP))
        continue;

      trio.NUM_VARIANT_SITES++;

      ///////////////////////////////////////////////////////////////
      // First test for haploid violations
      ///////////////////////////////////////////////////////////////
      MendelianViolation type = null;
      if (haploid) {
        if (kidGt.isHet()) continue; // Should not see heterozygous calls at haploid regions

        if (!haploidParentalGenotype.getAlleles().contains(kidGt.getAllele(0))) {
          if (kidGt.isHomRef()) {
            type = MendelianViolation.Haploid_Other;
            trio.NUM_HAPLOID_OTHER++;
          } else {
            type = MendelianViolation.Haploid_Denovo;
            trio.NUM_HAPLOID_DENOVO++;
          }
        }
      }
      ///////////////////////////////////////////////////////////////
      // Then test for diploid mendelian violations
      ///////////////////////////////////////////////////////////////
      else if (isMendelianViolation(momGt, dadGt, kidGt)) {
        if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
          trio.NUM_DIPLOID_DENOVO++;
          type = MendelianViolation.Diploid_Denovo;
        } else if (momGt.isHomVar() && dadGt.isHomVar() && kidGt.isHet()) {
          trio.NUM_HOMVAR_HOMVAR_HET++;
          type = MendelianViolation.HomVar_HomVar_Het;
        } else if (kidGt.isHom()
            && ((momGt.isHomRef() && dadGt.isHomVar()) || (momGt.isHomVar() && dadGt.isHomRef()))) {
          trio.NUM_HOMREF_HOMVAR_HOM++;
          type = MendelianViolation.HomRef_HomVar_Hom;
        } else if (kidGt.isHom()
            && ((momGt.isHom() && dadGt.isHet()) || (momGt.isHet() && dadGt.isHom()))) {
          trio.NUM_HOM_HET_HOM++;
          type = MendelianViolation.Hom_Het_Hom;
        } else {
          trio.NUM_OTHER++;
          type = MendelianViolation.Other;
        }
      }

      // Output a record into the family's violation VCF
      if (type != null) {
        // Create a new Context subsetted to the three samples
        final VariantContextBuilder builder = new VariantContextBuilder(ctx);
        builder.genotypes(
            ctx.getGenotypes()
                .subsetToSamples(CollectionUtil.makeSet(trio.MOTHER, trio.FATHER, trio.OFFSPRING)));
        builder.attribute(MENDELIAN_VIOLATION_KEY, type.name());

        // Copy over some useful attributes from the full context
        if (ctx.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
          builder.attribute(ORIGINAL_AC, ctx.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
        if (ctx.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY))
          builder.attribute(ORIGINAL_AF, ctx.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
        if (ctx.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY))
          builder.attribute(ORIGINAL_AN, ctx.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));

        // Write out the variant record
        familyToViolations.get(trio.FAMILY_ID).add(builder.make());
      }
    }
  }
예제 #6
0
  @Override
  protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);

    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);

    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
      refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    CloserUtil.close(walker);

    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(walker.getSequenceDictionary());
    final VariantContextWriter out =
        new VariantContextWriterBuilder()
            .setOption(Options.INDEX_ON_THE_FLY)
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(walker.getSequenceDictionary())
            .build();
    out.writeHeader(outHeader);

    final VariantContextWriter rejects =
        new VariantContextWriterBuilder()
            .setOutputFile(REJECT)
            .unsetOption(Options.INDEX_ON_THE_FLY)
            .build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);

    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");

    final SortingCollection<VariantContext> sorter =
        SortingCollection.newInstance(
            VariantContext.class,
            new VCFRecordCodec(outHeader),
            outHeader.getVCFRecordComparator(),
            MAX_RECORDS_IN_RAM,
            TMP_DIR);

    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");

    for (final VariantContext ctx : in) {
      ++total;
      final Interval source =
          new Interval(
              ctx.getContig(),
              ctx.getStart(),
              ctx.getEnd(),
              false,
              ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
      final Interval target = liftOver.liftOver(source, 1.0);

      if (target == null) {
        rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
        failedLiftover++;
      } else {
        // Fix the alleles if we went from positive to negative strand
        final List<Allele> alleles = new ArrayList<>();
        for (final Allele oldAllele : ctx.getAlleles()) {
          if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
            alleles.add(oldAllele);
          } else {
            alleles.add(
                Allele.create(
                    SequenceUtil.reverseComplement(oldAllele.getBaseString()),
                    oldAllele.isReference()));
          }
        }

        // Build the new variant context
        final VariantContextBuilder builder =
            new VariantContextBuilder(
                ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);

        builder.id(ctx.getID());
        builder.attributes(ctx.getAttributes());
        builder.genotypes(ctx.getGenotypes());
        builder.filters(ctx.getFilters());
        builder.log10PError(ctx.getLog10PError());

        // Check that the reference allele still agrees with the reference sequence
        boolean mismatchesReference = false;
        for (final Allele allele : builder.getAlleles()) {
          if (allele.isReference()) {
            final byte[] ref = refSeqs.get(target.getContig());
            final String refString =
                StringUtil.bytesToString(ref, target.getStart() - 1, target.length());

            if (!refString.equalsIgnoreCase(allele.getBaseString())) {
              mismatchesReference = true;
            }

            break;
          }
        }

        if (mismatchesReference) {
          rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
          failedAlleleCheck++;
        } else {
          sorter.add(builder.make());
        }
      }

      progress.record(ctx.getContig(), ctx.getStart());
    }

    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed ", total, " variants.");
    logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
    logger.info(
        Long.toString(failedAlleleCheck),
        " variants lifted over but had mismatching reference alleles after lift over.");
    logger.info(pct, " of variants were not successfully lifted over and written to the output.");

    rejects.close();
    in.close();

    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");

    for (final VariantContext ctx : sorter) {
      out.add(ctx);
      progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();

    return null;
  }