Пример #1
0
  private byte[] generateHaplotype(
      final List<VariantContext> sourceVCs, final ReferenceContext refContext) {

    final StringBuilder sb = new StringBuilder();

    final int startPos = refContext.getWindow().getStart();
    int currentPos = startPos;
    final byte[] reference = refContext.getBases();

    for (final VariantContext vc : sourceVCs) {
      // add any missing reference context
      int vcStart = vc.getStart();
      final int refAlleleLength = vc.getReference().length();
      if (refAlleleLength
          == vc.getEnd()
              - vc.getStart()) // this is a deletion (whereas for other events the padding base
        // isn't part of the position)
        vcStart++;

      while (currentPos < vcStart) sb.append((char) reference[currentPos++ - startPos]);

      // add the alt allele
      sb.append(vc.getAlternateAllele(0).getBaseString());

      // skip the reference allele
      currentPos += refAlleleLength;
    }
    // add any missing reference context
    final int stopPos = refContext.getWindow().getStop();
    while (currentPos < stopPos) sb.append((char) reference[currentPos++ - startPos]);

    return sb.toString().getBytes();
  }
Пример #2
0
  @Override
  protected void doWork(String inputSource, VcfIterator r, VariantContextWriter w)
      throws IOException {
    VCFHeader header = r.getHeader();

    VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
    h2.addMetaDataLine(
        new VCFInfoHeaderLine(
            TAG,
            VCFHeaderLineCount.UNBOUNDED,
            VCFHeaderLineType.String,
            "metadata added from " + TABIX + " . Format was " + FORMAT));
    h2.addMetaDataLine(
        new VCFHeaderLine(
            getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
    h2.addMetaDataLine(
        new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
    h2.addMetaDataLine(
        new VCFHeaderLine(
            getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
    h2.addMetaDataLine(
        new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));

    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
    w.writeHeader(h2);
    while (r.hasNext()) {
      VariantContext ctx = progress.watch(r.next());
      Set<String> annotations = new HashSet<String>();

      CloseableIterator<BedLine> iter =
          this.bedReader.iterator(ctx.getContig(), ctx.getStart() - 1, ctx.getEnd() + 1);
      while (iter.hasNext()) {
        BedLine bedLine = iter.next();

        if (!ctx.getContig().equals(bedLine.getContig())) continue;
        if (ctx.getStart() - 1 >= bedLine.getEnd()) continue;
        if (ctx.getEnd() - 1 < bedLine.getStart()) continue;

        String newannot = this.parsedFormat.toString(bedLine);
        if (!newannot.isEmpty()) annotations.add(VCFUtils.escapeInfoField(newannot));
      }
      CloserUtil.close(iter);

      if (annotations.isEmpty()) {
        w.add(ctx);
        continue;
      }
      VariantContextBuilder vcb = new VariantContextBuilder(ctx);
      vcb.attribute(TAG, annotations.toArray());
      w.add(vcb.make());
      incrVariantCount();
      if (checkOutputError()) break;
    }
    progress.finish();
  }
Пример #3
0
  private List<SnpEffEffect> parseSnpEffRecord(VariantContext snpEffRecord) {
    List<SnpEffEffect> parsedEffects = new ArrayList<SnpEffEffect>();

    Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY);
    if (effectFieldValue == null) {
      return parsedEffects;
    }

    // The VCF codec stores multi-valued fields as a List<String>, and single-valued fields as a
    // String.
    // We can have either in the case of SnpEff, since there may be one or more than one effect in
    // this record.
    List<String> individualEffects;
    if (effectFieldValue instanceof List) {
      individualEffects = (List<String>) effectFieldValue;
    } else {
      individualEffects = Arrays.asList((String) effectFieldValue);
    }

    for (String effectString : individualEffects) {
      String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER);

      if (effectNameAndMetadata.length != 2) {
        logger.warn(
            String.format(
                "Malformed SnpEff effect field at %s:%d, skipping: %s",
                snpEffRecord.getChr(), snpEffRecord.getStart(), effectString));
        continue;
      }

      String effectName = effectNameAndMetadata[0];
      String[] effectMetadata =
          effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1);

      SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata);

      if (parsedEffect.isWellFormed()) {
        parsedEffects.add(parsedEffect);
      } else {
        logger.warn(
            String.format(
                "Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"",
                snpEffRecord.getChr(),
                snpEffRecord.getStart(),
                parsedEffect.getParseError(),
                effectString));
      }
    }

    return parsedEffects;
  }
  protected void noteCurrentRecord(VariantContext vc) {
    super.noteCurrentRecord(vc); // first, check for errors

    // then, update mostUpstreamWritableLoc:
    int mostUpstreamWritableIndex = vc.getStart() - maxCachingStartDistance;
    this.mostUpstreamWritableLoc = Math.max(BEFORE_MOST_UPSTREAM_LOC, mostUpstreamWritableIndex);
  }
Пример #5
0
 private void addVariant(final VariantContext ctx) {
   if (!ctx.getChr().equals(genes.get(0).getChromosome())) return;
   if (ctx.getStart() >= chromEnd) return;
   if (ctx.getStart() < chromStart) return;
   positions.add(ctx.getStart());
   for (String sample : ctx.getSampleNames()) {
     Genotype g = ctx.getGenotype(sample);
     if (!g.isAvailable()) continue;
     if (!g.isCalled()) continue;
     if (g.isNoCall()) continue;
     if (g.isNonInformative()) continue;
     Set<Integer> set = sample2positions.get(sample);
     if (set == null) {
       set = new HashSet<Integer>();
       sample2positions.put(sample, set);
     }
     set.add(ctx.getStart());
   }
 }
Пример #6
0
  private Collection<VariantContext> getVariantContexts(
      RefMetaDataTracker tracker, ReferenceContext ref) {

    List<Feature> features = tracker.getValues(variants, ref.getLocus());
    List<VariantContext> VCs = new ArrayList<VariantContext>(features.size());

    for (Feature record : features) {
      if (VariantContextAdaptors.canBeConvertedToVariantContext(record)) {
        // we need to special case the HapMap format because indels aren't handled correctly
        if (record instanceof RawHapMapFeature) {

          // is it an indel?
          RawHapMapFeature hapmap = (RawHapMapFeature) record;
          if (hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING)
              || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING)) {
            // get the dbsnp object corresponding to this record (needed to help us distinguish
            // between insertions and deletions)
            VariantContext dbsnpVC = getDbsnp(hapmap.getName());
            if (dbsnpVC == null || dbsnpVC.isMixed()) continue;

            Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
            alleleMap.put(
                RawHapMapFeature.DELETION,
                Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion()));
            alleleMap.put(
                RawHapMapFeature.INSERTION,
                Allele.create(
                    (char) ref.getBase() + ((RawHapMapFeature) record).getAlleles()[1],
                    !dbsnpVC.isSimpleInsertion()));
            hapmap.setActualAlleles(alleleMap);

            // also, use the correct positioning for insertions
            hapmap.updatePosition(dbsnpVC.getStart());

            if (hapmap.getStart() < ref.getWindow().getStart()) {
              logger.warn(
                  "Hapmap record at "
                      + ref.getLocus()
                      + " represents an indel too large to be converted; skipping...");
              continue;
            }
          }
        }

        // ok, we might actually be able to turn this record in a variant context
        VariantContext vc =
            VariantContextAdaptors.toVariantContext(variants.getName(), record, ref);

        if (vc != null) // sometimes the track has odd stuff in it that can't be converted
        VCs.add(vc);
      }
    }

    return VCs;
  }
Пример #7
0
  private void run(VcfIterator in) {
    for (; ; ) {
      VariantContext ctx = null;
      if (in.hasNext()) ctx = in.next();

      if (ctx == null
          || this.genes.isEmpty()
          || (!this.genes.isEmpty() && !this.genes.get(0).getChr().equals(ctx.getChr()))
          || (!this.genes.isEmpty() && this.chromEnd <= ctx.getStart())) {
        this.print();
        if (System.out.checkError()) return;
        if (ctx == null) return;
        this.clear();

        if (chrom2knownGenes.containsKey(ctx.getChr())) {
          for (KnownGene g : chrom2knownGenes.get(ctx.getChr())) {
            if (this.genes.isEmpty()) {
              if (g.getTxEnd() <= ctx.getStart() || g.getTxStart() > ctx.getEnd()) {
                continue;
              }
              this.addGene(g);
            } else {
              if (!(g.getTxStart() > this.chromEnd || g.getTxEnd() <= this.chromStart)) {
                this.addGene(g);
              }
            }
          }
          if (genes.isEmpty()) {
            debug("no gene for " + ctx.getChr() + ":" + ctx.getStart());
          }
        } else {
          debug("not any gene for " + ctx.getChr());
        }
      }

      if (!genes.isEmpty()
          && ctx.getStart() - 1 >= this.chromStart
          && ctx.getStart() <= this.chromEnd) {
        this.addVariant(ctx);
      }
    }
  }
  @Override
  public void accumulate(final VariantContext ctx) {
    logger.record(ctx.getContig(), ctx.getStart());

    final String variantChrom = ctx.getContig();
    final int variantPos = ctx.getStart();

    // Skip anything a little too funky
    if (ctx.isFiltered()) return;
    if (!ctx.isVariant()) return;
    if (SKIP_CHROMS.contains(variantChrom)) return;

    for (final MendelianViolationMetrics trio : trios) {
      final Genotype momGt = ctx.getGenotype(trio.MOTHER);
      final Genotype dadGt = ctx.getGenotype(trio.FATHER);
      final Genotype kidGt = ctx.getGenotype(trio.OFFSPRING);

      // if any genotype:
      // - has a non-snp allele; or
      // - lacks a reference allele
      //
      // then ignore this trio
      if (CollectionUtil.makeList(momGt, dadGt, kidGt)
          .stream()
          .anyMatch(
              gt ->
                  gt.isHetNonRef()
                      || Stream.concat(Stream.of(ctx.getReference()), gt.getAlleles().stream())
                          .anyMatch(a -> a.length() != 1 || a.isSymbolic()))) {
        continue;
      }

      // if between the trio there are more than 2 alleles including the reference, continue
      if (Stream.concat(
                  Collections.singleton(ctx.getReference()).stream(),
                  CollectionUtil.makeList(momGt, dadGt, kidGt)
                      .stream()
                      .flatMap(gt -> gt.getAlleles().stream()))
              .collect(Collectors.toSet())
              .size()
          > 2) continue;

      // Test to make sure:
      //   1) That the site is in fact variant in the trio
      //   2) that the offspring doesn't have a really wacky het allele balance
      if (!isVariant(momGt, dadGt, kidGt)) continue;
      if (kidGt.isHet()) {
        final int[] ad = kidGt.getAD();
        if (ad == null) continue;

        final List<Integer> adOfAlleles =
            kidGt
                .getAlleles()
                .stream()
                .map(a -> ad[ctx.getAlleleIndex(a)])
                .collect(Collectors.toList());
        final double minAlleleFraction =
            Math.min(adOfAlleles.get(0), adOfAlleles.get(1))
                / (double) (adOfAlleles.get(0) + adOfAlleles.get(1));
        if (minAlleleFraction < MIN_HET_FRACTION) continue;
      }

      ///////////////////////////////////////////////////////////////
      // Determine whether the offspring should be haploid at this
      // locus and which is the parental donor of the haploid genotype
      ///////////////////////////////////////////////////////////////
      boolean haploid = false;
      Genotype haploidParentalGenotype = null;

      if (FEMALE_CHROMS.contains(variantChrom) && trio.OFFSPRING_SEX != Sex.Unknown) {
        if (trio.OFFSPRING_SEX == Sex.Female) {
          // famale
          haploid = false;
        } else if (isInPseudoAutosomalRegion(variantChrom, variantPos)) {
          // male but in PAR on X, so diploid
          haploid = false;
        } else {
          // male, out of PAR on X, haploid
          haploid = true;
          haploidParentalGenotype = momGt;
        }
      }

      // the PAR on the male chromosome should be masked so that reads
      // align to the female chromosomes instead, so there's no point
      // of worrying about that here.

      if (MALE_CHROMS.contains(variantChrom)) {
        if (trio.OFFSPRING_SEX == Sex.Male) {
          haploid = true;
          haploidParentalGenotype = dadGt;
        } else {
          continue;
        }
      }

      // We only want to look at sites where we have high enough confidence that the genotypes we
      // are looking at are
      // interesting.  We want to ensure that parents are always GQ>=MIN_GQ, and that the kid is
      // either GQ>=MIN_GQ or in the
      // case where kid is het that the phred-scaled-likelihood of being reference is >=MIN_GQ.
      if (haploid
          && (haploidParentalGenotype.isNoCall() || haploidParentalGenotype.getGQ() < MIN_GQ))
        continue;
      if (!haploid
          && (momGt.isNoCall()
              || momGt.getGQ() < MIN_GQ
              || dadGt.isNoCall()
              || dadGt.getGQ() < MIN_GQ)) continue;
      if (kidGt.isNoCall()) continue;
      if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
        if (kidGt.getPL()[0] < MIN_GQ) continue;
      } else if (kidGt.getGQ() < MIN_GQ) continue;

      // Also filter on the DP for each of the samples - it's possible to miss hets when DP is too
      // low
      if (haploid && (kidGt.getDP() < MIN_DP || haploidParentalGenotype.getDP() < MIN_DP)) continue;
      if (!haploid && (kidGt.getDP() < MIN_DP || momGt.getDP() < MIN_DP || dadGt.getDP() < MIN_DP))
        continue;

      trio.NUM_VARIANT_SITES++;

      ///////////////////////////////////////////////////////////////
      // First test for haploid violations
      ///////////////////////////////////////////////////////////////
      MendelianViolation type = null;
      if (haploid) {
        if (kidGt.isHet()) continue; // Should not see heterozygous calls at haploid regions

        if (!haploidParentalGenotype.getAlleles().contains(kidGt.getAllele(0))) {
          if (kidGt.isHomRef()) {
            type = MendelianViolation.Haploid_Other;
            trio.NUM_HAPLOID_OTHER++;
          } else {
            type = MendelianViolation.Haploid_Denovo;
            trio.NUM_HAPLOID_DENOVO++;
          }
        }
      }
      ///////////////////////////////////////////////////////////////
      // Then test for diploid mendelian violations
      ///////////////////////////////////////////////////////////////
      else if (isMendelianViolation(momGt, dadGt, kidGt)) {
        if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) {
          trio.NUM_DIPLOID_DENOVO++;
          type = MendelianViolation.Diploid_Denovo;
        } else if (momGt.isHomVar() && dadGt.isHomVar() && kidGt.isHet()) {
          trio.NUM_HOMVAR_HOMVAR_HET++;
          type = MendelianViolation.HomVar_HomVar_Het;
        } else if (kidGt.isHom()
            && ((momGt.isHomRef() && dadGt.isHomVar()) || (momGt.isHomVar() && dadGt.isHomRef()))) {
          trio.NUM_HOMREF_HOMVAR_HOM++;
          type = MendelianViolation.HomRef_HomVar_Hom;
        } else if (kidGt.isHom()
            && ((momGt.isHom() && dadGt.isHet()) || (momGt.isHet() && dadGt.isHom()))) {
          trio.NUM_HOM_HET_HOM++;
          type = MendelianViolation.Hom_Het_Hom;
        } else {
          trio.NUM_OTHER++;
          type = MendelianViolation.Other;
        }
      }

      // Output a record into the family's violation VCF
      if (type != null) {
        // Create a new Context subsetted to the three samples
        final VariantContextBuilder builder = new VariantContextBuilder(ctx);
        builder.genotypes(
            ctx.getGenotypes()
                .subsetToSamples(CollectionUtil.makeSet(trio.MOTHER, trio.FATHER, trio.OFFSPRING)));
        builder.attribute(MENDELIAN_VIOLATION_KEY, type.name());

        // Copy over some useful attributes from the full context
        if (ctx.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
          builder.attribute(ORIGINAL_AC, ctx.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
        if (ctx.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY))
          builder.attribute(ORIGINAL_AF, ctx.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
        if (ctx.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY))
          builder.attribute(ORIGINAL_AN, ctx.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));

        // Write out the variant record
        familyToViolations.get(trio.FAMILY_ID).add(builder.make());
      }
    }
  }
Пример #9
0
  @Override
  protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);

    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);

    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
      refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    CloserUtil.close(walker);

    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(walker.getSequenceDictionary());
    final VariantContextWriter out =
        new VariantContextWriterBuilder()
            .setOption(Options.INDEX_ON_THE_FLY)
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(walker.getSequenceDictionary())
            .build();
    out.writeHeader(outHeader);

    final VariantContextWriter rejects =
        new VariantContextWriterBuilder()
            .setOutputFile(REJECT)
            .unsetOption(Options.INDEX_ON_THE_FLY)
            .build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);

    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");

    final SortingCollection<VariantContext> sorter =
        SortingCollection.newInstance(
            VariantContext.class,
            new VCFRecordCodec(outHeader),
            outHeader.getVCFRecordComparator(),
            MAX_RECORDS_IN_RAM,
            TMP_DIR);

    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");

    for (final VariantContext ctx : in) {
      ++total;
      final Interval source =
          new Interval(
              ctx.getContig(),
              ctx.getStart(),
              ctx.getEnd(),
              false,
              ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
      final Interval target = liftOver.liftOver(source, 1.0);

      if (target == null) {
        rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
        failedLiftover++;
      } else {
        // Fix the alleles if we went from positive to negative strand
        final List<Allele> alleles = new ArrayList<>();
        for (final Allele oldAllele : ctx.getAlleles()) {
          if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
            alleles.add(oldAllele);
          } else {
            alleles.add(
                Allele.create(
                    SequenceUtil.reverseComplement(oldAllele.getBaseString()),
                    oldAllele.isReference()));
          }
        }

        // Build the new variant context
        final VariantContextBuilder builder =
            new VariantContextBuilder(
                ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);

        builder.id(ctx.getID());
        builder.attributes(ctx.getAttributes());
        builder.genotypes(ctx.getGenotypes());
        builder.filters(ctx.getFilters());
        builder.log10PError(ctx.getLog10PError());

        // Check that the reference allele still agrees with the reference sequence
        boolean mismatchesReference = false;
        for (final Allele allele : builder.getAlleles()) {
          if (allele.isReference()) {
            final byte[] ref = refSeqs.get(target.getContig());
            final String refString =
                StringUtil.bytesToString(ref, target.getStart() - 1, target.length());

            if (!refString.equalsIgnoreCase(allele.getBaseString())) {
              mismatchesReference = true;
            }

            break;
          }
        }

        if (mismatchesReference) {
          rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
          failedAlleleCheck++;
        } else {
          sorter.add(builder.make());
        }
      }

      progress.record(ctx.getContig(), ctx.getStart());
    }

    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed ", total, " variants.");
    logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
    logger.info(
        Long.toString(failedAlleleCheck),
        " variants lifted over but had mismatching reference alleles after lift over.");
    logger.info(pct, " of variants were not successfully lifted over and written to the output.");

    rejects.close();
    in.close();

    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");

    for (final VariantContext ctx : sorter) {
      out.add(ctx);
      progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();

    return null;
  }
  @Override
  public void runCommand() {

    logger.info("MergeVCFColumnsCommand");
    /*
     * Assumptions
     * (1) Only two vcfs that are sorted with the same contig order
     * (2) if contigs on it same order, then we will just skip that contig
     * (3) No overlapping samples allowed
     *
     * Output:
     * A vcf where intersecting sites are merged together and will only return biallelic markers
     * the info field will be cleared
     * the only GT FORMAT field will be there
     */

    Collection<File> vcfs = applicationOptions.getVcfs();
    String outfile = applicationOptions.getOutFile();

    if (vcfs.size() != 2) {
      throw new IllegalArgumentException("This function requires exactly two vcfs");
    }

    Iterator<File> vcfFileIter = vcfs.iterator();

    File vcf1 = vcfFileIter.next();
    File vcf2 = vcfFileIter.next();
    VCFFileReader reader1 = new VCFFileReader(vcf1, false);
    VCFFileReader reader2 = new VCFFileReader(vcf2, false);

    Iterator<VariantContext> iter1 = reader1.iterator();
    Iterator<VariantContext> iter2 = reader2.iterator();

    VariantContextComparator comparator = new VariantContextComparator();

    /*
     * Merge headers
     */
    VCFHeader header1 = reader1.getFileHeader();

    VCFHeader header2 = reader2.getFileHeader();

    List<String> samples1 = header1.getGenotypeSamples();
    List<String> samples2 = header2.getGenotypeSamples();

    List<String> mergedSamples = new ArrayList<>(samples1.size() + samples2.size());
    mergedSamples.addAll(samples1);
    mergedSamples.addAll(samples2);

    // Validate that there are no duplicates
    HashSet<String> sampleSet = new HashSet<String>();
    for (String id : mergedSamples) {
      if (sampleSet.contains(id)) {
        throw new IllegalArgumentException("Duplicate id found: " + id);
      } else {
        sampleSet.add(id);
      }
    }

    HashSet<VCFHeaderLine> meta = new HashSet<>();
    meta.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "GT"));
    meta.addAll(header1.getContigLines());
    VCFHeader mergedHeader = new VCFHeader(meta, mergedSamples);

    /*
     * Create encoder
     */
    VCFEncoder encoder = new VCFEncoder(mergedHeader, false, false);

    BufferedWriter writer = null;
    try {
      if (outfile.endsWith(".gz")) {
        BlockCompressedOutputStream outstream = new BlockCompressedOutputStream(new File(outfile));
        writer = new BufferedWriter(new OutputStreamWriter(outstream));
      } else {
        writer =
            Files.newBufferedWriter(
                Paths.get(outfile),
                Charset.defaultCharset(),
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING);
      }

      /*
       * Write header
       */
      VCFHeaderWriter.writeHeader(writer, mergedHeader);
      logger.info("Wrote header");

      VariantContext previous1 = null;
      VariantContext previous2 = null;

      int count = 0;

      int countFile1 = 0;
      int countFile2 = 0;

      boolean usePrevious1 = false;
      boolean usePrevious2 = false;
      while (iter1.hasNext() || iter2.hasNext()) {

        if ((iter1.hasNext() || usePrevious1) && (iter2.hasNext() || usePrevious2)) {

          VariantContext variant1 = null;

          VariantContext variant2 = null;

          //					if(usePrevious1 == true && usePrevious2 == true &&
          // comparator.compare(previous1,previous2) != 0) {
          //						//then skip both
          //						usePrevious1 = false;
          //						usePrevious2 = false;
          //					}

          if (usePrevious1) {
            variant1 = previous1;
          } else {
            variant1 = iter1.next();
            countFile1++;
          }

          if (usePrevious2) {
            variant2 = previous2;
          } else {
            variant2 = iter2.next();
            countFile2++;
          }

          // check that variants are ordered correctly
          if (previous1 != null
              && previous1 != variant1
              && comparator.compare(previous1, variant1) > 0) {
            throw new IllegalStateException(
                previous1.getContig()
                    + ":"
                    + previous1.getStart()
                    + " > "
                    + variant1.getContig()
                    + ":"
                    + variant1.getStart());
          }

          if (previous2 != null
              && previous2 != variant2
              && comparator.compare(previous2, variant2) > 0) {
            throw new IllegalStateException(
                previous2.getContig()
                    + ":"
                    + previous2.getStart()
                    + " > "
                    + variant2.getContig()
                    + ":"
                    + variant2.getStart());
          }

          int cmp = comparator.compare(variant1, variant2);

          if (cmp < 0) {
            // logger.info("Skipping VCF1: " + variant1.getContig() + ":" + variant1.getStart() +
            // "\t" + variant1.getReference().toString() +  "\t" + variant1.getAlternateAlleles());

            if (usePrevious1 == true && usePrevious2 == true) {
              // variant1 < variant2
              // we need to go to next variant in vcf1
              usePrevious1 = false;
            }

            usePrevious2 = true;
          } else if (cmp > 0) {

            if (usePrevious1 == true && usePrevious2 == true) {
              // variant1 > variant2
              // we need to go to next variant in vcf2
              usePrevious2 = false;
            }

            usePrevious1 = true;
            // logger.info("Skipping VCF2: " + variant2.getContig() + ":" + variant2.getStart() +
            // "\t" + variant2.getReference().toString() +  "\t" + variant2.getAlternateAlleles());
          } else {
            // they equal position
            usePrevious1 = false;
            usePrevious2 = false;

            if (variant1.isBiallelic()
                && variant2.isBiallelic()
                && variant1.getReference().equals(variant2.getReference())
                && variant1.getAlternateAllele(0).equals(variant2.getAlternateAllele(0))) {

              // TODO: Finish merging
              // both variants are bialleleic and the reference and alternative alleles match

              count++;
              if (count % 10000 == 0) {
                logger.info(count + " mergeable variants found");
              }

              VariantContext merged = VariantContextMerger.merge(variant1, variant2);

              writer.write(encoder.encode(merged));
              writer.write("\n");

            } else {
              // skip if they do not equal

              //			logger.info("Skipping: " + variant1.getContig() + ":" + variant1.getStart() +
              // "\t" + variant1.getReference().toString() +  "\t" +
              // variant1.getAlternateAlleles());
              //			logger.info("Skipping: " + variant2.getContig() + ":" + variant2.getStart() +
              // "\t" + variant2.getReference().toString() +  "\t" +
              // variant2.getAlternateAlleles());
            }
          }

          previous1 = variant1;
          previous2 = variant2;
        } else if (iter1.hasNext()) {
          // just skip remaining variants
          VariantContext current = iter1.next();
          countFile1++;

          if (previous1 != null && current != null && comparator.compare(previous1, current) > 0) {
            throw new IllegalStateException(
                previous1.getContig()
                    + ":"
                    + previous1.getStart()
                    + " > "
                    + current.getContig()
                    + ":"
                    + current.getStart());
          }

          previous1 = current;

          // logger.info("Skipping: " + previous1.getContig() + ":" + previous1.getStart() + "\t" +
          // previous1.getReference().toString() +  "\t" + previous1.getAlternateAlleles());

        } else if (iter2.hasNext()) {
          // just skip remaining variants
          // fixed bug/ was iter1 changed to iter2
          VariantContext current = iter2.next();
          countFile2++;

          if (previous2 != null && current != null && comparator.compare(previous2, current) > 0) {
            throw new IllegalStateException(
                previous2.getContig()
                    + ":"
                    + previous2.getStart()
                    + " > "
                    + current.getContig()
                    + ":"
                    + current.getStart());
          }

          previous2 = current;

          // logger.info("Skipping: " + previous2.getContig() + ":" + previous2.getStart() + "\t" +
          // previous2.getReference().toString() +  "\t" + previous2.getAlternateAlleles());
        } else {
          throw new IllegalStateException("Error should not of reached this point");
        }
      }

      reader1.close();
      reader2.close();

      logger.info(count + "  merged variants");
      logger.info(countFile1 + "  variants in " + vcf1.getAbsolutePath());
      logger.info(countFile2 + "  variants in " + vcf2.getAbsolutePath());

    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (writer != null) {
        try {
          logger.info("Flushing writer");
          writer.close();
        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    logger.info("finished merging vcfs");
  }
Пример #11
0
  @Override
  protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException {
    AbstractVCFCodec codeIn3 = VCFUtils.createDefaultVCFCodec();
    String line;

    StringWriter sw = new StringWriter();
    LOG.info("opening tabix file: " + this.TABIX);
    TabixReader tabix = new TabixReader(this.TABIX);

    while ((line = tabix.readLine()) != null) {
      if (!line.startsWith(VCFHeader.HEADER_INDICATOR)) {
        break;
      }
      sw.append(line).append("\n");
    }
    VCFHeader header3 =
        (VCFHeader)
            codeIn3.readActualHeader(
                new LineIteratorImpl(
                    LineReaderUtil.fromBufferedStream(
                        new ByteArrayInputStream(sw.toString().getBytes()))));
    VCFHeader header1 = r.getHeader();

    VCFHeader h2 =
        new VCFHeader(header1.getMetaDataInInputOrder(), header1.getSampleNamesInOrder());
    for (String infoId : this.INFO_IDS) {
      VCFInfoHeaderLine vihl = header3.getInfoHeaderLine(infoId);
      if (vihl == null) {
        LOG.warn("Not INFO=" + infoId + " in " + TABIX);
        continue;
      }
      if (h2.getInfoHeaderLine(infoId) != null) {
        LOG.warn("Input already contains INFO=" + vihl);
      }
      h2.addMetaDataLine(vihl);
    }

    if (ALT_CONFLICT_FLAG != null) {
      h2.addMetaDataLine(
          new VCFInfoHeaderLine(
              ALT_CONFLICT_FLAG,
              1,
              VCFHeaderLineType.Flag,
              "conflict ALT allele with " + this.TABIX));
    }

    w.writeHeader(h2);
    while (r.hasNext()) {
      VariantContext ctx1 = r.next();

      VariantContextBuilder vcb = new VariantContextBuilder(ctx1);
      String line2;
      String BEST_ID = null;
      boolean best_id_match_alt = false;

      List<VariantContext> variantsList = new ArrayList<VariantContext>();

      int[] array = tabix.parseReg(ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd()));
      TabixReader.Iterator iter = null;

      if (array != null && array.length == 3 && array[0] != -1 && array[1] >= 0 && array[2] >= 0) {
        iter = tabix.query(array[0], array[1], array[2]);
      } else {
        LOG.info("Cannot get " + ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd()));
      }

      while (iter != null && (line2 = iter.next()) != null) {
        VariantContext ctx3 = codeIn3.decode(line2);
        if (ctx3.getStart() != ctx1.getStart()) continue;
        if (ctx3.getEnd() != ctx1.getEnd()) continue;

        if (ctx1.getReference().equals(ctx3.getReference())
            && ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
          variantsList.clear();
          variantsList.add(ctx3);
          break;
        } else {
          variantsList.add(ctx3);
        }
      }

      for (VariantContext ctx3 : variantsList) {

        if (this.REF_ALLELE_MATTERS && !ctx1.getReference().equals(ctx3.getReference())) {
          continue;
        }
        if (this.ALT_ALLELES_MATTERS
            && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
          continue;
        }

        if (ctx3.getID() != null && this.REPLACE_ID) {
          if (BEST_ID != null && best_id_match_alt) {
            // nothing
          } else {
            BEST_ID = ctx3.getID();
            best_id_match_alt = ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles());
          }
        }

        for (String id : this.INFO_IDS) {
          Object info3 = ctx3.getAttribute(id);
          if (info3 == null) {
            continue;
          }
          Object info1 = ctx1.getAttribute(id);
          if (info1 != null && !this.REPLACE_INFO_FIELD) {
            continue;
          }

          vcb.attribute(id, info3);
        }

        if (ALT_CONFLICT_FLAG != null
            && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
          vcb.attribute(ALT_CONFLICT_FLAG, true);
        }
      }
      if (BEST_ID != null) {
        vcb.id(BEST_ID);
      }
      w.add(vcb.make());
    }
    tabix.close();
  }
  public CountedData map(
      RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {

    final CountedData counter = new CountedData();

    // For some reason RodWalkers get map calls with null trackers
    if (tracker == null) return counter;

    VariantContext vcComp = tracker.getFirstValue(alleles);
    if (vcComp == null) return counter;

    // todo - not sure I want this, may be misleading to filter extended indel events.
    if (isInsideExtendedIndel(vcComp, ref)) return counter;

    // Do not operate on variants that are not covered to the optional minimum depth
    if (!context.hasReads()
        || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
      counter.nUncovered = 1L;
      final GVstatus status = getGVstatus(vcComp);
      if (status == GVstatus.T) counter.nAltNotCalled = 1L;
      else if (status == GVstatus.F) counter.nRefNotCalled = 1L;
      else counter.nNoStatusNotCalled = 1L;

      return counter;
    }

    VariantCallContext call;
    if (vcComp.isSNP()) {
      call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context).get(0);
    } else if (vcComp.isIndel()) {
      call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context).get(0);
    } else if (bamIsTruth) {
      // assume it's a SNP if no variation is present; this is necessary so that we can test
      // supposed monomorphic sites against the truth bam
      call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context).get(0);
    } else {
      logger.info(
          "Not SNP or INDEL "
              + vcComp.getChr()
              + ":"
              + vcComp.getStart()
              + " "
              + vcComp.getAlleles());
      return counter;
    }

    boolean writeVariant = true;

    if (bamIsTruth) {
      if (call.confidentlyCalled) {
        // If truth is a confident REF call
        if (call.isVariant()) {
          if (vcComp.isVariant()) counter.nAltCalledAlt = 1L;
          else {
            counter.nAltCalledRef = 1L;
            if (printInterestingSites)
              System.out.println("Truth=ALT Call=REF at " + call.getChr() + ":" + call.getStart());
          }
        }
        // If truth is a confident ALT call
        else {
          if (vcComp.isVariant()) {
            counter.nRefCalledAlt = 1L;
            if (printInterestingSites)
              System.out.println("Truth=REF Call=ALT at " + call.getChr() + ":" + call.getStart());
          } else counter.nRefCalledRef = 1L;
        }
      } else {
        counter.nNotConfidentCalls = 1L;
        if (printInterestingSites)
          System.out.println("Truth is not confident at " + call.getChr() + ":" + call.getStart());
        writeVariant = false;
      }
    } else {
      //            if (!vcComp.hasExtendedAttribute("GV"))
      //                throw new UserException.BadInput("Variant has no GV annotation in the INFO
      // field. " + vcComp.getChr() + ":" + vcComp.getStart());

      final GVstatus status = getGVstatus(vcComp);
      if (call.isCalledAlt(callConf)) {
        if (status == GVstatus.T) counter.nAltCalledAlt = 1L;
        else if (status == GVstatus.F) {
          counter.nRefCalledAlt = 1L;
          if (printInterestingSites)
            System.out.println("Truth=REF Call=ALT at " + call.getChr() + ":" + call.getStart());
        } else counter.nNoStatusCalledAlt = 1L;
      } else if (call.isCalledRef(callConf)) {
        if (status == GVstatus.T) {
          counter.nAltCalledRef = 1L;
          if (printInterestingSites)
            System.out.println("Truth=ALT Call=REF at " + call.getChr() + ":" + call.getStart());
        } else if (status == GVstatus.F) counter.nRefCalledRef = 1L;
        else counter.nNoStatusCalledRef = 1L;
      } else {
        counter.nNotConfidentCalls = 1L;
        if (status == GVstatus.T) counter.nAltNotCalled = 1L;
        else if (status == GVstatus.F) counter.nRefNotCalled = 1L;
        else counter.nNoStatusNotCalled = 1L;

        if (printInterestingSites)
          System.out.println("Truth is not confident at " + call.getChr() + ":" + call.getStart());
        writeVariant = false;
      }
    }

    if (vcfWriter != null && writeVariant) {
      if (!vcComp.hasAttribute(GATKVCFConstants.GENOTYPE_AND_VALIDATE_STATUS_KEY)) {
        vcfWriter.add(
            new VariantContextBuilder(vcComp)
                .attribute(
                    GATKVCFConstants.GENOTYPE_AND_VALIDATE_STATUS_KEY,
                    call.isCalledAlt(callConf) ? "ALT" : "REF")
                .make());
      } else vcfWriter.add(vcComp);
    }
    return counter;
  }
Пример #13
0
  @Override
  protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException {
    long nChanged = 0L;
    final String TAG = "INDELFIXED";
    VCFHeader header = r.getHeader();

    VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
    h2.addMetaDataLine(
        new VCFInfoHeaderLine(TAG, 1, VCFHeaderLineType.String, "Fix Indels for @SolenaLS."));

    w.writeHeader(h2);

    final Pattern dna = Pattern.compile("[ATGCatgc]+");
    while (r.hasNext()) {
      VariantContext ctx = r.next();
      VariantContextBuilder b = new VariantContextBuilder(ctx);
      List<Allele> alleles = ctx.getAlternateAlleles();
      if (alleles.size() != 1
          || !dna.matcher(ctx.getReference().getBaseString()).matches()
          || !dna.matcher(alleles.get(0).getBaseString()).matches()) {
        w.add(ctx);
        continue;
      }
      StringBuffer ref = new StringBuffer(ctx.getReference().getBaseString().toUpperCase());
      StringBuffer alt = new StringBuffer(alleles.get(0).getBaseString().toUpperCase());
      int start = ctx.getStart();
      int end = ctx.getEnd();

      boolean changed = false;

      /** ** we trim on the right side *** */
      // REF=TGCTGCGGGGGCCGCTGCGGGGG 	ALT=TGCTGCGGGGG
      while (alt.length() > 1
          && alt.length() < ref.length()
          && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
        changed = true;
        ref.setLength(ref.length() - 1);
        alt.deleteCharAt(alt.length() - 1);
        end--;
      }

      // REF=TGCTGCGGGGG 	ALT= TGCTGCGGGGGCCGCTGCGGGGG
      while (ref.length() > 1
          && alt.length() > ref.length()
          && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
        changed = true;
        ref.setLength(ref.length() - 1);
        alt.deleteCharAt(alt.length() - 1);
        end--;
      }

      /** ** we trim on the left side *** */

      // REF=TGCTGCGGGGGCCGCTGCGGGGG 	ALT=TGCTGCGGGGG
      while (alt.length() > 1 && alt.length() < ref.length() && ref.charAt(0) == alt.charAt(0)) {
        changed = true;
        ref.deleteCharAt(0);
        alt.deleteCharAt(0);
        start++;
      }

      // REF=TGCTGCGGGGG 	ALT= TGCTGCGGGGGCCGCTGCGGGGG
      while (ref.length() > 1 && alt.length() > ref.length() && ref.charAt(0) == alt.charAt(0)) {
        changed = true;
        ref.deleteCharAt(0);
        alt.deleteCharAt(0);
        start++;
      }

      if (!changed) {
        w.add(ctx);
        continue;
      }

      /*
      LOG.info(line);
      LOG.info("ctx.getStart() "+ctx.getStart());
      LOG.info("ctx.getEnd() "+ ctx.getEnd());



      LOG.info("start " + start);
      LOG.info("end "+end);
      LOG.info("ref " + ref.toString());
      LOG.info("alt "+alt.toString());
      */

      Allele newRef = Allele.create(ref.toString(), true);
      Allele newAlt = Allele.create(alt.toString(), false);

      Allele newalleles[] = new Allele[] {newRef, newAlt};

      b.attribute(
          TAG,
          ctx.getReference().getBaseString()
              + "|"
              + alleles.get(0).getBaseString()
              + "|"
              + ctx.getStart());
      b.start(start);
      b.stop(end);
      b.alleles(Arrays.asList(newalleles));

      nChanged++;

      VariantContext ctx2 = b.make();
      try {
        w.add(ctx2);
      } catch (TribbleException err) {
        error(err, "Cannot convert new context:" + ctx2 + " old context:" + ctx);
        w.add(ctx);
      }
    }

    info("indels changed:" + nChanged);
  }
Пример #14
0
  @Override
  protected int execute() throws Exception {
    BasicConfigurator.configure();
    logger.setLevel(Level.INFO);

    final ReferenceSequenceFile ref;
    try {
      ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
    } catch (Exception e) {
      throw new UserException("Couldn't load provided reference sequence file " + refFile, e);
    }

    variant = parseVariantList(variant);

    Comparator<Pair<Integer, File>> positionComparator = new PositionComparator();

    Queue<Pair<Integer, File>> priorityQueue;
    if (assumeSorted) priorityQueue = new LinkedList<>();
    else priorityQueue = new PriorityQueue<>(10000, positionComparator);

    FileType fileType = null;
    for (File file : variant) {
      // if it returns a valid type, it will be the same for all files
      fileType = fileExtensionCheck(file, fileType);
      if (fileType == FileType.INVALID) return 1;

      if (assumeSorted) {
        priorityQueue.add(new Pair<>(0, file));
      } else {
        if (!file.exists()) {
          throw new UserException(String.format("File %s doesn't exist", file.getAbsolutePath()));
        }
        FeatureReader<VariantContext> reader = getFeatureReader(fileType, file);
        Iterator<VariantContext> it = reader.iterator();
        if (!it.hasNext()) {
          System.err.println(
              String.format("File %s is empty. This file will be ignored", file.getAbsolutePath()));
          continue;
        }
        VariantContext vc = it.next();
        int firstPosition = vc.getStart();
        reader.close();
        priorityQueue.add(new Pair<>(firstPosition, file));
      }
    }

    FileOutputStream outputStream = new FileOutputStream(outputFile);
    EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
    IndexCreator idxCreator =
        GATKVCFUtils.makeIndexCreator(
            variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary());
    final VariantContextWriter outputWriter =
        VariantContextWriterFactory.create(
            outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options);

    boolean firstFile = true;
    int count = 0;
    while (!priorityQueue.isEmpty()) {
      count++;
      File file = priorityQueue.remove().getSecond();
      if (!file.exists()) {
        throw new UserException(String.format("File %s doesn't exist", file.getAbsolutePath()));
      }
      FeatureReader<VariantContext> reader = getFeatureReader(fileType, file);

      if (count % 10 == 0) System.out.print(count);
      else System.out.print(".");
      if (firstFile) {
        VCFHeader header = (VCFHeader) reader.getHeader();
        outputWriter.writeHeader(header);
        firstFile = false;
      }

      Iterator<VariantContext> it = reader.iterator();

      while (it.hasNext()) {
        VariantContext vc = it.next();
        outputWriter.add(vc);
      }

      reader.close();
    }
    System.out.println();

    outputWriter.close();

    return 0;
  }