private byte[] generateHaplotype( final List<VariantContext> sourceVCs, final ReferenceContext refContext) { final StringBuilder sb = new StringBuilder(); final int startPos = refContext.getWindow().getStart(); int currentPos = startPos; final byte[] reference = refContext.getBases(); for (final VariantContext vc : sourceVCs) { // add any missing reference context int vcStart = vc.getStart(); final int refAlleleLength = vc.getReference().length(); if (refAlleleLength == vc.getEnd() - vc.getStart()) // this is a deletion (whereas for other events the padding base // isn't part of the position) vcStart++; while (currentPos < vcStart) sb.append((char) reference[currentPos++ - startPos]); // add the alt allele sb.append(vc.getAlternateAllele(0).getBaseString()); // skip the reference allele currentPos += refAlleleLength; } // add any missing reference context final int stopPos = refContext.getWindow().getStop(); while (currentPos < stopPos) sb.append((char) reference[currentPos++ - startPos]); return sb.toString().getBytes(); }
/**
 * Copies the input VCF to the writer, adding an INFO attribute {@code TAG} to every variant that
 * overlaps at least one record returned by {@code bedReader}.
 *
 * <p>The output header is the input header plus the INFO line for {@code TAG} and four
 * provenance lines (command line, version, htsjdk version and home). Variants with no
 * annotation are written unchanged.
 *
 * @param inputSource not used by this method
 * @param r source of the header and variants
 * @param w destination writer
 * @throws IOException declared by the overridden method
 */
@Override
protected void doWork(String inputSource, VcfIterator r, VariantContextWriter w)
    throws IOException {
  VCFHeader header = r.getHeader();

  VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder());
  h2.addMetaDataLine(
      new VCFInfoHeaderLine(
          TAG,
          VCFHeaderLineCount.UNBOUNDED,
          VCFHeaderLineType.String,
          "metadata added from " + TABIX + " . Format was " + FORMAT));
  h2.addMetaDataLine(
      new VCFHeaderLine(
          getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
  h2.addMetaDataLine(
      new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
  h2.addMetaDataLine(
      new VCFHeaderLine(
          getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
  h2.addMetaDataLine(
      new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));

  SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
  w.writeHeader(h2);

  while (r.hasNext()) {
    VariantContext ctx = progress.watch(r.next());
    Set<String> annotations = new HashSet<String>();

    CloseableIterator<BedLine> iter =
        this.bedReader.iterator(ctx.getContig(), ctx.getStart() - 1, ctx.getEnd() + 1);
    try {
      while (iter.hasNext()) {
        BedLine bedLine = iter.next();

        // The query above is padded by one base on each side, so re-check the overlap here.
        if (!ctx.getContig().equals(bedLine.getContig())) continue;
        if (ctx.getStart() - 1 >= bedLine.getEnd()) continue;
        if (ctx.getEnd() - 1 < bedLine.getStart()) continue;

        String newannot = this.parsedFormat.toString(bedLine);
        if (!newannot.isEmpty()) annotations.add(VCFUtils.escapeInfoField(newannot));
      }
    } finally {
      // Always release the iterator, even if formatting or iteration throws.
      CloserUtil.close(iter);
    }

    if (annotations.isEmpty()) {
      // NOTE(review): this path skips incrVariantCount() and checkOutputError(); presumably the
      // counter is meant to track annotated variants only - confirm.
      w.add(ctx);
      continue;
    }

    VariantContextBuilder vcb = new VariantContextBuilder(ctx);
    vcb.attribute(TAG, annotations.toArray());
    w.add(vcb.make());
    incrVariantCount();
    if (checkOutputError()) break;
  }
  progress.finish();
}
private List<SnpEffEffect> parseSnpEffRecord(VariantContext snpEffRecord) { List<SnpEffEffect> parsedEffects = new ArrayList<SnpEffEffect>(); Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY); if (effectFieldValue == null) { return parsedEffects; } // The VCF codec stores multi-valued fields as a List<String>, and single-valued fields as a // String. // We can have either in the case of SnpEff, since there may be one or more than one effect in // this record. List<String> individualEffects; if (effectFieldValue instanceof List) { individualEffects = (List<String>) effectFieldValue; } else { individualEffects = Arrays.asList((String) effectFieldValue); } for (String effectString : individualEffects) { String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER); if (effectNameAndMetadata.length != 2) { logger.warn( String.format( "Malformed SnpEff effect field at %s:%d, skipping: %s", snpEffRecord.getChr(), snpEffRecord.getStart(), effectString)); continue; } String effectName = effectNameAndMetadata[0]; String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1); SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata); if (parsedEffect.isWellFormed()) { parsedEffects.add(parsedEffect); } else { logger.warn( String.format( "Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". Field was: \"%s\"", snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString)); } } return parsedEffects; }
protected void noteCurrentRecord(VariantContext vc) { super.noteCurrentRecord(vc); // first, check for errors // then, update mostUpstreamWritableLoc: int mostUpstreamWritableIndex = vc.getStart() - maxCachingStartDistance; this.mostUpstreamWritableLoc = Math.max(BEFORE_MOST_UPSTREAM_LOC, mostUpstreamWritableIndex); }
/**
 * Records the start position of a variant, globally and per sample, when the variant lies on the
 * chromosome of the first gene and within {@code [chromStart, chromEnd)}.
 *
 * <p>A sample gets the position only if its genotype is available, called, not a no-call and
 * not non-informative.
 *
 * @param ctx the variant to register
 */
private void addVariant(final VariantContext ctx) {
  final boolean sameChromosome = ctx.getChr().equals(genes.get(0).getChromosome());
  if (!sameChromosome || ctx.getStart() >= chromEnd || ctx.getStart() < chromStart) {
    return;
  }

  positions.add(ctx.getStart());

  for (String sample : ctx.getSampleNames()) {
    final Genotype genotype = ctx.getGenotype(sample);
    final boolean unusable =
        !genotype.isAvailable()
            || !genotype.isCalled()
            || genotype.isNoCall()
            || genotype.isNonInformative();
    if (unusable) {
      continue;
    }

    // Lazily create the per-sample position set on first use.
    Set<Integer> samplePositions = sample2positions.get(sample);
    if (samplePositions == null) {
      samplePositions = new HashSet<Integer>();
      sample2positions.put(sample, samplePositions);
    }
    samplePositions.add(ctx.getStart());
  }
}
private Collection<VariantContext> getVariantContexts( RefMetaDataTracker tracker, ReferenceContext ref) { List<Feature> features = tracker.getValues(variants, ref.getLocus()); List<VariantContext> VCs = new ArrayList<VariantContext>(features.size()); for (Feature record : features) { if (VariantContextAdaptors.canBeConvertedToVariantContext(record)) { // we need to special case the HapMap format because indels aren't handled correctly if (record instanceof RawHapMapFeature) { // is it an indel? RawHapMapFeature hapmap = (RawHapMapFeature) record; if (hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING) || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING)) { // get the dbsnp object corresponding to this record (needed to help us distinguish // between insertions and deletions) VariantContext dbsnpVC = getDbsnp(hapmap.getName()); if (dbsnpVC == null || dbsnpVC.isMixed()) continue; Map<String, Allele> alleleMap = new HashMap<String, Allele>(2); alleleMap.put( RawHapMapFeature.DELETION, Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion())); alleleMap.put( RawHapMapFeature.INSERTION, Allele.create( (char) ref.getBase() + ((RawHapMapFeature) record).getAlleles()[1], !dbsnpVC.isSimpleInsertion())); hapmap.setActualAlleles(alleleMap); // also, use the correct positioning for insertions hapmap.updatePosition(dbsnpVC.getStart()); if (hapmap.getStart() < ref.getWindow().getStart()) { logger.warn( "Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping..."); continue; } } } // ok, we might actually be able to turn this record in a variant context VariantContext vc = VariantContextAdaptors.toVariantContext(variants.getName(), record, ref); if (vc != null) // sometimes the track has odd stuff in it that can't be converted VCs.add(vc); } } return VCs; }
/**
 * Streams the variants and groups them by the set of known genes they fall into.
 *
 * <p>Whenever the input ends, no gene is loaded, the chromosome changes, or a variant starts at
 * or beyond {@code chromEnd}, the current group is printed and cleared, and the genes
 * overlapping the new variant (then the genes overlapping the growing
 * {@code chromStart}/{@code chromEnd} range) are loaded. The method returns at end of input or
 * as soon as standard output reports an error.
 *
 * @param in the variants to process
 */
private void run(VcfIterator in) {
  while (true) {
    final VariantContext ctx = in.hasNext() ? in.next() : null;

    final boolean startNewGroup =
        ctx == null
            || this.genes.isEmpty()
            || !this.genes.get(0).getChr().equals(ctx.getChr())
            || this.chromEnd <= ctx.getStart();

    if (startNewGroup) {
      this.print();
      if (System.out.checkError()) {
        return;
      }
      if (ctx == null) {
        return;
      }
      this.clear();

      if (!chrom2knownGenes.containsKey(ctx.getChr())) {
        debug("not any gene for " + ctx.getChr());
      } else {
        for (KnownGene g : chrom2knownGenes.get(ctx.getChr())) {
          // The first gene must overlap the variant; later genes must overlap the range
          // accumulated so far.
          final boolean overlaps =
              this.genes.isEmpty()
                  ? (g.getTxEnd() > ctx.getStart() && g.getTxStart() <= ctx.getEnd())
                  : (g.getTxStart() <= this.chromEnd && g.getTxEnd() > this.chromStart);
          if (overlaps) {
            this.addGene(g);
          }
        }
        if (genes.isEmpty()) {
          debug("no gene for " + ctx.getChr() + ":" + ctx.getStart());
        }
      }
    }

    final boolean insideRange =
        !genes.isEmpty()
            && ctx.getStart() - 1 >= this.chromStart
            && ctx.getStart() <= this.chromEnd;
    if (insideRange) {
      this.addVariant(ctx);
    }
  }
}
@Override public void accumulate(final VariantContext ctx) { logger.record(ctx.getContig(), ctx.getStart()); final String variantChrom = ctx.getContig(); final int variantPos = ctx.getStart(); // Skip anything a little too funky if (ctx.isFiltered()) return; if (!ctx.isVariant()) return; if (SKIP_CHROMS.contains(variantChrom)) return; for (final MendelianViolationMetrics trio : trios) { final Genotype momGt = ctx.getGenotype(trio.MOTHER); final Genotype dadGt = ctx.getGenotype(trio.FATHER); final Genotype kidGt = ctx.getGenotype(trio.OFFSPRING); // if any genotype: // - has a non-snp allele; or // - lacks a reference allele // // then ignore this trio if (CollectionUtil.makeList(momGt, dadGt, kidGt) .stream() .anyMatch( gt -> gt.isHetNonRef() || Stream.concat(Stream.of(ctx.getReference()), gt.getAlleles().stream()) .anyMatch(a -> a.length() != 1 || a.isSymbolic()))) { continue; } // if between the trio there are more than 2 alleles including the reference, continue if (Stream.concat( Collections.singleton(ctx.getReference()).stream(), CollectionUtil.makeList(momGt, dadGt, kidGt) .stream() .flatMap(gt -> gt.getAlleles().stream())) .collect(Collectors.toSet()) .size() > 2) continue; // Test to make sure: // 1) That the site is in fact variant in the trio // 2) that the offspring doesn't have a really wacky het allele balance if (!isVariant(momGt, dadGt, kidGt)) continue; if (kidGt.isHet()) { final int[] ad = kidGt.getAD(); if (ad == null) continue; final List<Integer> adOfAlleles = kidGt .getAlleles() .stream() .map(a -> ad[ctx.getAlleleIndex(a)]) .collect(Collectors.toList()); final double minAlleleFraction = Math.min(adOfAlleles.get(0), adOfAlleles.get(1)) / (double) (adOfAlleles.get(0) + adOfAlleles.get(1)); if (minAlleleFraction < MIN_HET_FRACTION) continue; } /////////////////////////////////////////////////////////////// // Determine whether the offspring should be haploid at this // locus and which is the parental donor of the haploid genotype 
/////////////////////////////////////////////////////////////// boolean haploid = false; Genotype haploidParentalGenotype = null; if (FEMALE_CHROMS.contains(variantChrom) && trio.OFFSPRING_SEX != Sex.Unknown) { if (trio.OFFSPRING_SEX == Sex.Female) { // famale haploid = false; } else if (isInPseudoAutosomalRegion(variantChrom, variantPos)) { // male but in PAR on X, so diploid haploid = false; } else { // male, out of PAR on X, haploid haploid = true; haploidParentalGenotype = momGt; } } // the PAR on the male chromosome should be masked so that reads // align to the female chromosomes instead, so there's no point // of worrying about that here. if (MALE_CHROMS.contains(variantChrom)) { if (trio.OFFSPRING_SEX == Sex.Male) { haploid = true; haploidParentalGenotype = dadGt; } else { continue; } } // We only want to look at sites where we have high enough confidence that the genotypes we // are looking at are // interesting. We want to ensure that parents are always GQ>=MIN_GQ, and that the kid is // either GQ>=MIN_GQ or in the // case where kid is het that the phred-scaled-likelihood of being reference is >=MIN_GQ. 
if (haploid && (haploidParentalGenotype.isNoCall() || haploidParentalGenotype.getGQ() < MIN_GQ)) continue; if (!haploid && (momGt.isNoCall() || momGt.getGQ() < MIN_GQ || dadGt.isNoCall() || dadGt.getGQ() < MIN_GQ)) continue; if (kidGt.isNoCall()) continue; if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) { if (kidGt.getPL()[0] < MIN_GQ) continue; } else if (kidGt.getGQ() < MIN_GQ) continue; // Also filter on the DP for each of the samples - it's possible to miss hets when DP is too // low if (haploid && (kidGt.getDP() < MIN_DP || haploidParentalGenotype.getDP() < MIN_DP)) continue; if (!haploid && (kidGt.getDP() < MIN_DP || momGt.getDP() < MIN_DP || dadGt.getDP() < MIN_DP)) continue; trio.NUM_VARIANT_SITES++; /////////////////////////////////////////////////////////////// // First test for haploid violations /////////////////////////////////////////////////////////////// MendelianViolation type = null; if (haploid) { if (kidGt.isHet()) continue; // Should not see heterozygous calls at haploid regions if (!haploidParentalGenotype.getAlleles().contains(kidGt.getAllele(0))) { if (kidGt.isHomRef()) { type = MendelianViolation.Haploid_Other; trio.NUM_HAPLOID_OTHER++; } else { type = MendelianViolation.Haploid_Denovo; trio.NUM_HAPLOID_DENOVO++; } } } /////////////////////////////////////////////////////////////// // Then test for diploid mendelian violations /////////////////////////////////////////////////////////////// else if (isMendelianViolation(momGt, dadGt, kidGt)) { if (momGt.isHomRef() && dadGt.isHomRef() && !kidGt.isHomRef()) { trio.NUM_DIPLOID_DENOVO++; type = MendelianViolation.Diploid_Denovo; } else if (momGt.isHomVar() && dadGt.isHomVar() && kidGt.isHet()) { trio.NUM_HOMVAR_HOMVAR_HET++; type = MendelianViolation.HomVar_HomVar_Het; } else if (kidGt.isHom() && ((momGt.isHomRef() && dadGt.isHomVar()) || (momGt.isHomVar() && dadGt.isHomRef()))) { trio.NUM_HOMREF_HOMVAR_HOM++; type = MendelianViolation.HomRef_HomVar_Hom; } else if (kidGt.isHom() 
&& ((momGt.isHom() && dadGt.isHet()) || (momGt.isHet() && dadGt.isHom()))) { trio.NUM_HOM_HET_HOM++; type = MendelianViolation.Hom_Het_Hom; } else { trio.NUM_OTHER++; type = MendelianViolation.Other; } } // Output a record into the family's violation VCF if (type != null) { // Create a new Context subsetted to the three samples final VariantContextBuilder builder = new VariantContextBuilder(ctx); builder.genotypes( ctx.getGenotypes() .subsetToSamples(CollectionUtil.makeSet(trio.MOTHER, trio.FATHER, trio.OFFSPRING))); builder.attribute(MENDELIAN_VIOLATION_KEY, type.name()); // Copy over some useful attributes from the full context if (ctx.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) builder.attribute(ORIGINAL_AC, ctx.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); if (ctx.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) builder.attribute(ORIGINAL_AF, ctx.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); if (ctx.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY)) builder.attribute(ORIGINAL_AN, ctx.getAttribute(VCFConstants.ALLELE_NUMBER_KEY)); // Write out the variant record familyToViolations.get(trio.FAMILY_ID).add(builder.make()); } } }
@Override protected Object doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); IOUtil.assertFileIsReadable(CHAIN); IOUtil.assertFileIsWritable(OUTPUT); IOUtil.assertFileIsWritable(REJECT); //////////////////////////////////////////////////////////////////////// // Setup the inputs //////////////////////////////////////////////////////////////////////// final LiftOver liftOver = new LiftOver(CHAIN); final VCFFileReader in = new VCFFileReader(INPUT, false); logger.info("Loading up the target reference genome."); final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); final Map<String, byte[]> refSeqs = new HashMap<>(); for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) { refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases()); } CloserUtil.close(walker); //////////////////////////////////////////////////////////////////////// // Setup the outputs //////////////////////////////////////////////////////////////////////// final VCFHeader inHeader = in.getFileHeader(); final VCFHeader outHeader = new VCFHeader(inHeader); outHeader.setSequenceDictionary(walker.getSequenceDictionary()); final VariantContextWriter out = new VariantContextWriterBuilder() .setOption(Options.INDEX_ON_THE_FLY) .setOutputFile(OUTPUT) .setReferenceDictionary(walker.getSequenceDictionary()) .build(); out.writeHeader(outHeader); final VariantContextWriter rejects = new VariantContextWriterBuilder() .setOutputFile(REJECT) .unsetOption(Options.INDEX_ON_THE_FLY) .build(); final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader()); for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line); rejects.writeHeader(rejectHeader); //////////////////////////////////////////////////////////////////////// // Read the input VCF, lift the records over and write to the sorting // collection. 
//////////////////////////////////////////////////////////////////////// long failedLiftover = 0, failedAlleleCheck = 0, total = 0; logger.info("Lifting variants over and sorting."); final SortingCollection<VariantContext> sorter = SortingCollection.newInstance( VariantContext.class, new VCFRecordCodec(outHeader), outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, TMP_DIR); ProgressLogger progress = new ProgressLogger(logger, 1000000, "read"); for (final VariantContext ctx : in) { ++total; final Interval source = new Interval( ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd()); final Interval target = liftOver.liftOver(source, 1.0); if (target == null) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make()); failedLiftover++; } else { // Fix the alleles if we went from positive to negative strand final List<Allele> alleles = new ArrayList<>(); for (final Allele oldAllele : ctx.getAlleles()) { if (target.isPositiveStrand() || oldAllele.isSymbolic()) { alleles.add(oldAllele); } else { alleles.add( Allele.create( SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference())); } } // Build the new variant context final VariantContextBuilder builder = new VariantContextBuilder( ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles); builder.id(ctx.getID()); builder.attributes(ctx.getAttributes()); builder.genotypes(ctx.getGenotypes()); builder.filters(ctx.getFilters()); builder.log10PError(ctx.getLog10PError()); // Check that the reference allele still agrees with the reference sequence boolean mismatchesReference = false; for (final Allele allele : builder.getAlleles()) { if (allele.isReference()) { final byte[] ref = refSeqs.get(target.getContig()); final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length()); if (!refString.equalsIgnoreCase(allele.getBaseString())) { mismatchesReference = 
true; } break; } } if (mismatchesReference) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make()); failedAlleleCheck++; } else { sorter.add(builder.make()); } } progress.record(ctx.getContig(), ctx.getStart()); } final NumberFormat pfmt = new DecimalFormat("0.0000%"); final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total); logger.info("Processed ", total, " variants."); logger.info(Long.toString(failedLiftover), " variants failed to liftover."); logger.info( Long.toString(failedAlleleCheck), " variants lifted over but had mismatching reference alleles after lift over."); logger.info(pct, " of variants were not successfully lifted over and written to the output."); rejects.close(); in.close(); //////////////////////////////////////////////////////////////////////// // Write the sorted outputs to the final output file //////////////////////////////////////////////////////////////////////// sorter.doneAdding(); progress = new ProgressLogger(logger, 1000000, "written"); logger.info("Writing out sorted records to final VCF."); for (final VariantContext ctx : sorter) { out.add(ctx); progress.record(ctx.getContig(), ctx.getStart()); } out.close(); sorter.cleanup(); return null; }
@Override public void runCommand() { logger.info("MergeVCFColumnsCommand"); /* * Assumptions * (1) Only two vcfs that are sorted with the same contig order * (2) if contigs on it same order, then we will just skip that contig * (3) No overlapping samples allowed * * Output: * A vcf where intersecting sites are merged together and will only return biallelic markers * the info field will be cleared * the only GT FORMAT field will be there */ Collection<File> vcfs = applicationOptions.getVcfs(); String outfile = applicationOptions.getOutFile(); if (vcfs.size() != 2) { throw new IllegalArgumentException("This function requires exactly two vcfs"); } Iterator<File> vcfFileIter = vcfs.iterator(); File vcf1 = vcfFileIter.next(); File vcf2 = vcfFileIter.next(); VCFFileReader reader1 = new VCFFileReader(vcf1, false); VCFFileReader reader2 = new VCFFileReader(vcf2, false); Iterator<VariantContext> iter1 = reader1.iterator(); Iterator<VariantContext> iter2 = reader2.iterator(); VariantContextComparator comparator = new VariantContextComparator(); /* * Merge headers */ VCFHeader header1 = reader1.getFileHeader(); VCFHeader header2 = reader2.getFileHeader(); List<String> samples1 = header1.getGenotypeSamples(); List<String> samples2 = header2.getGenotypeSamples(); List<String> mergedSamples = new ArrayList<>(samples1.size() + samples2.size()); mergedSamples.addAll(samples1); mergedSamples.addAll(samples2); // Validate that there are no duplicates HashSet<String> sampleSet = new HashSet<String>(); for (String id : mergedSamples) { if (sampleSet.contains(id)) { throw new IllegalArgumentException("Duplicate id found: " + id); } else { sampleSet.add(id); } } HashSet<VCFHeaderLine> meta = new HashSet<>(); meta.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "GT")); meta.addAll(header1.getContigLines()); VCFHeader mergedHeader = new VCFHeader(meta, mergedSamples); /* * Create encoder */ VCFEncoder encoder = new VCFEncoder(mergedHeader, false, false); BufferedWriter 
writer = null; try { if (outfile.endsWith(".gz")) { BlockCompressedOutputStream outstream = new BlockCompressedOutputStream(new File(outfile)); writer = new BufferedWriter(new OutputStreamWriter(outstream)); } else { writer = Files.newBufferedWriter( Paths.get(outfile), Charset.defaultCharset(), StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); } /* * Write header */ VCFHeaderWriter.writeHeader(writer, mergedHeader); logger.info("Wrote header"); VariantContext previous1 = null; VariantContext previous2 = null; int count = 0; int countFile1 = 0; int countFile2 = 0; boolean usePrevious1 = false; boolean usePrevious2 = false; while (iter1.hasNext() || iter2.hasNext()) { if ((iter1.hasNext() || usePrevious1) && (iter2.hasNext() || usePrevious2)) { VariantContext variant1 = null; VariantContext variant2 = null; // if(usePrevious1 == true && usePrevious2 == true && // comparator.compare(previous1,previous2) != 0) { // //then skip both // usePrevious1 = false; // usePrevious2 = false; // } if (usePrevious1) { variant1 = previous1; } else { variant1 = iter1.next(); countFile1++; } if (usePrevious2) { variant2 = previous2; } else { variant2 = iter2.next(); countFile2++; } // check that variants are ordered correctly if (previous1 != null && previous1 != variant1 && comparator.compare(previous1, variant1) > 0) { throw new IllegalStateException( previous1.getContig() + ":" + previous1.getStart() + " > " + variant1.getContig() + ":" + variant1.getStart()); } if (previous2 != null && previous2 != variant2 && comparator.compare(previous2, variant2) > 0) { throw new IllegalStateException( previous2.getContig() + ":" + previous2.getStart() + " > " + variant2.getContig() + ":" + variant2.getStart()); } int cmp = comparator.compare(variant1, variant2); if (cmp < 0) { // logger.info("Skipping VCF1: " + variant1.getContig() + ":" + variant1.getStart() + // "\t" + variant1.getReference().toString() + "\t" + variant1.getAlternateAlleles()); if (usePrevious1 == true && 
usePrevious2 == true) { // variant1 < variant2 // we need to go to next variant in vcf1 usePrevious1 = false; } usePrevious2 = true; } else if (cmp > 0) { if (usePrevious1 == true && usePrevious2 == true) { // variant1 > variant2 // we need to go to next variant in vcf2 usePrevious2 = false; } usePrevious1 = true; // logger.info("Skipping VCF2: " + variant2.getContig() + ":" + variant2.getStart() + // "\t" + variant2.getReference().toString() + "\t" + variant2.getAlternateAlleles()); } else { // they equal position usePrevious1 = false; usePrevious2 = false; if (variant1.isBiallelic() && variant2.isBiallelic() && variant1.getReference().equals(variant2.getReference()) && variant1.getAlternateAllele(0).equals(variant2.getAlternateAllele(0))) { // TODO: Finish merging // both variants are bialleleic and the reference and alternative alleles match count++; if (count % 10000 == 0) { logger.info(count + " mergeable variants found"); } VariantContext merged = VariantContextMerger.merge(variant1, variant2); writer.write(encoder.encode(merged)); writer.write("\n"); } else { // skip if they do not equal // logger.info("Skipping: " + variant1.getContig() + ":" + variant1.getStart() + // "\t" + variant1.getReference().toString() + "\t" + // variant1.getAlternateAlleles()); // logger.info("Skipping: " + variant2.getContig() + ":" + variant2.getStart() + // "\t" + variant2.getReference().toString() + "\t" + // variant2.getAlternateAlleles()); } } previous1 = variant1; previous2 = variant2; } else if (iter1.hasNext()) { // just skip remaining variants VariantContext current = iter1.next(); countFile1++; if (previous1 != null && current != null && comparator.compare(previous1, current) > 0) { throw new IllegalStateException( previous1.getContig() + ":" + previous1.getStart() + " > " + current.getContig() + ":" + current.getStart()); } previous1 = current; // logger.info("Skipping: " + previous1.getContig() + ":" + previous1.getStart() + "\t" + // 
previous1.getReference().toString() + "\t" + previous1.getAlternateAlleles()); } else if (iter2.hasNext()) { // just skip remaining variants // fixed bug/ was iter1 changed to iter2 VariantContext current = iter2.next(); countFile2++; if (previous2 != null && current != null && comparator.compare(previous2, current) > 0) { throw new IllegalStateException( previous2.getContig() + ":" + previous2.getStart() + " > " + current.getContig() + ":" + current.getStart()); } previous2 = current; // logger.info("Skipping: " + previous2.getContig() + ":" + previous2.getStart() + "\t" + // previous2.getReference().toString() + "\t" + previous2.getAlternateAlleles()); } else { throw new IllegalStateException("Error should not of reached this point"); } } reader1.close(); reader2.close(); logger.info(count + " merged variants"); logger.info(countFile1 + " variants in " + vcf1.getAbsolutePath()); logger.info(countFile2 + " variants in " + vcf2.getAbsolutePath()); } catch (Exception e) { e.printStackTrace(); } finally { if (writer != null) { try { logger.info("Flushing writer"); writer.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } logger.info("finished merging vcfs"); }
@Override protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException { AbstractVCFCodec codeIn3 = VCFUtils.createDefaultVCFCodec(); String line; StringWriter sw = new StringWriter(); LOG.info("opening tabix file: " + this.TABIX); TabixReader tabix = new TabixReader(this.TABIX); while ((line = tabix.readLine()) != null) { if (!line.startsWith(VCFHeader.HEADER_INDICATOR)) { break; } sw.append(line).append("\n"); } VCFHeader header3 = (VCFHeader) codeIn3.readActualHeader( new LineIteratorImpl( LineReaderUtil.fromBufferedStream( new ByteArrayInputStream(sw.toString().getBytes())))); VCFHeader header1 = r.getHeader(); VCFHeader h2 = new VCFHeader(header1.getMetaDataInInputOrder(), header1.getSampleNamesInOrder()); for (String infoId : this.INFO_IDS) { VCFInfoHeaderLine vihl = header3.getInfoHeaderLine(infoId); if (vihl == null) { LOG.warn("Not INFO=" + infoId + " in " + TABIX); continue; } if (h2.getInfoHeaderLine(infoId) != null) { LOG.warn("Input already contains INFO=" + vihl); } h2.addMetaDataLine(vihl); } if (ALT_CONFLICT_FLAG != null) { h2.addMetaDataLine( new VCFInfoHeaderLine( ALT_CONFLICT_FLAG, 1, VCFHeaderLineType.Flag, "conflict ALT allele with " + this.TABIX)); } w.writeHeader(h2); while (r.hasNext()) { VariantContext ctx1 = r.next(); VariantContextBuilder vcb = new VariantContextBuilder(ctx1); String line2; String BEST_ID = null; boolean best_id_match_alt = false; List<VariantContext> variantsList = new ArrayList<VariantContext>(); int[] array = tabix.parseReg(ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd())); TabixReader.Iterator iter = null; if (array != null && array.length == 3 && array[0] != -1 && array[1] >= 0 && array[2] >= 0) { iter = tabix.query(array[0], array[1], array[2]); } else { LOG.info("Cannot get " + ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd())); } while (iter != null && (line2 = iter.next()) != null) { VariantContext ctx3 = codeIn3.decode(line2); if (ctx3.getStart() != 
ctx1.getStart()) continue; if (ctx3.getEnd() != ctx1.getEnd()) continue; if (ctx1.getReference().equals(ctx3.getReference()) && ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) { variantsList.clear(); variantsList.add(ctx3); break; } else { variantsList.add(ctx3); } } for (VariantContext ctx3 : variantsList) { if (this.REF_ALLELE_MATTERS && !ctx1.getReference().equals(ctx3.getReference())) { continue; } if (this.ALT_ALLELES_MATTERS && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) { continue; } if (ctx3.getID() != null && this.REPLACE_ID) { if (BEST_ID != null && best_id_match_alt) { // nothing } else { BEST_ID = ctx3.getID(); best_id_match_alt = ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles()); } } for (String id : this.INFO_IDS) { Object info3 = ctx3.getAttribute(id); if (info3 == null) { continue; } Object info1 = ctx1.getAttribute(id); if (info1 != null && !this.REPLACE_INFO_FIELD) { continue; } vcb.attribute(id, info3); } if (ALT_CONFLICT_FLAG != null && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) { vcb.attribute(ALT_CONFLICT_FLAG, true); } } if (BEST_ID != null) { vcb.id(BEST_ID); } w.add(vcb.make()); } tabix.close(); }
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { final CountedData counter = new CountedData(); // For some reason RodWalkers get map calls with null trackers if (tracker == null) return counter; VariantContext vcComp = tracker.getFirstValue(alleles); if (vcComp == null) return counter; // todo - not sure I want this, may be misleading to filter extended indel events. if (isInsideExtendedIndel(vcComp, ref)) return counter; // Do not operate on variants that are not covered to the optional minimum depth if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { counter.nUncovered = 1L; final GVstatus status = getGVstatus(vcComp); if (status == GVstatus.T) counter.nAltNotCalled = 1L; else if (status == GVstatus.F) counter.nRefNotCalled = 1L; else counter.nNoStatusNotCalled = 1L; return counter; } VariantCallContext call; if (vcComp.isSNP()) { call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context).get(0); } else if (vcComp.isIndel()) { call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context).get(0); } else if (bamIsTruth) { // assume it's a SNP if no variation is present; this is necessary so that we can test // supposed monomorphic sites against the truth bam call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context).get(0); } else { logger.info( "Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles()); return counter; } boolean writeVariant = true; if (bamIsTruth) { if (call.confidentlyCalled) { // If truth is a confident REF call if (call.isVariant()) { if (vcComp.isVariant()) counter.nAltCalledAlt = 1L; else { counter.nAltCalledRef = 1L; if (printInterestingSites) System.out.println("Truth=ALT Call=REF at " + call.getChr() + ":" + call.getStart()); } } // If truth is a confident ALT call else { if (vcComp.isVariant()) { counter.nRefCalledAlt = 1L; if (printInterestingSites) 
System.out.println("Truth=REF Call=ALT at " + call.getChr() + ":" + call.getStart()); } else counter.nRefCalledRef = 1L; } } else { counter.nNotConfidentCalls = 1L; if (printInterestingSites) System.out.println("Truth is not confident at " + call.getChr() + ":" + call.getStart()); writeVariant = false; } } else { // if (!vcComp.hasExtendedAttribute("GV")) // throw new UserException.BadInput("Variant has no GV annotation in the INFO // field. " + vcComp.getChr() + ":" + vcComp.getStart()); final GVstatus status = getGVstatus(vcComp); if (call.isCalledAlt(callConf)) { if (status == GVstatus.T) counter.nAltCalledAlt = 1L; else if (status == GVstatus.F) { counter.nRefCalledAlt = 1L; if (printInterestingSites) System.out.println("Truth=REF Call=ALT at " + call.getChr() + ":" + call.getStart()); } else counter.nNoStatusCalledAlt = 1L; } else if (call.isCalledRef(callConf)) { if (status == GVstatus.T) { counter.nAltCalledRef = 1L; if (printInterestingSites) System.out.println("Truth=ALT Call=REF at " + call.getChr() + ":" + call.getStart()); } else if (status == GVstatus.F) counter.nRefCalledRef = 1L; else counter.nNoStatusCalledRef = 1L; } else { counter.nNotConfidentCalls = 1L; if (status == GVstatus.T) counter.nAltNotCalled = 1L; else if (status == GVstatus.F) counter.nRefNotCalled = 1L; else counter.nNoStatusNotCalled = 1L; if (printInterestingSites) System.out.println("Truth is not confident at " + call.getChr() + ":" + call.getStart()); writeVariant = false; } } if (vcfWriter != null && writeVariant) { if (!vcComp.hasAttribute(GATKVCFConstants.GENOTYPE_AND_VALIDATE_STATUS_KEY)) { vcfWriter.add( new VariantContextBuilder(vcComp) .attribute( GATKVCFConstants.GENOTYPE_AND_VALIDATE_STATUS_KEY, call.isCalledAlt(callConf) ? "ALT" : "REF") .make()); } else vcfWriter.add(vcComp); } return counter; }
@Override protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException { long nChanged = 0L; final String TAG = "INDELFIXED"; VCFHeader header = r.getHeader(); VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), header.getSampleNamesInOrder()); h2.addMetaDataLine( new VCFInfoHeaderLine(TAG, 1, VCFHeaderLineType.String, "Fix Indels for @SolenaLS.")); w.writeHeader(h2); final Pattern dna = Pattern.compile("[ATGCatgc]+"); while (r.hasNext()) { VariantContext ctx = r.next(); VariantContextBuilder b = new VariantContextBuilder(ctx); List<Allele> alleles = ctx.getAlternateAlleles(); if (alleles.size() != 1 || !dna.matcher(ctx.getReference().getBaseString()).matches() || !dna.matcher(alleles.get(0).getBaseString()).matches()) { w.add(ctx); continue; } StringBuffer ref = new StringBuffer(ctx.getReference().getBaseString().toUpperCase()); StringBuffer alt = new StringBuffer(alleles.get(0).getBaseString().toUpperCase()); int start = ctx.getStart(); int end = ctx.getEnd(); boolean changed = false; /** ** we trim on the right side *** */ // REF=TGCTGCGGGGGCCGCTGCGGGGG ALT=TGCTGCGGGGG while (alt.length() > 1 && alt.length() < ref.length() && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) { changed = true; ref.setLength(ref.length() - 1); alt.deleteCharAt(alt.length() - 1); end--; } // REF=TGCTGCGGGGG ALT= TGCTGCGGGGGCCGCTGCGGGGG while (ref.length() > 1 && alt.length() > ref.length() && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) { changed = true; ref.setLength(ref.length() - 1); alt.deleteCharAt(alt.length() - 1); end--; } /** ** we trim on the left side *** */ // REF=TGCTGCGGGGGCCGCTGCGGGGG ALT=TGCTGCGGGGG while (alt.length() > 1 && alt.length() < ref.length() && ref.charAt(0) == alt.charAt(0)) { changed = true; ref.deleteCharAt(0); alt.deleteCharAt(0); start++; } // REF=TGCTGCGGGGG ALT= TGCTGCGGGGGCCGCTGCGGGGG while (ref.length() > 1 && alt.length() > ref.length() && ref.charAt(0) == alt.charAt(0)) { changed = 
true; ref.deleteCharAt(0); alt.deleteCharAt(0); start++; } if (!changed) { w.add(ctx); continue; } /* LOG.info(line); LOG.info("ctx.getStart() "+ctx.getStart()); LOG.info("ctx.getEnd() "+ ctx.getEnd()); LOG.info("start " + start); LOG.info("end "+end); LOG.info("ref " + ref.toString()); LOG.info("alt "+alt.toString()); */ Allele newRef = Allele.create(ref.toString(), true); Allele newAlt = Allele.create(alt.toString(), false); Allele newalleles[] = new Allele[] {newRef, newAlt}; b.attribute( TAG, ctx.getReference().getBaseString() + "|" + alleles.get(0).getBaseString() + "|" + ctx.getStart()); b.start(start); b.stop(end); b.alleles(Arrays.asList(newalleles)); nChanged++; VariantContext ctx2 = b.make(); try { w.add(ctx2); } catch (TribbleException err) { error(err, "Cannot convert new context:" + ctx2 + " old context:" + ctx); w.add(ctx); } } info("indels changed:" + nChanged); }
@Override protected int execute() throws Exception { BasicConfigurator.configure(); logger.setLevel(Level.INFO); final ReferenceSequenceFile ref; try { ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); } catch (Exception e) { throw new UserException("Couldn't load provided reference sequence file " + refFile, e); } variant = parseVariantList(variant); Comparator<Pair<Integer, File>> positionComparator = new PositionComparator(); Queue<Pair<Integer, File>> priorityQueue; if (assumeSorted) priorityQueue = new LinkedList<>(); else priorityQueue = new PriorityQueue<>(10000, positionComparator); FileType fileType = null; for (File file : variant) { // if it returns a valid type, it will be the same for all files fileType = fileExtensionCheck(file, fileType); if (fileType == FileType.INVALID) return 1; if (assumeSorted) { priorityQueue.add(new Pair<>(0, file)); } else { if (!file.exists()) { throw new UserException(String.format("File %s doesn't exist", file.getAbsolutePath())); } FeatureReader<VariantContext> reader = getFeatureReader(fileType, file); Iterator<VariantContext> it = reader.iterator(); if (!it.hasNext()) { System.err.println( String.format("File %s is empty. 
This file will be ignored", file.getAbsolutePath())); continue; } VariantContext vc = it.next(); int firstPosition = vc.getStart(); reader.close(); priorityQueue.add(new Pair<>(firstPosition, file)); } } FileOutputStream outputStream = new FileOutputStream(outputFile); EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY); IndexCreator idxCreator = GATKVCFUtils.makeIndexCreator( variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary()); final VariantContextWriter outputWriter = VariantContextWriterFactory.create( outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); boolean firstFile = true; int count = 0; while (!priorityQueue.isEmpty()) { count++; File file = priorityQueue.remove().getSecond(); if (!file.exists()) { throw new UserException(String.format("File %s doesn't exist", file.getAbsolutePath())); } FeatureReader<VariantContext> reader = getFeatureReader(fileType, file); if (count % 10 == 0) System.out.print(count); else System.out.print("."); if (firstFile) { VCFHeader header = (VCFHeader) reader.getHeader(); outputWriter.writeHeader(header); firstFile = false; } Iterator<VariantContext> it = reader.iterator(); while (it.hasNext()) { VariantContext vc = it.next(); outputWriter.add(vc); } reader.close(); } System.out.println(); outputWriter.close(); return 0; }