private void writeDifferences(final List<VariantContext> source1Alleles,
                              final List<VariantContext> source2Alleles) {
    int currentIndex1 = 0, currentIndex2 = 0;
    final int size1 = source1Alleles.size(), size2 = source2Alleles.size();

    VariantContext current1 = source1Alleles.get(0);
    VariantContext current2 = source2Alleles.get(0);

    // walk both position-sorted lists in parallel; a null current marks an exhausted list
    while (currentIndex1 < size1 || currentIndex2 < size2) {
        if (current1 == null) {
            writeOne(current2, source2, null);
            currentIndex2++;
            current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
        } else if (current2 == null) {
            writeOne(current1, source1, null);
            currentIndex1++;
            current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
        } else {
            final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
            final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);

            if (loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2)) {
                // same start or overlapping events: classify and write one record to the intersection set
                String status;
                if (loc1.getStart() == loc2.getStart()) {
                    final String allele1 = current1.getAlternateAllele(0).getBaseString();
                    final String allele2 = current2.getAlternateAllele(0).getBaseString();
                    if (allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1)
                        status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS;
                    else
                        status = SAME_START_DIFFERENT_ALLELES_STATUS;
                } else {
                    status = OVERLAPPING_EVENTS_STATUS;
                }

                writeOne(current1, INTERSECTION_SET, status);
                currentIndex1++;
                currentIndex2++;
                current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
                current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
            } else if (loc1.isBefore(loc2)) {
                // record unique to source 1
                writeOne(current1, source1, null);
                currentIndex1++;
                current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
            } else {
                // record unique to source 2
                writeOne(current2, source2, null);
                currentIndex2++;
                current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
            }
        }
    }
}
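/*
 * Illustrative sketch (not part of the walker above): the control flow of writeDifferences is a
 * standard two-pointer walk over two position-sorted lists. The self-contained helper below shows
 * the same pattern on plain int positions; the method name and println output are assumptions made
 * purely for illustration.
 */
private static void walkSortedPositions(final int[] positions1, final int[] positions2) {
    int i = 0, j = 0;
    while (i < positions1.length || j < positions2.length) {
        if (i == positions1.length) {                    // list 1 exhausted
            System.out.println("only in list 2: " + positions2[j++]);
        } else if (j == positions2.length) {             // list 2 exhausted
            System.out.println("only in list 1: " + positions1[i++]);
        } else if (positions1[i] == positions2[j]) {     // same start: the "intersection" case
            System.out.println("intersection: " + positions1[i]);
            i++;
            j++;
        } else if (positions1[i] < positions2[j]) {      // list 1 is behind: emit and advance it
            System.out.println("only in list 1: " + positions1[i++]);
        } else {                                         // list 2 is behind: emit and advance it
            System.out.println("only in list 2: " + positions2[j++]);
        }
    }
}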
private byte[] generateHaplotype(final List<VariantContext> sourceVCs, final ReferenceContext refContext) {
    final StringBuilder sb = new StringBuilder();

    final int startPos = refContext.getWindow().getStart();
    int currentPos = startPos;
    final byte[] reference = refContext.getBases();

    for (final VariantContext vc : sourceVCs) {
        // add any missing reference context
        int vcStart = vc.getStart();
        final int refAlleleLength = vc.getReference().length();
        // this is a deletion (whereas for other events the padding base isn't part of the position)
        if (refAlleleLength == vc.getEnd() - vc.getStart())
            vcStart++;

        while (currentPos < vcStart)
            sb.append((char) reference[currentPos++ - startPos]);

        // add the alt allele
        sb.append(vc.getAlternateAllele(0).getBaseString());

        // skip the reference allele
        currentPos += refAlleleLength;
    }

    // add any missing reference context
    final int stopPos = refContext.getWindow().getStop();
    while (currentPos < stopPos)
        sb.append((char) reference[currentPos++ - startPos]);

    return sb.toString().getBytes();
}
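/*
 * Illustrative sketch (an assumption, not code from the walker above): generateHaplotype splices
 * each alternate allele into the reference window in place of its reference allele. The helper
 * below shows the same splice on plain Strings with 1-based inclusive coordinates; all names here
 * (spliceAllele, windowStart, eventStart) are hypothetical.
 */
private static String spliceAllele(final String refWindow, final int windowStart,
                                   final int eventStart, final String refAllele,
                                   final String altAllele) {
    final StringBuilder sb = new StringBuilder();
    sb.append(refWindow, 0, eventStart - windowStart);                              // reference bases before the event
    sb.append(altAllele);                                                           // the alt allele replaces the ref allele
    sb.append(refWindow.substring(eventStart - windowStart + refAllele.length()));  // reference bases after the event
    return sb.toString();
}
// e.g. spliceAllele("ACGTACGT", 100, 103, "T", "TTT") returns "ACGTTTACGT" (a 2 bp insertion after the T at position 103)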
@Test
public void shouldPreserveSymbolicAlleleCase() {
    VCFFileReader reader = new VCFFileReader(new File(VariantBaseTest.variantTestDataRoot + "breakpoint.vcf"), false);
    VariantContext variant = reader.iterator().next();
    reader.close();

    // VCF v4.1 section 1.4.5:
    // "Tools processing VCF files are not required to preserve case in the allele String,
    //  except for IDs, which are case sensitive."
    Assert.assertTrue(variant.getAlternateAllele(0).getDisplayString().contains("chr12"));
}
@Override
public void runCommand() {
    logger.info("MergeVCFColumnsCommand");

    /*
     * Assumptions:
     * (1) Only two VCFs, both sorted with the same contig order
     * (2) If contigs are not in the same order, that contig is simply skipped
     * (3) No overlapping samples allowed
     *
     * Output:
     * A VCF where intersecting sites are merged together; only biallelic markers are returned,
     * the INFO field is cleared, and GT is the only FORMAT field.
     */
    Collection<File> vcfs = applicationOptions.getVcfs();
    String outfile = applicationOptions.getOutFile();

    if (vcfs.size() != 2) {
        throw new IllegalArgumentException("This function requires exactly two vcfs");
    }

    Iterator<File> vcfFileIter = vcfs.iterator();
    File vcf1 = vcfFileIter.next();
    File vcf2 = vcfFileIter.next();

    VCFFileReader reader1 = new VCFFileReader(vcf1, false);
    VCFFileReader reader2 = new VCFFileReader(vcf2, false);

    Iterator<VariantContext> iter1 = reader1.iterator();
    Iterator<VariantContext> iter2 = reader2.iterator();

    VariantContextComparator comparator = new VariantContextComparator();

    /*
     * Merge headers
     */
    VCFHeader header1 = reader1.getFileHeader();
    VCFHeader header2 = reader2.getFileHeader();

    List<String> samples1 = header1.getGenotypeSamples();
    List<String> samples2 = header2.getGenotypeSamples();

    List<String> mergedSamples = new ArrayList<>(samples1.size() + samples2.size());
    mergedSamples.addAll(samples1);
    mergedSamples.addAll(samples2);

    // Validate that there are no duplicate sample ids
    HashSet<String> sampleSet = new HashSet<>();
    for (String id : mergedSamples) {
        if (sampleSet.contains(id)) {
            throw new IllegalArgumentException("Duplicate id found: " + id);
        } else {
            sampleSet.add(id);
        }
    }

    HashSet<VCFHeaderLine> meta = new HashSet<>();
    meta.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "GT"));
    meta.addAll(header1.getContigLines());

    VCFHeader mergedHeader = new VCFHeader(meta, mergedSamples);

    /*
     * Create encoder
     */
    VCFEncoder encoder = new VCFEncoder(mergedHeader, false, false);

    BufferedWriter writer = null;
    try {
        if (outfile.endsWith(".gz")) {
            BlockCompressedOutputStream outstream = new BlockCompressedOutputStream(new File(outfile));
            writer = new BufferedWriter(new OutputStreamWriter(outstream));
        } else {
            writer = Files.newBufferedWriter(
                    Paths.get(outfile),
                    Charset.defaultCharset(),
                    StandardOpenOption.CREATE,
                    StandardOpenOption.TRUNCATE_EXISTING);
        }

        /*
         * Write header
         */
        VCFHeaderWriter.writeHeader(writer, mergedHeader);
        logger.info("Wrote header");

        VariantContext previous1 = null;
        VariantContext previous2 = null;

        int count = 0;
        int countFile1 = 0;
        int countFile2 = 0;

        boolean usePrevious1 = false;
        boolean usePrevious2 = false;

        while (iter1.hasNext() || iter2.hasNext()) {
            if ((iter1.hasNext() || usePrevious1) && (iter2.hasNext() || usePrevious2)) {
                VariantContext variant1 = null;
                VariantContext variant2 = null;

                // if (usePrevious1 && usePrevious2 && comparator.compare(previous1, previous2) != 0) {
                //     // then skip both
                //     usePrevious1 = false;
                //     usePrevious2 = false;
                // }

                if (usePrevious1) {
                    variant1 = previous1;
                } else {
                    variant1 = iter1.next();
                    countFile1++;
                }

                if (usePrevious2) {
                    variant2 = previous2;
                } else {
                    variant2 = iter2.next();
                    countFile2++;
                }

                // check that variants are ordered correctly
                if (previous1 != null && previous1 != variant1
                        && comparator.compare(previous1, variant1) > 0) {
                    throw new IllegalStateException(
                            previous1.getContig() + ":" + previous1.getStart() + " > "
                                    + variant1.getContig() + ":" + variant1.getStart());
                }

                if (previous2 != null && previous2 != variant2
                        && comparator.compare(previous2, variant2) > 0) {
                    throw new IllegalStateException(
                            previous2.getContig() + ":" + previous2.getStart() + " > "
                                    + variant2.getContig() + ":" + variant2.getStart());
                }

                int cmp = comparator.compare(variant1, variant2);

                if (cmp < 0) {
                    // logger.info("Skipping VCF1: " + variant1.getContig() + ":" + variant1.getStart()
                    //         + "\t" + variant1.getReference().toString() + "\t" + variant1.getAlternateAlleles());
                    if (usePrevious1 && usePrevious2) {
                        // variant1 < variant2, so we need to go to the next variant in vcf1
                        usePrevious1 = false;
                    }
                    usePrevious2 = true;
                } else if (cmp > 0) {
                    if (usePrevious1 && usePrevious2) {
                        // variant1 > variant2, so we need to go to the next variant in vcf2
                        usePrevious2 = false;
                    }
                    usePrevious1 = true;
                    // logger.info("Skipping VCF2: " + variant2.getContig() + ":" + variant2.getStart()
                    //         + "\t" + variant2.getReference().toString() + "\t" + variant2.getAlternateAlleles());
                } else {
                    // same position in both files
                    usePrevious1 = false;
                    usePrevious2 = false;

                    if (variant1.isBiallelic() && variant2.isBiallelic()
                            && variant1.getReference().equals(variant2.getReference())
                            && variant1.getAlternateAllele(0).equals(variant2.getAlternateAllele(0))) {
                        // TODO: Finish merging
                        // both variants are biallelic and the reference and alternate alleles match
                        count++;
                        if (count % 10000 == 0) {
                            logger.info(count + " mergeable variants found");
                        }

                        VariantContext merged = VariantContextMerger.merge(variant1, variant2);
                        writer.write(encoder.encode(merged));
                        writer.write("\n");
                    } else {
                        // skip sites whose alleles do not match
                        // logger.info("Skipping: " + variant1.getContig() + ":" + variant1.getStart()
                        //         + "\t" + variant1.getReference().toString() + "\t" + variant1.getAlternateAlleles());
                        // logger.info("Skipping: " + variant2.getContig() + ":" + variant2.getStart()
                        //         + "\t" + variant2.getReference().toString() + "\t" + variant2.getAlternateAlleles());
                    }
                }

                previous1 = variant1;
                previous2 = variant2;

            } else if (iter1.hasNext()) {
                // just skip the remaining variants in vcf1
                VariantContext current = iter1.next();
                countFile1++;
                if (previous1 != null && current != null
                        && comparator.compare(previous1, current) > 0) {
                    throw new IllegalStateException(
                            previous1.getContig() + ":" + previous1.getStart() + " > "
                                    + current.getContig() + ":" + current.getStart());
                }
                previous1 = current;
                // logger.info("Skipping: " + previous1.getContig() + ":" + previous1.getStart()
                //         + "\t" + previous1.getReference().toString() + "\t" + previous1.getAlternateAlleles());
            } else if (iter2.hasNext()) {
                // just skip the remaining variants in vcf2
                // fixed bug: was iter1, changed to iter2
                VariantContext current = iter2.next();
                countFile2++;
                if (previous2 != null && current != null
                        && comparator.compare(previous2, current) > 0) {
                    throw new IllegalStateException(
                            previous2.getContig() + ":" + previous2.getStart() + " > "
                                    + current.getContig() + ":" + current.getStart());
                }
                previous2 = current;
                // logger.info("Skipping: " + previous2.getContig() + ":" + previous2.getStart()
                //         + "\t" + previous2.getReference().toString() + "\t" + previous2.getAlternateAlleles());
            } else {
                throw new IllegalStateException("Error: should not have reached this point");
            }
        }

        reader1.close();
        reader2.close();

        logger.info(count + " merged variants");
        logger.info(countFile1 + " variants in " + vcf1.getAbsolutePath());
        logger.info(countFile2 + " variants in " + vcf2.getAbsolutePath());

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                logger.info("Flushing writer");
                writer.close();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }

    logger.info("finished merging vcfs");
}
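/*
 * Hedged sketch only: VariantContextMerger is project-specific and its source is not shown above.
 * Under the assumptions stated in runCommand (same site and alleles, disjoint samples, GT-only
 * output with INFO cleared), a merge built on htsjdk could look like the helper below. The method
 * name mergeSampleColumns and its exact behavior are illustrative assumptions, not the project's
 * actual implementation. Assumes imports of java.util.HashMap and
 * htsjdk.variant.variantcontext.{Genotype, GenotypeBuilder, GenotypesContext, VariantContextBuilder}.
 */
private static VariantContext mergeSampleColumns(final VariantContext vc1, final VariantContext vc2) {
    final GenotypesContext genotypes = GenotypesContext.create(vc1.getNSamples() + vc2.getNSamples());
    // keep only the called alleles (GT) for every sample from both files
    for (final Genotype g : vc1.getGenotypes()) {
        genotypes.add(GenotypeBuilder.create(g.getSampleName(), g.getAlleles()));
    }
    for (final Genotype g : vc2.getGenotypes()) {
        genotypes.add(GenotypeBuilder.create(g.getSampleName(), g.getAlleles()));
    }
    return new VariantContextBuilder(vc1)
            .attributes(new HashMap<String, Object>())  // clear the INFO field
            .unfiltered()                               // drop per-file FILTER status
            .genotypes(genotypes)
            .make();
}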