@Override protected Object doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); IOUtil.assertFileIsReadable(CHAIN); IOUtil.assertFileIsWritable(OUTPUT); IOUtil.assertFileIsWritable(REJECT); //////////////////////////////////////////////////////////////////////// // Setup the inputs //////////////////////////////////////////////////////////////////////// final LiftOver liftOver = new LiftOver(CHAIN); final VCFFileReader in = new VCFFileReader(INPUT, false); logger.info("Loading up the target reference genome."); final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); final Map<String, byte[]> refSeqs = new HashMap<>(); for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) { refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases()); } CloserUtil.close(walker); //////////////////////////////////////////////////////////////////////// // Setup the outputs //////////////////////////////////////////////////////////////////////// final VCFHeader inHeader = in.getFileHeader(); final VCFHeader outHeader = new VCFHeader(inHeader); outHeader.setSequenceDictionary(walker.getSequenceDictionary()); final VariantContextWriter out = new VariantContextWriterBuilder() .setOption(Options.INDEX_ON_THE_FLY) .setOutputFile(OUTPUT) .setReferenceDictionary(walker.getSequenceDictionary()) .build(); out.writeHeader(outHeader); final VariantContextWriter rejects = new VariantContextWriterBuilder() .setOutputFile(REJECT) .unsetOption(Options.INDEX_ON_THE_FLY) .build(); final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader()); for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line); rejects.writeHeader(rejectHeader); //////////////////////////////////////////////////////////////////////// // Read the input VCF, lift the records over and write to the sorting // collection. //////////////////////////////////////////////////////////////////////// long failedLiftover = 0, failedAlleleCheck = 0, total = 0; logger.info("Lifting variants over and sorting."); final SortingCollection<VariantContext> sorter = SortingCollection.newInstance( VariantContext.class, new VCFRecordCodec(outHeader), outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, TMP_DIR); ProgressLogger progress = new ProgressLogger(logger, 1000000, "read"); for (final VariantContext ctx : in) { ++total; final Interval source = new Interval( ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd()); final Interval target = liftOver.liftOver(source, 1.0); if (target == null) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make()); failedLiftover++; } else { // Fix the alleles if we went from positive to negative strand final List<Allele> alleles = new ArrayList<>(); for (final Allele oldAllele : ctx.getAlleles()) { if (target.isPositiveStrand() || oldAllele.isSymbolic()) { alleles.add(oldAllele); } else { alleles.add( Allele.create( SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference())); } } // Build the new variant context final VariantContextBuilder builder = new VariantContextBuilder( ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles); builder.id(ctx.getID()); builder.attributes(ctx.getAttributes()); builder.genotypes(ctx.getGenotypes()); builder.filters(ctx.getFilters()); builder.log10PError(ctx.getLog10PError()); // Check that the reference allele still agrees with the reference sequence boolean mismatchesReference = false; for (final Allele allele : builder.getAlleles()) { if (allele.isReference()) { final byte[] ref = refSeqs.get(target.getContig()); final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length()); if (!refString.equalsIgnoreCase(allele.getBaseString())) { mismatchesReference = true; } break; } } if (mismatchesReference) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make()); failedAlleleCheck++; } else { sorter.add(builder.make()); } } progress.record(ctx.getContig(), ctx.getStart()); } final NumberFormat pfmt = new DecimalFormat("0.0000%"); final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total); logger.info("Processed ", total, " variants."); logger.info(Long.toString(failedLiftover), " variants failed to liftover."); logger.info( Long.toString(failedAlleleCheck), " variants lifted over but had mismatching reference alleles after lift over."); logger.info(pct, " of variants were not successfully lifted over and written to the output."); rejects.close(); in.close(); //////////////////////////////////////////////////////////////////////// // Write the sorted outputs to the final output file //////////////////////////////////////////////////////////////////////// sorter.doneAdding(); progress = new ProgressLogger(logger, 1000000, "written"); logger.info("Writing out sorted records to final VCF."); for (final VariantContext ctx : sorter) { out.add(ctx); progress.record(ctx.getContig(), ctx.getStart()); } out.close(); sorter.cleanup(); return null; }
@Override protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException { AbstractVCFCodec codeIn3 = VCFUtils.createDefaultVCFCodec(); String line; StringWriter sw = new StringWriter(); LOG.info("opening tabix file: " + this.TABIX); TabixReader tabix = new TabixReader(this.TABIX); while ((line = tabix.readLine()) != null) { if (!line.startsWith(VCFHeader.HEADER_INDICATOR)) { break; } sw.append(line).append("\n"); } VCFHeader header3 = (VCFHeader) codeIn3.readActualHeader( new LineIteratorImpl( LineReaderUtil.fromBufferedStream( new ByteArrayInputStream(sw.toString().getBytes())))); VCFHeader header1 = r.getHeader(); VCFHeader h2 = new VCFHeader(header1.getMetaDataInInputOrder(), header1.getSampleNamesInOrder()); for (String infoId : this.INFO_IDS) { VCFInfoHeaderLine vihl = header3.getInfoHeaderLine(infoId); if (vihl == null) { LOG.warn("Not INFO=" + infoId + " in " + TABIX); continue; } if (h2.getInfoHeaderLine(infoId) != null) { LOG.warn("Input already contains INFO=" + vihl); } h2.addMetaDataLine(vihl); } if (ALT_CONFLICT_FLAG != null) { h2.addMetaDataLine( new VCFInfoHeaderLine( ALT_CONFLICT_FLAG, 1, VCFHeaderLineType.Flag, "conflict ALT allele with " + this.TABIX)); } w.writeHeader(h2); while (r.hasNext()) { VariantContext ctx1 = r.next(); VariantContextBuilder vcb = new VariantContextBuilder(ctx1); String line2; String BEST_ID = null; boolean best_id_match_alt = false; List<VariantContext> variantsList = new ArrayList<VariantContext>(); int[] array = tabix.parseReg(ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd())); TabixReader.Iterator iter = null; if (array != null && array.length == 3 && array[0] != -1 && array[1] >= 0 && array[2] >= 0) { iter = tabix.query(array[0], array[1], array[2]); } else { LOG.info("Cannot get " + ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd())); } while (iter != null && (line2 = iter.next()) != null) { VariantContext ctx3 = codeIn3.decode(line2); if (ctx3.getStart() != ctx1.getStart()) continue; if (ctx3.getEnd() != ctx1.getEnd()) continue; if (ctx1.getReference().equals(ctx3.getReference()) && ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) { variantsList.clear(); variantsList.add(ctx3); break; } else { variantsList.add(ctx3); } } for (VariantContext ctx3 : variantsList) { if (this.REF_ALLELE_MATTERS && !ctx1.getReference().equals(ctx3.getReference())) { continue; } if (this.ALT_ALLELES_MATTERS && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) { continue; } if (ctx3.getID() != null && this.REPLACE_ID) { if (BEST_ID != null && best_id_match_alt) { // nothing } else { BEST_ID = ctx3.getID(); best_id_match_alt = ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles()); } } for (String id : this.INFO_IDS) { Object info3 = ctx3.getAttribute(id); if (info3 == null) { continue; } Object info1 = ctx1.getAttribute(id); if (info1 != null && !this.REPLACE_INFO_FIELD) { continue; } vcb.attribute(id, info3); } if (ALT_CONFLICT_FLAG != null && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) { vcb.attribute(ALT_CONFLICT_FLAG, true); } } if (BEST_ID != null) { vcb.id(BEST_ID); } w.add(vcb.make()); } tabix.close(); }