@Test(dataProvider = "data") public void testProgramGroupAndReadGroupMerge(File inputFiles[], File expectedOutputFile) throws IOException { BufferedReader reader = new BufferedReader(new FileReader(expectedOutputFile)); String line; String expected_output = ""; while ((line = reader.readLine()) != null) { expected_output += line + "\n"; } final List<SAMFileReader> readers = new ArrayList<SAMFileReader>(); final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(); for (final File inFile : inputFiles) { IOUtil.assertFileIsReadable(inFile); final SAMFileReader in = new SAMFileReader(inFile); // We are now checking for zero-length reads, so suppress complaint about that. in.setValidationStringency(ValidationStringency.SILENT); readers.add(in); headers.add(in.getFileHeader()); } final MergingSamRecordIterator iterator; final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); iterator = new MergingSamRecordIterator(headerMerger, readers, false); ByteArrayOutputStream baos = new ByteArrayOutputStream(); SAMFileWriter writer = new SAMFileWriterFactory().makeSAMWriter(headerMerger.getMergedHeader(), true, baos); while (iterator.hasNext()) { writer.addAlignment(iterator.next()); } writer.close(); String actual_output = StringUtil.bytesToString(baos.toByteArray()); List<String> actual = Arrays.asList(actual_output.split("\\n")); List<String> expected = Arrays.asList(expected_output.split("\\n")); for (int i = 0; i < expected.size(); i++) { if (expected.get(i).startsWith("@")) { Assert.assertTrue(headersEquivalent(actual.get(i), expected.get(i))); } else { List<String> expectedSamParts = Arrays.asList(expected.get(i).split("\\s*")); List<String> actualSamParts = Arrays.asList(actual.get(i).split("\\s*")); for (String exp : expectedSamParts) { Assert.assertTrue(actualSamParts.contains(exp)); } for (String act : actualSamParts) { Assert.assertTrue(expectedSamParts.contains(act)); } } } }
/**
 * Populates {@code recs} with one FastqRecord per requested read of the cluster.
 *
 * @param recs output array, filled in parallel with {@code indices}
 * @param indices read numbers within the cluster to extract
 * @param cluster the cluster whose reads are converted
 * @param appendReadNumberSuffix when true, passes the 1-based read number to the
 *        read-name encoder; when false, passes null
 */
private void makeFastqRecords(
        final FastqRecord[] recs,
        final int[] indices,
        final ClusterData cluster,
        final boolean appendReadNumberSuffix) {
    for (short slot = 0; slot < indices.length; ++slot) {
        final ReadData readData = cluster.getRead(indices[slot]);
        // Replace '.' with 'N' in the base string — presumably '.' marks a no-call
        // in the upstream data; TODO(review): confirm against the source format.
        final String readBases = StringUtil.bytesToString(readData.getBases()).replace('.', 'N');
        final Integer readNumber = appendReadNumberSuffix ? Integer.valueOf(slot + 1) : null;
        final String readName = readNameEncoder.generateReadName(cluster, readNumber);
        final String quality = SAMUtils.phredToFastq(readData.getQualities());
        recs[slot] = new FastqRecord(readName, readBases, null, quality);
    }
}
/**
 * Verifies that trailing whitespace on a FASTA sequence line is stripped: two copies of the
 * same sequence, the second with trailing " \t", must read back as exactly
 * {@code sequence + sequence}.
 */
@Test
public void testTrailingWhitespace() throws Exception {
    final File fasta = File.createTempFile("test", ".fasta");
    fasta.deleteOnExit();
    final String chr1 = "chr1";
    final String sequence = "ACGTACGT";
    // BUG FIX: close the writer even if a println throws, so the temp file is flushed/released.
    final PrintWriter writer = new PrintWriter(fasta);
    try {
        writer.println(">" + chr1);
        writer.println(sequence);
        writer.println(sequence + " \t"); // trailing whitespace the reader must strip
    } finally {
        writer.close();
    }
    // BUG FIX: the FASTA reader was never closed, leaking a file handle.
    final FastaSequenceFile fastaReader = new FastaSequenceFile(fasta, true);
    try {
        final ReferenceSequence referenceSequence = fastaReader.nextSequence();
        Assert.assertEquals(referenceSequence.getName(), chr1);
        Assert.assertEquals(
                StringUtil.bytesToString(referenceSequence.getBases()), sequence + sequence);
    } finally {
        fastaReader.close();
    }
}
@Override protected Object doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); IOUtil.assertFileIsReadable(CHAIN); IOUtil.assertFileIsWritable(OUTPUT); IOUtil.assertFileIsWritable(REJECT); //////////////////////////////////////////////////////////////////////// // Setup the inputs //////////////////////////////////////////////////////////////////////// final LiftOver liftOver = new LiftOver(CHAIN); final VCFFileReader in = new VCFFileReader(INPUT, false); logger.info("Loading up the target reference genome."); final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); final Map<String, byte[]> refSeqs = new HashMap<>(); for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) { refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases()); } CloserUtil.close(walker); //////////////////////////////////////////////////////////////////////// // Setup the outputs //////////////////////////////////////////////////////////////////////// final VCFHeader inHeader = in.getFileHeader(); final VCFHeader outHeader = new VCFHeader(inHeader); outHeader.setSequenceDictionary(walker.getSequenceDictionary()); final VariantContextWriter out = new VariantContextWriterBuilder() .setOption(Options.INDEX_ON_THE_FLY) .setOutputFile(OUTPUT) .setReferenceDictionary(walker.getSequenceDictionary()) .build(); out.writeHeader(outHeader); final VariantContextWriter rejects = new VariantContextWriterBuilder() .setOutputFile(REJECT) .unsetOption(Options.INDEX_ON_THE_FLY) .build(); final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader()); for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line); rejects.writeHeader(rejectHeader); //////////////////////////////////////////////////////////////////////// // Read the input VCF, lift the records over and write to the sorting // collection. 
//////////////////////////////////////////////////////////////////////// long failedLiftover = 0, failedAlleleCheck = 0, total = 0; logger.info("Lifting variants over and sorting."); final SortingCollection<VariantContext> sorter = SortingCollection.newInstance( VariantContext.class, new VCFRecordCodec(outHeader), outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, TMP_DIR); ProgressLogger progress = new ProgressLogger(logger, 1000000, "read"); for (final VariantContext ctx : in) { ++total; final Interval source = new Interval( ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd()); final Interval target = liftOver.liftOver(source, 1.0); if (target == null) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make()); failedLiftover++; } else { // Fix the alleles if we went from positive to negative strand final List<Allele> alleles = new ArrayList<>(); for (final Allele oldAllele : ctx.getAlleles()) { if (target.isPositiveStrand() || oldAllele.isSymbolic()) { alleles.add(oldAllele); } else { alleles.add( Allele.create( SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference())); } } // Build the new variant context final VariantContextBuilder builder = new VariantContextBuilder( ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles); builder.id(ctx.getID()); builder.attributes(ctx.getAttributes()); builder.genotypes(ctx.getGenotypes()); builder.filters(ctx.getFilters()); builder.log10PError(ctx.getLog10PError()); // Check that the reference allele still agrees with the reference sequence boolean mismatchesReference = false; for (final Allele allele : builder.getAlleles()) { if (allele.isReference()) { final byte[] ref = refSeqs.get(target.getContig()); final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length()); if (!refString.equalsIgnoreCase(allele.getBaseString())) { mismatchesReference = 
true; } break; } } if (mismatchesReference) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make()); failedAlleleCheck++; } else { sorter.add(builder.make()); } } progress.record(ctx.getContig(), ctx.getStart()); } final NumberFormat pfmt = new DecimalFormat("0.0000%"); final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total); logger.info("Processed ", total, " variants."); logger.info(Long.toString(failedLiftover), " variants failed to liftover."); logger.info( Long.toString(failedAlleleCheck), " variants lifted over but had mismatching reference alleles after lift over."); logger.info(pct, " of variants were not successfully lifted over and written to the output."); rejects.close(); in.close(); //////////////////////////////////////////////////////////////////////// // Write the sorted outputs to the final output file //////////////////////////////////////////////////////////////////////// sorter.doneAdding(); progress = new ProgressLogger(logger, 1000000, "written"); logger.info("Writing out sorted records to final VCF."); for (final VariantContext ctx : sorter) { out.add(ctx); progress.record(ctx.getContig(), ctx.getStart()); } out.close(); sorter.cleanup(); return null; }
/**
 * Renders the bases of this ReferenceSequence as a String. The conversion copies the byte
 * array and widens each base to a two-byte char, so avoid calling this on very long
 * sequences; it is intended as a convenience for short sequences such as those returned by
 * {@link ReferenceSequenceFile#getSubsequenceAt(String, long, long)}.
 *
 * @return the bases of this ReferenceSequence as a String
 */
public String getBaseString() {
    final byte[] rawBases = bases;
    return StringUtil.bytesToString(rawBases);
}