Ejemplo n.º 1
0
  /**
   * Merges the headers and records of the given input files with a coordinate-sorted {@link
   * SamFileHeaderMerger} / {@link MergingSamRecordIterator}, writes the merged result as SAM text
   * into memory, and compares it line by line against the contents of the expected output file.
   *
   * <p>Lines starting with {@code @} are compared with {@code headersEquivalent}; all other lines
   * are compared as unordered collections of the pieces produced by splitting each line.
   *
   * @param inputFiles the files to merge
   * @param expectedOutputFile text file holding the expected merged SAM output
   * @throws IOException if the expected output file cannot be read
   */
  @Test(dataProvider = "data")
  public void testProgramGroupAndReadGroupMerge(File inputFiles[], File expectedOutputFile)
      throws IOException {

    // Read the expected output, normalising every line terminator to "\n". The reader is closed
    // in a finally block so a failed read does not leak the file handle.
    final StringBuilder expectedText = new StringBuilder();
    final BufferedReader expectedReader = new BufferedReader(new FileReader(expectedOutputFile));
    try {
      String line;
      while ((line = expectedReader.readLine()) != null) {
        expectedText.append(line).append("\n");
      }
    } finally {
      expectedReader.close();
    }
    String expected_output = expectedText.toString();

    final List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
    final String actual_output;
    try {
      for (final File inFile : inputFiles) {
        IOUtil.assertFileIsReadable(inFile);
        final SAMFileReader in = new SAMFileReader(inFile);
        // Register the reader immediately so it is closed even if a later step throws.
        readers.add(in);
        // We are now checking for zero-length reads, so suppress complaint about that.
        in.setValidationStringency(ValidationStringency.SILENT);
        headers.add(in.getFileHeader());
      }

      final SamFileHeaderMerger headerMerger =
          new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true);
      final MergingSamRecordIterator iterator =
          new MergingSamRecordIterator(headerMerger, readers, false);

      // Write the merged records as SAM text into an in-memory buffer.
      final ByteArrayOutputStream baos = new ByteArrayOutputStream();
      final SAMFileWriter writer =
          new SAMFileWriterFactory().makeSAMWriter(headerMerger.getMergedHeader(), true, baos);
      while (iterator.hasNext()) {
        writer.addAlignment(iterator.next());
      }
      writer.close();

      actual_output = StringUtil.bytesToString(baos.toByteArray());
    } finally {
      // Release the underlying input files regardless of how the merge ended.
      for (final SAMFileReader samReader : readers) {
        samReader.close();
      }
    }

    List<String> actual = Arrays.asList(actual_output.split("\\n"));
    List<String> expected = Arrays.asList(expected_output.split("\\n"));
    // Fail with a readable message instead of an IndexOutOfBoundsException from actual.get(i).
    Assert.assertTrue(
        actual.size() >= expected.size(),
        "Merged output has " + actual.size() + " lines but " + expected.size() + " were expected");
    for (int i = 0; i < expected.size(); i++) {
      if (expected.get(i).startsWith("@")) {
        Assert.assertTrue(headersEquivalent(actual.get(i), expected.get(i)));
      } else {
        // NOTE(review): "\\s*" also matches the empty string, so split() breaks the line between
        // every character and this check compares character sets rather than whole fields.
        // "\\s+" was probably intended; left unchanged here because tightening the comparison
        // could alter which inputs pass - confirm and fix separately.
        List<String> expectedSamParts = Arrays.asList(expected.get(i).split("\\s*"));
        List<String> actualSamParts = Arrays.asList(actual.get(i).split("\\s*"));
        for (String exp : expectedSamParts) {
          Assert.assertTrue(actualSamParts.contains(exp));
        }
        for (String act : actualSamParts) {
          Assert.assertTrue(expectedSamParts.contains(act));
        }
      }
    }
  }
Ejemplo n.º 2
0
 /**
  * Fills {@code recs} with one {@link FastqRecord} per entry of {@code indices}.
  *
  * <p>For each position {@code i}, the read at {@code indices[i]} is fetched from the cluster, its
  * bases are converted to a String with every {@code '.'} replaced by {@code 'N'}, and its
  * qualities are converted with {@code SAMUtils.phredToFastq}. The result is stored in {@code
  * recs[i]}.
  *
  * @param recs destination array; must have at least {@code indices.length} slots
  * @param indices read indices to pull from {@code cluster}
  * @param cluster the cluster supplying the reads
  * @param appendReadNumberSuffix if true, the 1-based position within {@code indices} is passed
  *     to the read name encoder; otherwise {@code null} is passed
  */
 private void makeFastqRecords(
     final FastqRecord[] recs,
     final int[] indices,
     final ClusterData cluster,
     final boolean appendReadNumberSuffix) {
   // An int counter is used deliberately: a short counter would wrap to a negative value (and
   // then index recs/indices out of bounds) once indices.length exceeds Short.MAX_VALUE.
   for (int i = 0; i < indices.length; ++i) {
     final ReadData readData = cluster.getRead(indices[i]);
     final String readBases = StringUtil.bytesToString(readData.getBases()).replace('.', 'N');
     // Boxed explicitly so that "no suffix" can be expressed as null.
     final Integer readNumber = appendReadNumberSuffix ? Integer.valueOf(i + 1) : null;
     final String readName = readNameEncoder.generateReadName(cluster, readNumber);
     recs[i] =
         new FastqRecord(
             readName, readBases, null, SAMUtils.phredToFastq(readData.getQualities()));
   }
 }
Ejemplo n.º 3
0
 /**
  * Writes a temporary FASTA file whose last sequence line ends in a space and a tab, then checks
  * that the sequence read back has the expected name and that its bases equal the two sequence
  * lines concatenated, without the trailing whitespace.
  *
  * @throws Exception if the temporary file cannot be created, written or read
  */
 @Test
 public void testTrailingWhitespace() throws Exception {
   final File fasta = File.createTempFile("test", ".fasta");
   fasta.deleteOnExit();
   final String chr1 = "chr1";
   final String sequence = "ACGTACGT";

   // Close the writer even if a write fails so the temp file handle is not leaked.
   final PrintWriter writer = new PrintWriter(fasta);
   try {
     writer.println(">" + chr1);
     writer.println(sequence);
     writer.println(sequence + " \t");
   } finally {
     writer.close();
   }

   // Close the reader when done; an open handle can stop the temp file being deleted on exit.
   final FastaSequenceFile fastaReader = new FastaSequenceFile(fasta, true);
   try {
     final ReferenceSequence referenceSequence = fastaReader.nextSequence();
     Assert.assertEquals(referenceSequence.getName(), chr1);
     Assert.assertEquals(
         StringUtil.bytesToString(referenceSequence.getBases()), sequence + sequence);
   } finally {
     fastaReader.close();
   }
 }
Ejemplo n.º 4
0
  /**
   * Lifts the variants in INPUT over to the coordinates of REFERENCE_SEQUENCE using the CHAIN
   * file, writing successfully lifted records (sorted) to OUTPUT and all others to REJECT.
   *
   * <p>A record is rejected when the liftover returns no target interval, or when the reference
   * allele of the lifted record does not match the target reference bases (case-insensitive).
   * Rejected records are written with their original coordinates and an added filter.
   *
   * @return always {@code null}
   */
  @Override
  protected Object doWork() {
    // Fail fast if any input cannot be read or any output cannot be written.
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);

    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    // NOTE(review): the second argument is presumably "require index" - confirm against
    // VCFFileReader.
    final VCFFileReader in = new VCFFileReader(INPUT, false);

    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    // Hold the bases of every sequence in the reference dictionary in memory, keyed by sequence
    // name, so reference alleles can be checked after liftover.
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
      refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    // NOTE(review): walker.getSequenceDictionary() is still called below, after this close -
    // confirm the dictionary remains available on a closed walker.
    CloserUtil.close(walker);

    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    // The output header is a copy of the input header with the sequence dictionary replaced by
    // the one from the target reference.
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(walker.getSequenceDictionary());
    final VariantContextWriter out =
        new VariantContextWriterBuilder()
            .setOption(Options.INDEX_ON_THE_FLY)
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(walker.getSequenceDictionary())
            .build();
    out.writeHeader(outHeader);

    // The reject writer has on-the-fly indexing switched off; its header is the input header
    // plus the filter lines in FILTERS.
    final VariantContextWriter rejects =
        new VariantContextWriterBuilder()
            .setOutputFile(REJECT)
            .unsetOption(Options.INDEX_ON_THE_FLY)
            .build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);

    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");

    // Lifted records are collected here and ordered with the output header's record comparator
    // before being written.
    final SortingCollection<VariantContext> sorter =
        SortingCollection.newInstance(
            VariantContext.class,
            new VCFRecordCodec(outHeader),
            outHeader.getVCFRecordComparator(),
            MAX_RECORDS_IN_RAM,
            TMP_DIR);

    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");

    for (final VariantContext ctx : in) {
      ++total;
      // Describe the variant's span as an interval named "contig:start-end".
      final Interval source =
          new Interval(
              ctx.getContig(),
              ctx.getStart(),
              ctx.getEnd(),
              false,
              ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
      // NOTE(review): 1.0 is presumably the minimum fraction of the interval that must lift
      // over - confirm against LiftOver.liftOver.
      final Interval target = liftOver.liftOver(source, 1.0);

      if (target == null) {
        // No target interval: write the original record to the rejects file with a filter added.
        rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
        failedLiftover++;
      } else {
        // Fix the alleles if we went from positive to negative strand
        final List<Allele> alleles = new ArrayList<>();
        for (final Allele oldAllele : ctx.getAlleles()) {
          if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
            alleles.add(oldAllele);
          } else {
            alleles.add(
                Allele.create(
                    SequenceUtil.reverseComplement(oldAllele.getBaseString()),
                    oldAllele.isReference()));
          }
        }

        // Build the new variant context
        final VariantContextBuilder builder =
            new VariantContextBuilder(
                ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);

        // Carry over everything that does not depend on position.
        builder.id(ctx.getID());
        builder.attributes(ctx.getAttributes());
        builder.genotypes(ctx.getGenotypes());
        builder.filters(ctx.getFilters());
        builder.log10PError(ctx.getLog10PError());

        // Check that the reference allele still agrees with the reference sequence
        boolean mismatchesReference = false;
        for (final Allele allele : builder.getAlleles()) {
          if (allele.isReference()) {
            // NOTE(review): refSeqs.get() returns null if the target contig is not in the
            // loaded reference - confirm that cannot happen for a valid chain/reference pair.
            final byte[] ref = refSeqs.get(target.getContig());
            // target.getStart() is shifted by one to index into the byte array.
            final String refString =
                StringUtil.bytesToString(ref, target.getStart() - 1, target.length());

            if (!refString.equalsIgnoreCase(allele.getBaseString())) {
              mismatchesReference = true;
            }

            // Only the first reference allele found is checked.
            break;
          }
        }

        if (mismatchesReference) {
          // The original (un-lifted) record is what gets written to the rejects file.
          rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
          failedAlleleCheck++;
        } else {
          sorter.add(builder.make());
        }
      }

      progress.record(ctx.getContig(), ctx.getStart());
    }

    // NOTE(review): when the input has no records, total is 0 and this floating-point division
    // yields NaN, so the logged percentage is not a number.
    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed ", total, " variants.");
    logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
    logger.info(
        Long.toString(failedAlleleCheck),
        " variants lifted over but had mismatching reference alleles after lift over.");
    logger.info(pct, " of variants were not successfully lifted over and written to the output.");

    rejects.close();
    in.close();

    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");

    for (final VariantContext ctx : sorter) {
      out.add(ctx);
      progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();

    return null;
  }
Ejemplo n.º 5
0
 /**
  * Converts the bases held by this ReferenceSequence into a String.
  *
  * <p>The conversion copies the bases and widens each one to a two-byte character, so it is
  * meant as a convenience for short sequences such as those returned by {@link
  * ReferenceSequenceFile#getSubsequenceAt(String, long, long)}, not for very long reference
  * sequences.
  *
  * @return the bases of this ReferenceSequence as a String
  */
 public String getBaseString() {
   final String baseString = StringUtil.bytesToString(bases);
   return baseString;
 }