Example #1
Writes the input reads back out as a single file via ReadsSparkSink.writeReads with ReadsWriteFormat.SINGLE, then reads the result back and asserts that it is coordinate-sorted and contains the same number of reads.
  @Test(dataProvider = "loadReadsBAM", groups = "spark")
  public void readsSinkTest(String inputBam, String outputFileName, String outputFileExtension)
      throws IOException {
    final File outputFile = createTempFile(outputFileName, outputFileExtension);
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    // Load the input BAM as an RDD of reads, along with its header.
    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null);
    SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null);

    // Write the reads out as a single file.
    ReadsSparkSink.writeReads(
        ctx, outputFile.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.SINGLE);

    // Read the written file back in.
    JavaRDD<GATKRead> rddParallelReads2 =
        readSource.getParallelReads(outputFile.getAbsolutePath(), null);
    final List<GATKRead> writtenReads = rddParallelReads2.collect();

    final SAMRecordCoordinateComparator comparator = new SAMRecordCoordinateComparator();
    // Assert that the reads are sorted.
    final int size = writtenReads.size();
    for (int i = 0; i < size - 1; ++i) {
      final SAMRecord smaller = writtenReads.get(i).convertToSAMRecord(header);
      final SAMRecord larger = writtenReads.get(i + 1).convertToSAMRecord(header);
      final int compare = comparator.compare(smaller, larger);
      Assert.assertTrue(
          compare < 0,
          "Reads are out of order (compare="
              + compare
              + "): "
              + smaller.getSAMString()
              + " and "
              + larger.getSAMString());
    }
    // The round trip must preserve the total number of reads.
    Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count());
  }
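
Both BAM sink tests are driven by a TestNG data provider named loadReadsBAM, which is not shown in this listing. A minimal sketch of what such a provider could look like follows; the BAM path and output names are illustrative assumptions, not the actual GATK test resources (it also requires import org.testng.annotations.DataProvider).

  // Hypothetical sketch of the "loadReadsBAM" provider. The tuple layout matches
  // the (inputBam, outputFileName, outputFileExtension) parameters above, but the
  // values themselves are made up for illustration.
  @DataProvider(name = "loadReadsBAM")
  public Object[][] loadReadsBAM() {
    return new Object[][] {
      {"src/test/resources/example.bam", "readsSinkTest", ".bam"},
    };
  }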
Example #2
Repartitions the input into two partitions and writes them with ReadsWriteFormat.SHARDED, then checks that exactly two shards and no local .crc files are produced, and that no reads are lost.
  @Test(dataProvider = "loadReadsBAM", groups = "spark")
  public void readsSinkShardedTest(
      String inputBam, String outputFileName, String outputFileExtension) throws IOException {
    final File outputFile = createTempFile(outputFileName, outputFileExtension);
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null);
    rddParallelReads = rddParallelReads.repartition(2); // ensure that the output is in two shards
    SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null);

    // Write the reads as sharded output; each partition becomes one shard.
    ReadsSparkSink.writeReads(
        ctx, outputFile.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.SHARDED);
    // Count the shard files, ignoring hidden (".") and bookkeeping ("_") entries.
    int shards =
        outputFile.listFiles((dir, name) -> !name.startsWith(".") && !name.startsWith("_")).length;
    Assert.assertEquals(shards, 2);
    // check that no local .crc files are created
    int crcs =
        outputFile.listFiles((dir, name) -> name.startsWith(".") && name.endsWith(".crc")).length;
    Assert.assertEquals(crcs, 0);

    JavaRDD<GATKRead> rddParallelReads2 =
        readSource.getParallelReads(outputFile.getAbsolutePath(), null);
    // reads are not globally sorted, so don't test that
    Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count());
  }
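
The two filename filters above separate real shard data from bookkeeping files: with ReadsWriteFormat.SHARDED the output path is a directory of Hadoop-style part files. A hedged sketch of inspecting such a directory follows; part-file names like part-r-00000 are the usual Hadoop convention, shown for illustration rather than guaranteed by the API.

  // Sketch: list the data shards, skipping hidden (".") and bookkeeping ("_")
  // entries such as .crc checksums or a _SUCCESS marker (if the committer writes one).
  File[] shardFiles =
      outputFile.listFiles((dir, name) -> !name.startsWith(".") && !name.startsWith("_"));
  java.util.Arrays.sort(shardFiles);
  for (File shard : shardFiles) {
    System.out.println(shard.getName()); // e.g. part-r-00000, part-r-00001
  }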
Example #3
Writes the reads in ADAM format, reads them back with getADAMReads, and compares selected SAMRecord fields to verify the round trip.
  @Test(dataProvider = "loadReadsADAM", groups = "spark")
  public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException {
    // Since the test requires that we not create the actual output directory in advance,
    // we instead create its parent directory and mark it for deletion on exit. This protects
    // us from naming collisions across multiple instances of the test suite.
    final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent");
    final File outputDirectory = new File(outputParentDirectory, outputDirectoryName);

    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null);
    SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null);

    // Write the reads out in ADAM format.
    ReadsSparkSink.writeReads(
        ctx, outputDirectory.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.ADAM);

    // Read the ADAM output back and confirm the read count survives the round trip.
    JavaRDD<GATKRead> rddParallelReads2 =
        readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header);
    Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count());

    // Test the full round trip: collect both RDDs and sort them into the same
    // coordinate order so the records can be compared in lockstep.
    List<GATKRead> samList = rddParallelReads.collect();
    List<GATKRead> adamList = rddParallelReads2.collect();
    Comparator<GATKRead> comparator = new ReadCoordinateComparator(header);
    samList.sort(comparator);
    adamList.sort(comparator);
    for (int i = 0; i < samList.size(); i++) {
      SAMRecord expected = samList.get(i).convertToSAMRecord(header);
      SAMRecord observed = adamList.get(i).convertToSAMRecord(header);
      // Manually test equality of selected fields, as there are known issues with the
      // BAM -> ADAM -> BAM round trip; see https://github.com/bigdatagenomics/adam/issues/823
      Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname");
      Assert.assertEquals(
          observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart");
      Assert.assertEquals(
          observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd");
      Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags");
      Assert.assertEquals(
          observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality");
      Assert.assertEquals(
          observed.getMateAlignmentStart(),
          expected.getMateAlignmentStart(),
          "getMateAlignmentStart");
      Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar");
    }
  }
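
The ADAM test uses its own provider, loadReadsADAM, which supplies only an input BAM and an output directory name. A hypothetical sketch, parallel to the loadReadsBAM one above, with illustrative values:

  // Hypothetical sketch of the "loadReadsADAM" provider; values are made up.
  @DataProvider(name = "loadReadsADAM")
  public Object[][] loadReadsADAM() {
    return new Object[][] {
      {"src/test/resources/example.bam", "readsSinkADAMTest"},
    };
  }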