Пример #1
0
  /**
   * Round-trip test for the ADAM write path: reads are loaded from {@code inputBam}, written with
   * {@link ReadsWriteFormat#ADAM}, loaded back, and compared with the originals.
   *
   * <p>Only a subset of fields is compared (name, alignment start/end, flags, mapping quality,
   * mate alignment start, CIGAR) because a full BAM -> ADAM -> BAM round trip is known to be
   * lossy; see https://github.com/bigdatagenomics/adam/issues/823
   *
   * @param inputBam path of the input reads file
   * @param outputDirectoryName name of the ADAM output directory, created under a fresh temp
   *     parent directory
   * @throws IOException declared for the read/write helpers used by the test
   */
  @Test(dataProvider = "loadReadsADAM", groups = "spark")
  public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException {
    // Since the test requires that we not create the actual output directory in advance,
    // we instead create its parent directory and mark it for deletion on exit. This protects
    // us from naming collisions across multiple instances of the test suite.
    final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent");
    final File outputDirectory = new File(outputParentDirectory, outputDirectoryName);
    final String outputPath = outputDirectory.getAbsolutePath();

    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null);
    SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null);

    ReadsSparkSink.writeReads(ctx, outputPath, rddParallelReads, header, ReadsWriteFormat.ADAM);

    JavaRDD<GATKRead> rddParallelReads2 = readSource.getADAMReads(outputPath, null, header);
    Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count());

    // Test the round trip.
    // The list handed back by collect() is not guaranteed to be modifiable, and List.sort needs
    // to write elements back into the list, so sort mutable copies instead.
    List<GATKRead> samList = new java.util.ArrayList<>(rddParallelReads.collect());
    List<GATKRead> adamList = new java.util.ArrayList<>(rddParallelReads2.collect());
    Comparator<GATKRead> comparator = new ReadCoordinateComparator(header);
    samList.sort(comparator);
    adamList.sort(comparator);
    Assert.assertEquals(adamList.size(), samList.size(), "number of collected reads");

    for (int i = 0; i < samList.size(); i++) {
      SAMRecord expected = samList.get(i).convertToSAMRecord(header);
      SAMRecord observed = adamList.get(i).convertToSAMRecord(header);
      // manually test equality of some fields, as there are issues with roundtrip
      // BAM -> ADAM -> BAM
      // see https://github.com/bigdatagenomics/adam/issues/823
      Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname");
      Assert.assertEquals(
          observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart");
      Assert.assertEquals(
          observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd");
      Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags");
      Assert.assertEquals(
          observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality");
      Assert.assertEquals(
          observed.getMateAlignmentStart(),
          expected.getMateAlignmentStart(),
          "getMateAlignmentStart");
      Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar");
    }
  }
Пример #2
0
  /**
   * Hack, for testing purposes only: creates a GATKSAMRecord based solely on an existing
   * SAMRecord.
   *
   * <p>The header is passed to the superclass constructor; reference index, alignment start, read
   * name, mapping quality, CIGAR, flags, mate reference index, mate alignment start, inferred
   * insert size, binary attributes, read group, file source, read bases and base qualities are
   * then copied from {@code read}. The indexing bin is copied last (see the note at the end of
   * the constructor body).
   *
   * @param read the record whose contents are copied into the new instance
   */
  public GATKSAMRecord(final SAMRecord read) {
    super(read.getHeader());
    super.setReferenceIndex(read.getReferenceIndex());
    super.setAlignmentStart(read.getAlignmentStart());
    super.setReadName(read.getReadName());
    super.setMappingQuality(read.getMappingQuality());
    // indexing bin done below
    super.setCigar(read.getCigar());
    super.setFlags(read.getFlags());
    super.setMateReferenceIndex(read.getMateReferenceIndex());
    super.setMateAlignmentStart(read.getMateAlignmentStart());
    super.setInferredInsertSize(read.getInferredInsertSize());
    SAMReadGroupRecord samRG = read.getReadGroup();
    // Copy the source read's binary attributes; when it has none, clear attributes on this record.
    SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
    if (samAttr == null) {
      clearAttributes();
    } else {
      setAttributes(samAttr);
    }
    // Wrap the source read group (if any) in the GATK-specific read group type.
    if (samRG != null) {
      GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
      setReadGroup(rg);
    }

    super.setFileSource(read.getFileSource());
    // NOTE(review): the read name was already set above; this second call looks redundant —
    // confirm before removing.
    super.setReadName(read.getReadName());
    // NOTE(review): the CIGAR was already set via setCigar above; it is set again here from its
    // string form — confirm whether both calls are needed.
    super.setCigarString(read.getCigarString());
    super.setReadBases(read.getReadBases());
    super.setBaseQualities(read.getBaseQualities());
    // From SAMRecord constructor: Do this after the above because setCigarString will clear it.
    GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
  }
Пример #3
0
 /**
  * Decides whether a read is evidence for a structural variant candidate.
  *
  * <p>A read qualifies when it is paired, not flagged as a proper pair, its mate is mapped, and
  * either the mate lies on a different reference sequence or the two alignment starts are more
  * than {@code CombineChimera3.MAX_GAP_LENGTH} apart.
  *
  * @param read the read to inspect
  * @return {@code true} if the read is an SV candidate, {@code false} otherwise (including for
  *     unpaired reads)
  */
 private boolean isSvCandidate(SAMRecord read) {
   // The pair-related flag getters reject unpaired reads with an IllegalStateException, so
   // unpaired reads must be screened out before any of them is queried.
   if (!read.getReadPairedFlag()) {
     return false;
   }

   if (read.getProperPairFlag() || read.getMateUnmappedFlag()) {
     return false;
   }

   // Mate on a different reference sequence.
   if (!read.getReferenceName().equals(read.getMateReferenceName())) {
     return true;
   }

   // Same reference sequence: candidate only if the mates are far apart.
   return Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart())
         > CombineChimera3.MAX_GAP_LENGTH;
 }
Пример #4
0
  /**
   * Assembles contigs for the given regions from the reads found in the input files and,
   * optionally, records structural variant candidate regions.
   *
   * <p>Processing order:
   *
   * <p>1. If no kmer sizes are configured, or the first one is below
   * {@code KmerSizeEvaluator.MIN_KMER}, a minimum kmer is identified for the regions and the
   * kmer array is rebuilt from it.
   *
   * <p>2. Reads are loaded per input. For each input the number of assembly trigger reads is
   * compared against {@code minCandidateCount}; exceeding it for any input enables assembly.
   * When SV search is enabled, the mate position of every SV candidate read is collected.
   *
   * <p>3. If assembly is enabled, the reads (downsampled per input to the desired read count) are
   * serialized into a single buffer and {@code assemble} is invoked once per kmer size until it
   * returns something other than {@code "<REPEAT>"}.
   *
   * <p>4. If SV search is enabled, the collected mate positions are sorted and clustered into
   * breakpoint candidate regions.
   *
   * @param inputFiles input files handed to {@code getReads}
   * @param output output file name stem; {@code "_k" + kmer} is appended for each assembly attempt
   * @param tempDir not used by this method
   * @param regions regions to assemble; the first region is used for the candidate threshold and
   *     for debug output
   * @param prefix prefix handed to {@code assemble}
   * @param checkForDupes not used by this method
   * @param realigner used to build the kmer array and handed to {@code getReads}
   * @param c2r reference comparison helper; {@code null} denotes the unaligned region, in which
   *     case assembly is always attempted
   * @return the result of the last {@code assemble} call, or the empty string if assembly was
   *     skipped
   * @throws RuntimeException wrapping any exception raised while loading reads or assembling
   */
  public String assembleContigs(
      List<String> inputFiles,
      String output,
      String tempDir,
      List<Feature> regions,
      String prefix,
      boolean checkForDupes,
      ReAligner realigner,
      CompareToReference2 c2r) {

    if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) {
      KmerSizeEvaluator kmerEval = new KmerSizeEvaluator();
      int minKmer = kmerEval.identifyMinKmer(readLength, c2r, regions);
      this.kmers = realigner.toKmerArray(minKmer, readLength);
    }

    String contigs = "";

    long start = System.currentTimeMillis();

    int readCount = 0;

    int minReadCount = Integer.MAX_VALUE;

    // if c2r is null, this is the unaligned region.
    boolean isAssemblyCandidate = c2r == null;

    try {

      List<List<SAMRecord>> readsList = getReads(inputFiles, regions, realigner);

      // Pass 1: decide whether assembly is warranted and collect SV candidate mate positions.
      for (List<SAMRecord> reads : readsList) {
        int candidateReadCount = 0;
        for (SAMRecord read : reads) {
          if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) {
            candidateReadCount++;
          }

          if (shouldSearchForSv && isSvCandidate(read)) {
            svCandidates.add(
                new Position(read.getMateReferenceName(), read.getMateAlignmentStart()));
          }
        }

        if (candidateReadCount > minCandidateCount(reads.size(), regions.get(0))) {
          isAssemblyCandidate = true;
        }

        minReadCount = Math.min(minReadCount, reads.size());

        readCount += reads.size();
      }

      // Local to this call, so the unsynchronized builder is sufficient.
      StringBuilder readBuffer = new StringBuilder();

      // Pass 2: serialize the (possibly downsampled) reads for the assembler.
      if (isAssemblyCandidate) {

        int downsampleTarget = desiredNumberOfReads(regions);

        // Emitted as a raw character; the first input gets code 1, the next 2, and so on.
        char sampleId = 1;

        for (List<SAMRecord> reads : readsList) {
          // Default to always keep
          double keepProbability = 1.1;

          if (reads.size() > downsampleTarget) {
            keepProbability = (double) downsampleTarget / (double) reads.size();
          }

          // Fixed seed, re-created per input, so downsampling is reproducible.
          Random random = new Random(1);

          for (SAMRecord read : reads) {
            if (random.nextDouble() < keepProbability) {
              readBuffer.append(sampleId);
              readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0");

              String bases = read.getReadString();
              // Reads shorter than readLength are right-padded with 'N' bases and '!' qualities.
              // Reads of exactly readLength (or longer) get no padding and are not truncated.
              // NOTE(review): presumably the assembler expects readLength-wide records, so
              // over-long reads may need handling — confirm.
              int padding = readLength - bases.length();

              readBuffer.append(bases);
              for (int i = 0; i < padding; i++) {
                readBuffer.append('N');
              }

              readBuffer.append(read.getBaseQualityString());
              for (int i = 0; i < padding; i++) {
                readBuffer.append('!');
              }
            }
          }

          // Make this set of reads eligible for GC
          reads.clear();
          sampleId += 1;
        }
      }

      readsList.clear();

      if (isAssemblyCandidate) {
        String serializedReads = readBuffer.toString();

        // Try each kmer size in turn until the assembler stops reporting a repeat.
        for (int kmer : kmers) {

          String outputFile = output + "_k" + kmer;

          contigs =
              assemble(
                  serializedReads,
                  outputFile,
                  prefix,
                  truncateOnRepeat ? 1 : 0,
                  maxContigs,
                  maxPathsFromRoot,
                  readLength,
                  kmer,
                  minKmerFrequency,
                  minBaseQuality,
                  minEdgeRatio,
                  isDebug ? 1 : 0,
                  maxNodes);

          if (!contigs.equals("<REPEAT>")) {
            break;
          }

          if (kmer >= readLength / 2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) {
            isCycleExceedingThresholdDetected = true;
          }
        }
      }

    } catch (Exception e) {
      // Printed as well as rethrown, matching the existing reporting behavior.
      e.printStackTrace();
      throw new RuntimeException(e);
    }

    if (this.shouldSearchForSv) {

      // Cluster sorted mate positions: consecutive positions on the same chromosome that are
      // closer than windowSize extend the current feature; anything else closes it.
      Collections.sort(this.svCandidates);
      Position last = null;
      String currentFeatureChr = null;
      int currentFeatureStart = -1;
      int currentFeatureStop = -1;
      int currentFeatureCount = 0;

      // TODO: Calc this dynamically
      int windowSize = 500;

      for (Position pos : this.svCandidates) {
        if ((last != null)
            && pos.getChromosome().equals(last.getChromosome())
            && Math.abs(pos.getPosition() - last.getPosition()) < windowSize) {

          if (currentFeatureChr == null) {
            // No open feature: start one spanning the previous and the current position.
            currentFeatureChr = pos.getChromosome();
            currentFeatureStart = last.getPosition();
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount = 1;
          } else {
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount++;
          }
        } else {
          if (currentFeatureChr != null) {
            // Close the open feature, keeping it only if it has enough supporting positions.
            if (currentFeatureCount
                > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) {
              Feature region =
                  new Feature(
                      currentFeatureChr,
                      currentFeatureStart - readLength,
                      currentFeatureStop + readLength);
              BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount);
              this.svCandidateRegions.add(candidate);
            }
            currentFeatureChr = null;
            currentFeatureStart = -1;
            currentFeatureStop = -1;
            currentFeatureCount = 0;
          } else {
            currentFeatureChr = pos.getChromosome();
            currentFeatureStart = pos.getPosition();
            currentFeatureStop = pos.getPosition() + readLength;
            currentFeatureCount = 1;
          }
        }
        last = pos;
      }

      // Don't forget last SV candidate region. Guarded like the in-loop flush so a region is
      // never built without a chromosome.
      if ((currentFeatureChr != null)
          && currentFeatureCount
              > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) {
        Feature region =
            new Feature(
                currentFeatureChr,
                currentFeatureStart - readLength,
                currentFeatureStop + readLength);
        BreakpointCandidate candidate = new BreakpointCandidate(region, currentFeatureCount);
        this.svCandidateRegions.add(candidate);
      }
    }

    long end = System.currentTimeMillis();

    // Reported in debug output only: the first configured kmer, or readLength + 1 if none.
    int kmer = readLength + 1;
    if (kmers.length > 0) {
      kmer = kmers[0];
    }

    if (isDebug) {
      System.err.println(
          "Elapsed_msecs_in_NativeAssembler\tRegion:\t"
              + regions.get(0).getDescriptor()
              + "\tLength:\t"
              + regions.get(0).getLength()
              + "\tReadCount:\t"
              + readCount
              + "\tElapsed\t"
              + (end - start)
              + "\tAssembled\t"
              + isAssemblyCandidate
              + "\t"
              + kmer);
    }

    return contigs;
  }