Пример #1
0
  @Test(dataProvider = "loadReadsADAM", groups = "spark")
  public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException {
    // Since the test requires that we not create the actual output directory in advance,
    // we instead create its parent directory and mark it for deletion on exit. This protects
    // us from naming collisions across multiple instances of the test suite.
    final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent");
    final File outputDirectory = new File(outputParentDirectory, outputDirectoryName);

    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null);
    SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null);

    ReadsSparkSink.writeReads(
        ctx, outputDirectory.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.ADAM);

    JavaRDD<GATKRead> rddParallelReads2 =
        readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header);
    Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count());

    // Test the round trip
    List<GATKRead> samList = rddParallelReads.collect();
    List<GATKRead> adamList = rddParallelReads2.collect();
    Comparator<GATKRead> comparator = new ReadCoordinateComparator(header);
    samList.sort(comparator);
    adamList.sort(comparator);
    for (int i = 0; i < samList.size(); i++) {
      SAMRecord expected = samList.get(i).convertToSAMRecord(header);
      SAMRecord observed = adamList.get(i).convertToSAMRecord(header);
      // manually test equality of some fields, as there are issues with roundtrip BAM -> ADAM ->
      // BAM
      // see https://github.com/bigdatagenomics/adam/issues/823
      Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname");
      Assert.assertEquals(
          observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart");
      Assert.assertEquals(
          observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd");
      Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags");
      Assert.assertEquals(
          observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality");
      Assert.assertEquals(
          observed.getMateAlignmentStart(),
          expected.getMateAlignmentStart(),
          "getMateAlignmentStart");
      Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar");
    }
  }
Пример #2
0
  /**
   * @param read a read containing the variant
   * @return the number of hard clipped and low qual bases at the read start (where start is the
   *     leftmost end w.r.t. the reference)
   */
  public static int getNumClippedBasesAtStart(final SAMRecord read) {
    // check for hard clips (never consider these bases):
    final Cigar c = read.getCigar();
    final CigarElement first = c.getCigarElement(0);

    int numStartClippedBases = 0;
    if (first.getOperator() == CigarOperator.H) {
      numStartClippedBases = first.getLength();
    }
    final byte[] unclippedReadBases = read.getReadBases();
    final byte[] unclippedReadQuals = read.getBaseQualities();

    // Do a stricter base clipping than provided by CIGAR string, since this one may be too
    // conservative,
    // and may leave a string of Q2 bases still hanging off the reads.
    // TODO: this code may not even get used because HaplotypeCaller already hard clips low quality
    // tails
    for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
      if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
        numStartClippedBases++;
      else break;
    }

    return numStartClippedBases;
  }
Пример #3
0
  /**
   * HACK TO CREATE GATKSAMRECORD BASED ONLY A SAMRECORD FOR TESTING PURPOSES ONLY
   *
   * @param read
   */
  public GATKSAMRecord(final SAMRecord read) {
    super(read.getHeader());
    super.setReferenceIndex(read.getReferenceIndex());
    super.setAlignmentStart(read.getAlignmentStart());
    super.setReadName(read.getReadName());
    super.setMappingQuality(read.getMappingQuality());
    // indexing bin done below
    super.setCigar(read.getCigar());
    super.setFlags(read.getFlags());
    super.setMateReferenceIndex(read.getMateReferenceIndex());
    super.setMateAlignmentStart(read.getMateAlignmentStart());
    super.setInferredInsertSize(read.getInferredInsertSize());
    SAMReadGroupRecord samRG = read.getReadGroup();
    SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
    if (samAttr == null) {
      clearAttributes();
    } else {
      setAttributes(samAttr);
    }
    if (samRG != null) {
      GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
      setReadGroup(rg);
    }

    super.setFileSource(read.getFileSource());
    super.setReadName(read.getReadName());
    super.setCigarString(read.getCigarString());
    super.setReadBases(read.getReadBases());
    super.setBaseQualities(read.getBaseQualities());
    // From SAMRecord constructor: Do this after the above because setCigarString will clear it.
    GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
  }
Пример #4
0
  // we need to pad ref by at least the bandwidth / 2 on either side
  public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
    // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
    Pair<Integer, Integer> queryRange = calculateQueryRange(read);
    if (queryRange == null) return null; // read has Ns, or is completely clipped away

    int queryStart = queryRange.getFirst();
    int queryEnd = queryRange.getSecond();

    BAQCalculationResult baqResult =
        calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);

    // cap quals
    int readI = 0, refI = 0;
    for (CigarElement elt : read.getCigar().getCigarElements()) {
      int l = elt.getLength();
      switch (elt.getOperator()) {
        case N: // cannot handle these
          return null;
        case H:
        case P: // ignore pads and hard clips
          break;
        case S:
          refI += l; // move the reference too, in addition to I
        case I:
          // todo -- is it really the case that we want to treat I and S the same?
          for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i];
          readI += l;
          break;
        case D:
          refI += l;
          break;
        case M:
          for (int i = readI; i < readI + l; i++) {
            int expectedPos = refI - refOffset + (i - readI);
            baqResult.bq[i] =
                capBaseByBAQ(
                    baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
          }
          readI += l;
          refI += l;
          break;
        default:
          throw new ReviewedGATKException(
              "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
      }
    }
    if (readI != read.getReadLength()) // odd cigar string
    System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);

    return baqResult;
  }
Пример #5
0
  /**
   * Determine the appropriate start and stop offsets in the reads for the bases given the cigar
   * string
   *
   * @param read
   * @return
   */
  private final Pair<Integer, Integer> calculateQueryRange(SAMRecord read) {
    int queryStart = -1, queryStop = -1;
    int readI = 0;

    // iterate over the cigar elements to determine the start and stop of the read bases for the BAQ
    // calculation
    for (CigarElement elt : read.getCigar().getCigarElements()) {
      switch (elt.getOperator()) {
        case N:
          return null; // cannot handle these
        case H:
        case P:
        case D:
          break; // ignore pads, hard clips, and deletions
        case I:
        case S:
        case M:
        case EQ:
        case X:
          int prev = readI;
          readI += elt.getLength();
          if (includeClippedBases || elt.getOperator() != CigarOperator.S) {
            if (queryStart == -1) queryStart = prev;
            queryStop = readI;
          }
          // in the else case we aren't including soft clipped bases, so we don't update
          // queryStart or queryStop
          break;
        default:
          throw new ReviewedGATKException(
              "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
      }
    }

    if (queryStop == queryStart) {
      // this read is completely clipped away, and yet is present in the file for some reason
      // usually they are flagged as non-PF, but it's possible to push them through the BAM
      // System.err.printf("WARNING -- read is completely clipped away: " + read.format());
      return null;
    }

    return new Pair<Integer, Integer>(queryStart, queryStop);
  }
  public boolean filterOut(final SAMRecord read) {
    int alignedLength = 0;
    int softClipBlocks = 0;
    int minSoftClipBlocks = doNotRequireSoftclipsOnBothEnds ? 1 : 2;
    CigarOperator lastOperator = null;

    for (final CigarElement element : read.getCigar().getCigarElements()) {
      if (element.getOperator() == CigarOperator.S) {
        // Treat consecutive S blocks as a single one
        if (lastOperator != CigarOperator.S) {
          softClipBlocks += 1;
        }

      } else if (element
          .getOperator()
          .consumesReadBases()) { // M, I, X, and EQ (S was already accounted for above)
        alignedLength += element.getLength();
      }
      lastOperator = element.getOperator();
    }

    return (alignedLength < tooShort && softClipBlocks >= minSoftClipBlocks);
  }
Пример #7
0
  /**
   * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no
   * BAM record was found.
   */
  public long guessNextBAMRecordStart(long beg, long end) throws IOException {
    // Buffer what we need to go through.

    byte[] arr = new byte[MAX_BYTES_READ];

    this.inFile.seek(beg);
    int totalRead = 0;
    for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) {
      final int r = inFile.read(arr, totalRead, left);
      if (r < 0) break;
      totalRead += r;
      left -= r;
    }
    arr = Arrays.copyOf(arr, totalRead);

    this.in = new SeekableArrayStream(arr);

    this.bgzf = new BlockCompressedInputStream(this.in);
    this.bgzf.setCheckCrcs(true);

    this.bamCodec.setInputStream(bgzf);

    final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff);

    // cp: Compressed Position, indexes the entire BGZF input.
    for (int cp = 0; ; ++cp) {
      final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
      if (psz == null) return end;

      final int cp0 = cp = psz.pos;
      final long cp0Virt = (long) cp0 << 16;
      try {
        bgzf.seek(cp0Virt);

        // This has to catch Throwable, because it's possible to get an
        // OutOfMemoryError due to an overly large size.
      } catch (Throwable e) {
        // Guessed BGZF position incorrectly: try the next guess.
        continue;
      }

      // up: Uncompressed Position, indexes the data inside the BGZF block.
      for (int up = 0; ; ++up) {
        final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);

        if (up0 < 0) {
          // No BAM records found in the BGZF block: try the next BGZF
          // block.
          break;
        }

        // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
        // of records starting at (cp0,up0).
        bgzf.seek(cp0Virt | up0);
        boolean decodedAny = false;
        try {
          byte b = 0;
          int prevCP = cp0;
          while (b < BLOCKS_NEEDED_FOR_GUESS) {
            SAMRecord record = bamCodec.decode();
            if (record == null) {
              break;
            }
            record.getCigar(); // force decoding of CIGAR
            decodedAny = true;

            final int cp2 = (int) (bgzf.getFilePointer() >>> 16);
            if (cp2 != prevCP) {
              // The compressed position changed so we must be in a new
              // block.
              assert cp2 > prevCP;
              prevCP = cp2;
              ++b;
            }
          }

          // Running out of records to verify is fine as long as we
          // verified at least something. It should only happen if we
          // couldn't fill the array.
          if (b < BLOCKS_NEEDED_FOR_GUESS) {
            assert arr.length < MAX_BYTES_READ;
            if (!decodedAny) continue;
          }
        } catch (SAMFormatException e) {
          continue;
        } catch (FileTruncatedException e) {
          continue;
        } catch (OutOfMemoryError e) {
          continue;
        } catch (IllegalArgumentException e) {
          continue;
        } catch (RuntimeIOException e) {
          continue;
        } catch (RuntimeEOFException e) {
          // This can happen legitimately if the [beg,end) range is too
          // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut
          // off in the middle of a record. In that case, our stream
          // should have hit EOF as well. If we've then verified at least
          // something, go ahead with it and hope for the best.
          if (!decodedAny && this.in.eof()) continue;
        }

        return beg + cp0 << 16 | up0;
      }
    }
  }
  @Override
  public void execute() {
    log.info("Initializing kmer code map...");
    Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
    kmerCodeIndices.put('0', 1);
    kmerCodeIndices.put('A', 3);
    kmerCodeIndices.put('B', 4);
    kmerCodeIndices.put('C', 5);
    kmerCodeIndices.put('_', 6);
    kmerCodeIndices.put('.', 7);
    kmerCodeIndices.put('1', 9);

    Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
    kmerCodeNames.put('0', "ref0");
    kmerCodeNames.put('A', "repetitive");
    kmerCodeNames.put('B', "both");
    kmerCodeNames.put('C', "lowcoverage");
    kmerCodeNames.put('_', "lowconfidence");
    kmerCodeNames.put('.', "novel");
    kmerCodeNames.put('1', "ref1");

    if (KMER_CODE_NAMES != null) {
      for (Character c : kmerCodeNames.keySet()) {
        String cStr = String.valueOf(c);
        if (KMER_CODE_NAMES.containsKey(cStr)) {
          kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
        }
      }
    }

    for (Character c : kmerCodeNames.keySet()) {
      log.info("  {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
    }

    log.info("Loading annotated contigs...");
    Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
    int kmerSize = 0;

    if (ANN.length() > 0) {
      TableReader tr = new TableReader(ANN);
      for (Map<String, String> te : tr) {
        String contigName = te.get("contigName");

        if (kmerSize == 0) {
          kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
        }

        annotatedContigs.put(contigName, te);

        String[] ref0ToCanonicalExact =
            (te.get("ref0ToCanonicalExact").equals("NA")
                        || te.get("ref0ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref0ToCanonicalExact"))
                .split("[:-]");
        String[] ref1ToCanonicalExact =
            (te.get("ref1ToCanonicalExact").equals("NA")
                        || te.get("ref1ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref1ToCanonicalExact"))
                .split("[:-]");

        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref0ToCanonicalExact[0]
                + " "
                + ref0ToCanonicalExact[1]
                + " "
                + ref0ToCanonicalExact[2]
                + " radius1=0.8r");
        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref1ToCanonicalExact[0]
                + " "
                + ref1ToCanonicalExact[1]
                + " "
                + ref1ToCanonicalExact[2]
                + " radius2=0.6r");
      }
    }

    log.info("    contigs: {}", annotatedContigs.size());
    log.info("  kmer size: {}", kmerSize);

    log.info("Computing kmer inheritance information...");

    SAMFileHeader sfh = CONTIGS.getFileHeader();
    for (Character c : kmerCodeNames.keySet()) {
      SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
      rgr.setSample(kmerCodeNames.get(c));
      sfh.addReadGroup(rgr);
    }

    SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
    sfwf.setCreateIndex(true);
    SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

    TableWriter tw = new TableWriter(sout);

    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;
    for (SAMRecord contig : CONTIGS) {
      if (CONTIG_NAMES == null
          || CONTIG_NAMES.isEmpty()
          || CONTIG_NAMES.contains(contig.getReadName())) {
        Map<String, String> te = annotatedContigs.get(contig.getReadName());

        if (annotatedContigs.containsKey(contig.getReadName())) {
          String seq = contig.getReadString();

          // log.debug("  te: {}", te);

          String annSeq = te.get("seq");
          String kmerOrigin = te.get("kmerOrigin");

          Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
          for (int i = 0; i < kmerOrigin.length(); i++) {
            CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
            Character code = kmerOrigin.charAt(i);

            kmerCodes.put(kmer, code);
          }

          Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
          for (Character c : kmerCodeNames.keySet()) {
            kmerStats.put(c, 0);
          }

          boolean changed = false;

          // We want to be able to examine soft-clipped regions as well.
          List<CigarElement> ces = new ArrayList<CigarElement>();
          for (CigarElement ce : contig.getCigar().getCigarElements()) {
            if (ce.getOperator().equals(CigarOperator.S)) {
              ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
              changed = true;
            } else {
              ces.add(ce);
            }
          }

          if (changed) {
            CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

            if (firstCe.getOperator().equals(CigarOperator.S)) {
              contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
            }

            contig.setCigar(new Cigar(ces));
          }

          for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
            for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
              if (i + kmerSize < seq.length()) {
                CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

                SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
                skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

                List<CigarElement> cigarElements = new ArrayList<CigarElement>();
                cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
                Cigar cigar = new Cigar(cigarElements);

                skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
                skmer.setReferenceName(contig.getReferenceName());
                skmer.setCigar(cigar);
                skmer.setReadPairedFlag(false);
                skmer.setDuplicateReadFlag(false);
                skmer.setMateNegativeStrandFlag(false);
                skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
                skmer.setAttribute("RG", "none");
                skmer.setMappingQuality(0);

                Character c = kmerCodes.get(kmer);
                String codeName = kmerCodeNames.get(c);

                String parentReadGroupId = null;
                String sampleReadGroupId = null;
                for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                  if (rgr.getSample().equals(codeName)) {
                    parentReadGroupId = rgr.getReadGroupId();
                  }

                  if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                    sampleReadGroupId = rgr.getReadGroupId();
                  }
                }

                skmer.setAttribute(
                    "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
                skmer.setMappingQuality(99);

                sfw.addAlignment(skmer);

                kmerStats.put(c, kmerStats.get(c) + 1);

                IGVEntry igvEntry = new IGVEntry();
                igvEntry.chromosome = contig.getReferenceName();
                igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
                igvEntry.parentageName = kmerCodeNames.get(c);
                igvEntry.parentage = kmerCodeIndices.get(c);
                igvEntries.add(igvEntry);
              }
            }
          }

          if (!contig.isSecondaryOrSupplementary()) {
            beout.println(
                contig.getReferenceName()
                    + "\t"
                    + contig.getAlignmentStart()
                    + "\t"
                    + contig.getAlignmentEnd()
                    + "\t"
                    + contig.getReadName()
                    + "."
                    + contig.getReadGroup().getSample());

            if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
              log.info("  processed {}/{} contigs", numContigs, annotatedContigs.size());
            }
            numContigs++;
          }

          Map<String, String> stats = new LinkedHashMap<String, String>();
          stats.put("contigName", contig.getReadName());
          stats.put("sampleName", contig.getReadGroup().getSample());
          for (Character c : kmerCodeNames.keySet()) {
            stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
          }
          tw.addEntry(stats);
        }
      }
    }

    log.info("Writing kmer inheritance information...");
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry igvEntry : igvEntries) {
      out.printf(
          "%s\t%d\t%d\t%s\t%d\n",
          igvEntry.chromosome,
          igvEntry.start,
          igvEntry.start + 1,
          igvEntry.parentageName,
          igvEntry.parentage);
    }

    sfw.close();
  }