Esempio n. 1
0
  /**
   * Round-trips reads through the ADAM sink: loads {@code inputBam}, writes it out in ADAM format,
   * reads the result back, and compares a handful of alignment fields read by read.
   *
   * @param inputBam path of the BAM supplied by the data provider
   * @param outputDirectoryName name of the ADAM output directory to create
   * @throws IOException declared for the I/O performed while writing and re-reading
   */
  @Test(dataProvider = "loadReadsADAM", groups = "spark")
  public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException {
    // The output directory itself must not exist before the write, so only a temporary parent
    // directory is created up front (and marked for deletion on exit). Giving every run its own
    // parent also avoids name collisions between concurrent instances of the test suite.
    final File parentDir = createTempDir(outputDirectoryName + "_parent");
    final String adamPath = new File(parentDir, outputDirectoryName).getAbsolutePath();

    final JavaSparkContext sparkContext = SparkContextFactory.getTestSparkContext();
    final ReadsSparkSource source = new ReadsSparkSource(sparkContext);

    final JavaRDD<GATKRead> originalReads = source.getParallelReads(inputBam, null);
    final SAMFileHeader header = ReadsSparkSource.getHeader(sparkContext, inputBam, null);

    ReadsSparkSink.writeReads(sparkContext, adamPath, originalReads, header, ReadsWriteFormat.ADAM);

    final JavaRDD<GATKRead> roundTrippedReads = source.getADAMReads(adamPath, null, header);
    Assert.assertEquals(originalReads.count(), roundTrippedReads.count());

    // Bring both sides into the same coordinate order before comparing them pairwise.
    final List<GATKRead> expectedReads = originalReads.collect();
    final List<GATKRead> observedReads = roundTrippedReads.collect();
    final Comparator<GATKRead> byCoordinate = new ReadCoordinateComparator(header);
    expectedReads.sort(byCoordinate);
    observedReads.sort(byCoordinate);

    int index = 0;
    for (final GATKRead expectedRead : expectedReads) {
      final SAMRecord expected = expectedRead.convertToSAMRecord(header);
      final SAMRecord observed = observedReads.get(index).convertToSAMRecord(header);
      index++;

      // Only selected fields are compared because a full BAM -> ADAM -> BAM round trip is known
      // to be lossy; see https://github.com/bigdatagenomics/adam/issues/823
      Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname");
      Assert.assertEquals(
          observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart");
      Assert.assertEquals(
          observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd");
      Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags");
      Assert.assertEquals(
          observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality");
      Assert.assertEquals(
          observed.getMateAlignmentStart(),
          expected.getMateAlignmentStart(),
          "getMateAlignmentStart");
      Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar");
    }
  }
Esempio n. 2
0
  /**
   * Computes the key for a record. Note: this is the only getKey function that handles unmapped
   * reads specially!
   *
   * <p>A record that is not flagged unmapped and has a non-negative reference index and
   * alignment start is keyed by {@code getKey(refIdx, start)}. Any other record is keyed with
   * {@code Integer.MAX_VALUE} combined with a MurmurHash3 hash of the record's content.
   *
   * @param rec the record to key
   * @return the key for {@code rec}
   */
  public static long getKey(final SAMRecord rec) {
    final int refIdx = rec.getReferenceIndex();
    final int start = rec.getAlignmentStart();

    final boolean hasPosition = !rec.getReadUnmappedFlag() && refIdx >= 0 && start >= 0;
    if (hasPosition) {
      return getKey(refIdx, start);
    }

    // Unmapped reads go to the end, but they must not all share one key, otherwise they could
    // not be spread over different reducers. A random number would do that best, but the same
    // record always has to produce the same key, so a fast hash is used instead. hashCode() is
    // avoided because its value is not guaranteed to be the same across processes.
    final byte[] rawRecord = rec.getVariableBinaryRepresentation();
    final int hash;
    if (rawRecord != null) {
      // Undecoded BAM record: hashing the raw bytes is enough.
      hash = (int) MurmurHash3.murmurhash3(rawRecord, 0);
    } else {
      // Decoded BAM record or any SAM record: chain a few representative fields, each hash
      // seeding the next one.
      int chained = (int) MurmurHash3.murmurhash3(rec.getReadName(), 0);
      chained = (int) MurmurHash3.murmurhash3(rec.getReadBases(), chained);
      chained = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), chained);
      chained = (int) MurmurHash3.murmurhash3(rec.getCigarString(), chained);
      hash = chained;
    }
    return getKey0(Integer.MAX_VALUE, hash);
  }
Esempio n. 3
0
  /**
   * HACK: creates a GATKSAMRecord based only on a SAMRecord. FOR TESTING PURPOSES ONLY.
   *
   * <p>The header is handed to the superclass constructor; reference index, alignment start, read
   * name, mapping quality, cigar, flags, mate information, inferred insert size, attributes, read
   * group, file source, bases and base qualities are then copied from {@code read} one by one.
   * The indexing bin is copied last because {@code setCigarString} clears it.
   *
   * @param read the record whose contents are copied into the new instance
   */
  public GATKSAMRecord(final SAMRecord read) {
    super(read.getHeader());
    super.setReferenceIndex(read.getReferenceIndex());
    super.setAlignmentStart(read.getAlignmentStart());
    super.setReadName(read.getReadName());
    super.setMappingQuality(read.getMappingQuality());
    // indexing bin done below
    super.setCigar(read.getCigar());
    super.setFlags(read.getFlags());
    super.setMateReferenceIndex(read.getMateReferenceIndex());
    super.setMateAlignmentStart(read.getMateAlignmentStart());
    super.setInferredInsertSize(read.getInferredInsertSize());
    SAMReadGroupRecord samRG = read.getReadGroup();
    SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
    // Mirror the source's attributes: none on the source means none on this record either.
    if (samAttr == null) {
      clearAttributes();
    } else {
      setAttributes(samAttr);
    }
    // The read group, when present, is wrapped in the GATK-specific read group type.
    if (samRG != null) {
      GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
      setReadGroup(rg);
    }

    super.setFileSource(read.getFileSource());
    // NOTE(review): the read name was already set above; this second call looks redundant.
    super.setReadName(read.getReadName());
    // NOTE(review): the cigar was already set via setCigar above; it is set again here from its
    // string form -- confirm whether both calls are needed.
    super.setCigarString(read.getCigarString());
    super.setReadBases(read.getReadBases());
    super.setBaseQualities(read.getBaseQualities());
    // From SAMRecord constructor: Do this after the above because setCigarString will clear it.
    GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
  }
  /**
   * Tells whether {@code cur} breaks coordinate order with respect to the record seen before it.
   *
   * <p>Nothing is checked when there is no previous record or when {@code cur} is flagged
   * unmapped; in both cases the result is {@code false}. Otherwise both records must carry a
   * reference index and an alignment start, and {@code cur} is out of order when it lies on an
   * earlier reference than {@code last}, or on the same reference at a smaller alignment start.
   *
   * @param last the previously seen record, or {@code null} if there is none
   * @param cur the record being checked
   * @return {@code true} if {@code cur} sorts strictly before {@code last}
   * @throws UserException.MalformedBAM if {@code last} or {@code cur} lacks a reference index or
   *     an alignment start; the exception is raised against the offending record
   */
  private boolean isOutOfOrder(final SAMRecord last, final SAMRecord cur) {
    if (last == null || cur.getReadUnmappedFlag()) {
      return false;
    }

    if (last.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
        || last.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
      throw new UserException.MalformedBAM(
          last, String.format("read %s has inconsistent mapping information.", last.format()));
    }
    if (cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
        || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
      // Report the record that actually failed the check (cur), not the previous one.
      throw new UserException.MalformedBAM(
          cur, String.format("read %s has inconsistent mapping information.", cur.format()));
    }

    return (last.getReferenceIndex() > cur.getReferenceIndex())
        || (last.getReferenceIndex().equals(cur.getReferenceIndex())
            && last.getAlignmentStart() > cur.getAlignmentStart());
  }
Esempio n. 5
0
 /**
  * Decides whether a read is a structural-variant candidate.
  *
  * <p>A read qualifies when it is not flagged as a proper pair, its mate is not flagged unmapped,
  * and either the mate's reference name differs from the read's or the two alignment starts are
  * more than {@code CombineChimera3.MAX_GAP_LENGTH} apart.
  *
  * @param read the record to inspect
  * @return {@code true} if the read is a candidate, {@code false} otherwise
  */
 private boolean isSvCandidate(SAMRecord read) {
   // Proper pairs and reads whose mate is unmapped are never candidates.
   if (read.getProperPairFlag() || read.getMateUnmappedFlag()) {
     return false;
   }

   final boolean mateOnOtherReference =
       !read.getReferenceName().equals(read.getMateReferenceName());
   if (mateOnOtherReference) {
     return true;
   }

   // Same reference: the pair only counts when the starts are further apart than the limit.
   final int startDistance = Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart());
   return startDistance > CombineChimera3.MAX_GAP_LENGTH;
 }
 /**
  * Counts a processed read and logs progress every {@code logSkipSize} reads.
  *
  * <p>A {@code null} record is ignored entirely: it is neither counted nor logged.
  *
  * @param record the read that was just processed, may be {@code null}
  */
 private void countRead(final SAMRecord record) {
   if (record == null) {
     return;
   }

   readCount++;
   if (readCount % logSkipSize != 0) {
     return;
   }

   // Progress checkpoint: report the running total and where the latest read is located.
   final String location =
       record.getReferenceName()
           + ":"
           + record.getAlignmentStart()
           + "-"
           + record.getAlignmentEnd();
   logger.info("Single reads processed: " + readCount);
   logger.info("Last read: " + location);
 }
Esempio n. 7
0
 /**
  * Tells whether a record should be kept given the configured query intervals.
  *
  * <p>Every record is kept when no intervals are configured, and so is every record that is
  * flagged unmapped and has no alignment start. A record flagged unmapped that does carry a
  * coordinate is kept when that coordinate falls inside one of the intervals on the same
  * reference sequence. All remaining cases are decided by the overlap detector.
  *
  * @param r the record to test
  * @return {@code true} if the record should be kept
  */
 private boolean overlaps(SAMRecord r) {
   if (intervals == null
       || (r.getReadUnmappedFlag() && r.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)) {
     return true;
   }
   if (r.getReadUnmappedFlag()) { // special case for unmapped reads with coordinate set
     final String readReference = r.getReferenceName();
     final int readStart = r.getStart();
     for (Locatable interval : intervals) {
       // The placement only counts as "in the query region" when the reference sequence
       // matches as well; comparing coordinates alone would also accept reads that are placed
       // on a different contig at a numerically overlapping position.
       final String intervalContig = interval.getContig();
       if (intervalContig != null
           && intervalContig.equals(readReference)
           && interval.getStart() <= readStart
           && interval.getEnd() >= readStart) {
         // This follows the behavior of htsjdk's SamReader which states that
         // "an unmapped read will be returned by this call if it has a coordinate for
         // the purpose of sorting that is in the query region".
         return true;
       }
     }
   }
   final Interval interval = new Interval(r.getContig(), r.getStart(), r.getEnd());
   Collection<Interval> overlaps = overlapDetector.getOverlaps(interval);
   return !overlaps.isEmpty();
 }
Esempio n. 8
0
  /**
   * Computes BAQ for a read against the reference window fetched from {@code refReader}.
   *
   * <p>The window spans from the read start minus half the band width minus the first insertion
   * offset, to the read end plus half the band width plus the last insertion offset. Read start
   * and end are the unclipped coordinates when {@code includeClippedBases} is set, the alignment
   * coordinates otherwise. The window start never goes below position 1.
   *
   * @param read the read to compute BAQ for
   * @param refReader indexed reference used to look up the contig length and fetch the bases
   * @return the result of the HMM calculation, or {@code null} if the window would extend past
   *     the end of the read's reference sequence
   */
  public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
    // start is alignment start - band width / 2 - size of first I element, if there is one.  Stop
    // is similar
    final int offset = getBandWidth() / 2;
    final long readStart = includeClippedBases ? read.getUnclippedStart() : read.getAlignmentStart();
    final long readStop = includeClippedBases ? read.getUnclippedEnd() : read.getAlignmentEnd();

    // Reference coordinates are 1-based, so the window is clamped to position 1; asking the
    // reference reader for position 0 would be outside the contig.
    final long start = Math.max(readStart - offset - ReadUtils.getFirstInsertionOffset(read), 1);
    final long stop = readStop + offset + ReadUtils.getLastInsertionOffset(read);

    final int contigLength =
        refReader
            .getSequenceDictionary()
            .getSequence(read.getReferenceName())
            .getSequenceLength();
    if (stop > contigLength) {
      return null;
    }

    // now that we have the start and stop, get the reference sequence covering it
    final ReferenceSequence refSeq =
        refReader.getSubsequenceAt(read.getReferenceName(), start, stop);
    // The last argument is the position of the window start relative to the read start.
    return calcBAQFromHMM(read, refSeq.getBases(), (int) (start - readStart));
  }
Esempio n. 9
0
  /**
   * Serializes the reads into one string and hands it to {@code assemble}, trying each
   * configured kmer size in turn until the result is something other than {@code "<REPEAT>"}.
   *
   * <p>Each read is encoded as the character with code 1, a strand digit ("1" for negative
   * strand, "0" otherwise), the bases and the base qualities. Reads shorter than
   * {@code readLength} have their bases padded with 'N' and their qualities padded with '!'.
   *
   * @param reads the reads of the region; the first and last element name the region
   * @return the output of the first {@code assemble} call that did not return
   *     {@code "<REPEAT>"}, the last output if all of them did, or an empty string when there
   *     are no kmer sizes to try
   */
  public String simpleAssemble(List<SAMRecord> reads) {
    final StringBuilder encodedReads = new StringBuilder();

    for (SAMRecord read : reads) {
      encodedReads.append((char) 1);
      encodedReads.append(read.getReadNegativeStrandFlag() ? "1" : "0");

      final String bases = read.getReadString();
      // Number of positions missing to reach readLength; zero or negative means no padding.
      final int missing = readLength - bases.length();

      encodedReads.append(bases);
      for (int pad = 0; pad < missing; pad++) {
        encodedReads.append('N');
      }

      encodedReads.append(read.getBaseQualityString());
      for (int pad = 0; pad < missing; pad++) {
        encodedReads.append('!');
      }
    }

    final SAMRecord lastRead = reads.get(reads.size() - 1);
    final SAMRecord firstRead = reads.get(0);
    final int regionStart = firstRead.getAlignmentStart();
    // Fall back to the last read's start when it reports no positive alignment end.
    final int lastEnd = lastRead.getAlignmentEnd();
    final int regionEnd = lastEnd > 0 ? lastEnd : lastRead.getAlignmentStart();

    final String output =
        "region_" + firstRead.getReferenceName() + "_" + regionStart + "_" + regionEnd;
    final String input = encodedReads.toString();

    String contigs = "";
    for (int kmer : kmers) {
      contigs =
          assemble(
              input,
              output + "_k" + kmer,
              output,
              1, // truncate_on_repeat
              maxContigs,
              maxPathsFromRoot,
              readLength,
              kmer,
              minKmerFrequency,
              minBaseQuality,
              minEdgeRatio,
              isDebug ? 1 : 0,
              maxNodes);

      // Anything other than the repeat marker is final; otherwise retry with the next kmer.
      if (!contigs.equals("<REPEAT>")) {
        return contigs;
      }
    }

    return contigs;
  }
  /**
   * Estimates genome coverage for increasing numbers of randomly chosen reads.
   *
   * <p>All reads of {@code BAM} that are not flagged unmapped are loaded and shuffled. For every
   * entry of {@code NUM_READS}, the first that many shuffled reads are laid onto a per-sequence
   * mask, and the number of covered bases, the total number of bases and their ratio are stored
   * in a table row and logged. The table is printed to {@code out} at the end.
   *
   * <p>An entry of {@code NUM_READS} larger than the number of loaded reads makes
   * {@code reads.get(i)} fail with an index-out-of-bounds exception.
   */
  @Override
  public void execute() {
    log.info("Loading reads...");

    List<SAMRecord> reads = new ArrayList<SAMRecord>();

    for (SAMRecord sr : BAM) {
      if (!sr.getReadUnmappedFlag()) {
        reads.add(sr);
      }
    }

    log.info("  {} reads loaded", reads.size());

    Collections.shuffle(reads);

    log.info("  shuffled");

    DataTable dt =
        new DataTable(
            "LanderWaterman",
            "Realistic Lander-Waterman stats",
            "reads",
            "bp_used",
            "bp_total",
            "pct_genome");

    for (int numReads : NUM_READS) {
      log.info("Selecting {} reads...", numReads);

      // One flag per base of every sequence in the dictionary; true once a read covers it.
      Map<String, boolean[]> utilizationMask = new HashMap<String, boolean[]>();

      for (SAMSequenceRecord ssr : BAM.getFileHeader().getSequenceDictionary().getSequences()) {
        utilizationMask.put(ssr.getSequenceName(), new boolean[ssr.getSequenceLength()]);
      }

      for (int i = 0; i < numReads; i++) {
        SAMRecord read = reads.get(i);
        boolean[] mask = utilizationMask.get(read.getReferenceName());

        // Alignment start and end are 1-based and inclusive, while the mask is 0-based:
        // position p lives at index p - 1, so the read covers indices [start - 1, end - 1].
        // The lower bound is clamped so a record without a proper start cannot index below 0.
        int firstIndex = Math.max(read.getAlignmentStart() - 1, 0);
        int endExclusive = read.getAlignmentEnd();
        for (int j = firstIndex; j < endExclusive; j++) {
          mask[j] = true;
        }
      }

      int basesUsed = 0, basesTotal = 0;

      for (boolean[] mask : utilizationMask.values()) {
        for (boolean covered : mask) {
          if (covered) {
            basesUsed++;
          }

          basesTotal++;
        }
      }

      float fractionCovered = (float) basesUsed / (float) basesTotal;

      dt.set("lw" + numReads, "reads", numReads);
      dt.set("lw" + numReads, "bp_used", basesUsed);
      dt.set("lw" + numReads, "bp_total", basesTotal);
      dt.set("lw" + numReads, "pct_genome", fractionCovered);

      log.info(
          "  reads: {}, bp_used: {}, bp_total: {}, pct_genome: {}",
          numReads,
          basesUsed,
          basesTotal,
          fractionCovered);
    }

    out.println(dt);
  }
  /**
   * Breaks annotated contigs into per-kmer alignments labelled with the origin code of each kmer.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Build the code-to-index and code-to-name maps; names can be overridden through
   *       {@code KMER_CODE_NAMES}.
   *   <li>Load the annotation table {@code ANN} (if its length is non-zero), keyed by contig name,
   *       derive the kmer size from the first row, and print two lines per row to {@code cout}.
   *   <li>Add one read group per code name to the header of {@code CONTIGS} and open a BAM writer
   *       on {@code bout} with index creation enabled.
   *   <li>For every record of {@code CONTIGS} that passes the optional {@code CONTIG_NAMES}
   *       filter and has an annotation: turn soft clips into matches, write one record per kmer
   *       to the BAM writer, collect an {@code IGVEntry} per kmer, print a tab-separated line to
   *       {@code beout} for records that are not secondary or supplementary, and add a row of
   *       per-code counts to the table writer on {@code sout}.
   *   <li>Print all collected IGV entries to {@code out} and close the BAM writer.
   * </ol>
   */
  @Override
  public void execute() {
    log.info("Initializing kmer code map...");
    // Numeric value written to the "Parentage" column for each single-character kmer code.
    Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
    kmerCodeIndices.put('0', 1);
    kmerCodeIndices.put('A', 3);
    kmerCodeIndices.put('B', 4);
    kmerCodeIndices.put('C', 5);
    kmerCodeIndices.put('_', 6);
    kmerCodeIndices.put('.', 7);
    kmerCodeIndices.put('1', 9);

    // Default display name per code; insertion order is kept and reused for the stats columns.
    Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
    kmerCodeNames.put('0', "ref0");
    kmerCodeNames.put('A', "repetitive");
    kmerCodeNames.put('B', "both");
    kmerCodeNames.put('C', "lowcoverage");
    kmerCodeNames.put('_', "lowconfidence");
    kmerCodeNames.put('.', "novel");
    kmerCodeNames.put('1', "ref1");

    // Optional overrides: replace the default name of any code present in KMER_CODE_NAMES.
    if (KMER_CODE_NAMES != null) {
      for (Character c : kmerCodeNames.keySet()) {
        String cStr = String.valueOf(c);
        if (KMER_CODE_NAMES.containsKey(cStr)) {
          kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
        }
      }
    }

    for (Character c : kmerCodeNames.keySet()) {
      log.info("  {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
    }

    log.info("Loading annotated contigs...");
    // Annotation rows keyed by contig name.
    Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
    int kmerSize = 0;

    if (ANN.length() > 0) {
      TableReader tr = new TableReader(ANN);
      for (Map<String, String> te : tr) {
        String contigName = te.get("contigName");

        // Derived once, from the first row: a sequence of length L holds L - k + 1 kmers, and
        // kmerOrigin has one character per kmer.
        if (kmerSize == 0) {
          kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
        }

        annotatedContigs.put(contigName, te);

        // Split "name:start-end" into its three parts; "NA" and "*:0-0" both become "NA:0-0".
        String[] ref0ToCanonicalExact =
            (te.get("ref0ToCanonicalExact").equals("NA")
                        || te.get("ref0ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref0ToCanonicalExact"))
                .split("[:-]");
        String[] ref1ToCanonicalExact =
            (te.get("ref1ToCanonicalExact").equals("NA")
                        || te.get("ref1ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref1ToCanonicalExact"))
                .split("[:-]");

        // One line per parental locus: "<sample>_<accession>_<contig> <name> <start> <end>"
        // followed by a fixed radius suffix.
        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref0ToCanonicalExact[0]
                + " "
                + ref0ToCanonicalExact[1]
                + " "
                + ref0ToCanonicalExact[2]
                + " radius1=0.8r");
        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref1ToCanonicalExact[0]
                + " "
                + ref1ToCanonicalExact[1]
                + " "
                + ref1ToCanonicalExact[2]
                + " radius2=0.6r");
      }
    }

    log.info("    contigs: {}", annotatedContigs.size());
    log.info("  kmer size: {}", kmerSize);

    log.info("Computing kmer inheritance information...");

    // Register one read group per code name (id and sample both set to the name) so that each
    // kmer record can be tagged with the group of its code.
    SAMFileHeader sfh = CONTIGS.getFileHeader();
    for (Character c : kmerCodeNames.keySet()) {
      SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
      rgr.setSample(kmerCodeNames.get(c));
      sfh.addReadGroup(rgr);
    }

    SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
    sfwf.setCreateIndex(true);
    SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

    // NOTE(review): tw is never closed or flushed in this method -- confirm TableWriter does not
    // need it.
    TableWriter tw = new TableWriter(sout);

    // Sorted set: entries are printed in IGVEntry's own ordering and equal entries collapse.
    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;
    for (SAMRecord contig : CONTIGS) {
      // An absent or empty CONTIG_NAMES means "process every contig".
      if (CONTIG_NAMES == null
          || CONTIG_NAMES.isEmpty()
          || CONTIG_NAMES.contains(contig.getReadName())) {
        Map<String, String> te = annotatedContigs.get(contig.getReadName());

        // Contigs without an annotation row are skipped.
        if (annotatedContigs.containsKey(contig.getReadName())) {
          String seq = contig.getReadString();

          // log.debug("  te: {}", te);

          String annSeq = te.get("seq");
          String kmerOrigin = te.get("kmerOrigin");

          // Map each kmer of the annotated sequence to the code at the same offset.
          Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
          for (int i = 0; i < kmerOrigin.length(); i++) {
            CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
            Character code = kmerOrigin.charAt(i);

            kmerCodes.put(kmer, code);
          }

          // Per-code kmer counts for this contig, all starting at zero.
          Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
          for (Character c : kmerCodeNames.keySet()) {
            kmerStats.put(c, 0);
          }

          boolean changed = false;

          // We want to be able to examine soft-clipped regions as well.
          List<CigarElement> ces = new ArrayList<CigarElement>();
          for (CigarElement ce : contig.getCigar().getCigarElements()) {
            if (ce.getOperator().equals(CigarOperator.S)) {
              ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
              changed = true;
            } else {
              ces.add(ce);
            }
          }

          if (changed) {
            CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

            // A leading soft clip that is now a match moves the alignment start to the left by
            // its length.
            if (firstCe.getOperator().equals(CigarOperator.S)) {
              contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
            }

            contig.setCigar(new Cigar(ces));
          }

          for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
            // NOTE(review): the upper bound (getReadStart() + getLength()) runs one position past
            // the block, and the "i + kmerSize < seq.length()" guard below leaves out the kmer
            // that ends exactly at the end of the sequence -- confirm both are intended.
            for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
              if (i + kmerSize < seq.length()) {
                CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

                // Synthetic record for this kmer: a single match block of kmerSize bases on the
                // contig's reference.
                SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
                skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

                List<CigarElement> cigarElements = new ArrayList<CigarElement>();
                cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
                Cigar cigar = new Cigar(cigarElements);

                skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
                skmer.setReferenceName(contig.getReferenceName());
                skmer.setCigar(cigar);
                skmer.setReadPairedFlag(false);
                skmer.setDuplicateReadFlag(false);
                skmer.setMateNegativeStrandFlag(false);
                skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
                // Placeholder values; both are overwritten further down.
                skmer.setAttribute("RG", "none");
                skmer.setMappingQuality(0);

                // NOTE(review): c is null when the kmer is not a key of kmerCodes; the
                // kmerStats update and the parentage lookup below would then hit a null --
                // confirm every kmer of the contig is expected to be annotated.
                Character c = kmerCodes.get(kmer);
                String codeName = kmerCodeNames.get(c);

                // Prefer the read group whose sample equals the code name; otherwise fall back
                // to the read group whose sample equals the contig's own sample.
                String parentReadGroupId = null;
                String sampleReadGroupId = null;
                for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                  if (rgr.getSample().equals(codeName)) {
                    parentReadGroupId = rgr.getReadGroupId();
                  }

                  if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                    sampleReadGroupId = rgr.getReadGroupId();
                  }
                }

                skmer.setAttribute(
                    "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
                skmer.setMappingQuality(99);

                sfw.addAlignment(skmer);

                kmerStats.put(c, kmerStats.get(c) + 1);

                // The IGV entry starts one below the alignment start used for the kmer record
                // above.
                IGVEntry igvEntry = new IGVEntry();
                igvEntry.chromosome = contig.getReferenceName();
                igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
                igvEntry.parentageName = kmerCodeNames.get(c);
                igvEntry.parentage = kmerCodeIndices.get(c);
                igvEntries.add(igvEntry);
              }
            }
          }

          // Only records that are neither secondary nor supplementary get a tab-separated
          // line (reference, start, end, "<name>.<sample>") and count towards progress.
          if (!contig.isSecondaryOrSupplementary()) {
            beout.println(
                contig.getReferenceName()
                    + "\t"
                    + contig.getAlignmentStart()
                    + "\t"
                    + contig.getAlignmentEnd()
                    + "\t"
                    + contig.getReadName()
                    + "."
                    + contig.getReadGroup().getSample());

            // Log progress about ten times over the run (only when there are more than ten
            // annotated contigs).
            if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
              log.info("  processed {}/{} contigs", numContigs, annotatedContigs.size());
            }
            numContigs++;
          }

          // One table row per processed record: contig name, sample name, then one count column
          // per code name.
          Map<String, String> stats = new LinkedHashMap<String, String>();
          stats.put("contigName", contig.getReadName());
          stats.put("sampleName", contig.getReadGroup().getSample());
          for (Character c : kmerCodeNames.keySet()) {
            stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
          }
          tw.addEntry(stats);
        }
      }
    }

    log.info("Writing kmer inheritance information...");
    // Header line followed by one single-base feature (start, start + 1) per collected entry.
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry igvEntry : igvEntries) {
      out.printf(
          "%s\t%d\t%d\t%s\t%d\n",
          igvEntry.chromosome,
          igvEntry.start,
          igvEntry.start + 1,
          igvEntry.parentageName,
          igvEntry.parentage);
    }

    sfw.close();
  }