Example #1
  /**
   * Hack to create a GATKSAMRecord based only on a SAMRecord, for testing purposes only.
   *
   * @param read the SAMRecord to copy
   */
  public GATKSAMRecord(final SAMRecord read) {
    super(read.getHeader());
    super.setReferenceIndex(read.getReferenceIndex());
    super.setAlignmentStart(read.getAlignmentStart());
    super.setReadName(read.getReadName());
    super.setMappingQuality(read.getMappingQuality());
    // indexing bin done below
    super.setCigar(read.getCigar());
    super.setFlags(read.getFlags());
    super.setMateReferenceIndex(read.getMateReferenceIndex());
    super.setMateAlignmentStart(read.getMateAlignmentStart());
    super.setInferredInsertSize(read.getInferredInsertSize());
    SAMReadGroupRecord samRG = read.getReadGroup();
    SAMBinaryTagAndValue samAttr = GATKBin.getReadBinaryAttributes(read);
    if (samAttr == null) {
      clearAttributes();
    } else {
      setAttributes(samAttr);
    }
    if (samRG != null) {
      GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG);
      setReadGroup(rg);
    }

    super.setFileSource(read.getFileSource());
    super.setCigarString(read.getCigarString());
    super.setReadBases(read.getReadBases());
    super.setBaseQualities(read.getBaseQualities());
    // From SAMRecord constructor: Do this after the above because setCigarString will clear it.
    GATKBin.setReadIndexingBin(this, GATKBin.getReadIndexingBin(read));
  }
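
For context, here is a minimal sketch of how such a copy constructor might be exercised in a test. The header, read name, and coordinates below are illustrative, not taken from any real pipeline:

    // Illustrative only: build a bare SAMRecord, then wrap it so the GATK-specific
    // accessors become available.
    SAMFileHeader header = new SAMFileHeader();
    header.addSequence(new SAMSequenceRecord("chr1", 1_000_000));

    SAMRecord plainRead = new SAMRecord(header);
    plainRead.setReadName("test_read_1");
    plainRead.setReferenceIndex(0);
    plainRead.setAlignmentStart(100);
    plainRead.setCigarString("4M");
    plainRead.setReadBases("ACGT".getBytes());
    plainRead.setBaseQualities(new byte[] {30, 30, 30, 30});

    GATKSAMRecord gatkRead = new GATKSAMRecord(plainRead);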
Example #2
  @Test(dataProvider = "loadReadsADAM", groups = "spark")
  public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException {
    // Since the test requires that we not create the actual output directory in advance,
    // we instead create its parent directory and mark it for deletion on exit. This protects
    // us from naming collisions across multiple instances of the test suite.
    final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent");
    final File outputDirectory = new File(outputParentDirectory, outputDirectoryName);

    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null);
    SAMFileHeader header = ReadsSparkSource.getHeader(ctx, inputBam, null);

    ReadsSparkSink.writeReads(
        ctx, outputDirectory.getAbsolutePath(), rddParallelReads, header, ReadsWriteFormat.ADAM);

    JavaRDD<GATKRead> rddParallelReads2 =
        readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header);
    Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count());

    // Test the round trip
    List<GATKRead> samList = rddParallelReads.collect();
    List<GATKRead> adamList = rddParallelReads2.collect();
    Comparator<GATKRead> comparator = new ReadCoordinateComparator(header);
    samList.sort(comparator);
    adamList.sort(comparator);
    for (int i = 0; i < samList.size(); i++) {
      SAMRecord expected = samList.get(i).convertToSAMRecord(header);
      SAMRecord observed = adamList.get(i).convertToSAMRecord(header);
      // Manually test equality of some fields, as there are issues with the round trip
      // BAM -> ADAM -> BAM; see https://github.com/bigdatagenomics/adam/issues/823
      Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname");
      Assert.assertEquals(
          observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart");
      Assert.assertEquals(
          observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd");
      Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags");
      Assert.assertEquals(
          observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality");
      Assert.assertEquals(
          observed.getMateAlignmentStart(),
          expected.getMateAlignmentStart(),
          "getMateAlignmentStart");
      Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar");
    }
  }
Example #3
  /**
   * Returns a new qual array for the read that includes the BAQ adjustment. Does not support
   * on-the-fly BAQ calculation.
   *
   * @param read the SAMRecord to operate on
   * @param overwriteOriginalQuals if true, replace the original quality scores in the read with
   *     their BAQ'd version
   * @param useRawQualsIfNoBAQTag if true and there is no BAQ annotation, just use the raw quality
   *     scores; if false and no BAQ tag is present, an IllegalStateException is thrown
   * @return the BAQ-adjusted qualities, or the raw qualities when there is no BAQ tag and
   *     useRawQualsIfNoBAQTag is true
   */
  public static byte[] calcBAQFromTag(
      SAMRecord read, boolean overwriteOriginalQuals, boolean useRawQualsIfNoBAQTag) {
    byte[] rawQuals = read.getBaseQualities();
    byte[] newQuals = rawQuals;
    byte[] baq = getBAQTag(read);

    if (baq != null) {
      // Offset to base alignment quality (BAQ), of the same length as the read sequence.
      // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
      newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length];
      for (int i = 0; i < rawQuals.length; i++) {
        int rawQual = (int) rawQuals[i];
        int baqDelta = (int) baq[i] - 64;
        int newval = rawQual - baqDelta;
        if (newval < 0)
          throw new UserException.MalformedBAM(
              read, "BAQ tag error: the BAQ value is larger than the base quality");
        newQuals[i] = (byte) newval;
      }
    } else if (!useRawQualsIfNoBAQTag) {
      throw new IllegalStateException(
          "Required BAQ tag to be present, but none was on read " + read.getReadName());
    }

    return newQuals;
  }
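
To make the tag arithmetic concrete, a worked example with made-up numbers:

    // Illustrative values: a raw quality of 30 with a BQ tag byte of 70 gives a
    // delta of 70 - 64 = 6, so the BAQ-capped quality becomes 30 - 6 = 24. A tag
    // byte large enough to drive the result negative is reported as a malformed BAM.
    int rawQual = 30;
    int baqDelta = 70 - 64;
    int newQual = rawQual - baqDelta; // 24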
Example #4
  /** Note: this is the only getKey function that handles unmapped reads specially! */
  public static long getKey(final SAMRecord rec) {
    final int refIdx = rec.getReferenceIndex();
    final int start = rec.getAlignmentStart();

    if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) return getKey(refIdx, start);

    // Put unmapped reads at the end, but don't give them all the exact same
    // key so that they can be distributed to different reducers.
    //
    // A random number would probably be best, but to ensure that the same
    // record always gets the same key we use a fast hash instead.
    //
    // We avoid using hashCode(), because it's not guaranteed to have the
    // same value across different processes.

    int hash = 0;
    byte[] var;
    if ((var = rec.getVariableBinaryRepresentation()) != null) {
      // Undecoded BAM record: just hash its raw data.
      hash = (int) MurmurHash3.murmurhash3(var, hash);
    } else {
      // Decoded BAM record or any SAM record: hash a few representative
      // fields together.
      hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash);
      hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash);
    }
    return getKey0(Integer.MAX_VALUE, hash);
  }
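
The mapped-read branch delegates to a two-argument getKey(refIdx, start) that is not shown here. A plausible sketch, offered as an assumption rather than the actual implementation, packs both ints into a single long so keys sort by reference index and then by start; getKey0(Integer.MAX_VALUE, hash) then lands every unmapped read after all mapped ones:

  // Hypothetical packing for getKey(refIdx, start): reference index in the high
  // 32 bits, alignment start in the low 32 bits.
  public static long getKey(final int refIdx, final int alignmentStart) {
    return ((long) refIdx << 32) | (alignmentStart & 0xffffffffL);
  }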
Example #5
  private String getIdentifier(SAMRecord read) {
    String id = read.getReadName();

    if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
      id += "_2";
    }

    return id;
  }
  @Override
  public SAMRecordPair getNextReadPair() {
    // Insert the first read of a pair into the buffer, keyed by query name, and add its mate
    // when it arrives. Once an entry holds both reads, return the completed pair and remove the
    // entry so the buffer doesn't leak memory; otherwise keep reading. Returning pairs as they
    // complete should be MUCH faster than collecting everything first.

    if (iterator.hasNext()) {
      while (iterator.hasNext()) {
        SAMRecord record = iterator.next();
        countRead(record);
        // skip if the read is unmapped, not properly paired or mate is unmapped
        if (record.getReadUnmappedFlag()
            || !record.getProperPairFlag()
            || record.getMateUnmappedFlag()) {
          continue;
        }

        String query = record.getReadName();

        // check if read mate has been read already
        if (readBuffer.containsKey(query)) {
          // if it has then return the pair
          SAMRecordPair pair = readBuffer.get(query);
          pair.addPair(record);
          if (pair.bothPairsAligned() && pair.isValidPair()) {
            // prevent memory leak by deleting keys that are no longer needed
            readBuffer.remove(query);
            return pair;
          } else {
            throw new RuntimeException(query + " is not properly mated");
          }
        } else {
          // otherwise create an entry and store it by its query name
          SAMRecordPair pair = new SAMRecordPair();
          pair.addPair(record);
          readBuffer.put(query, pair);
        }
      }
    } else {
      if (readBuffer.size() > 0) {
        for (String key : readBuffer.keySet()) {
          logger.info("No mate found for " + key);
        }
        throw new RuntimeException(
            "No mates found for some reads please make sure all reads are properly paired");
      }
    }

    return null;
  }
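
A typical consumption loop for this method might look like the sketch below, assuming a null return signals the end of the input; pairReader stands in for whatever object exposes getNextReadPair:

    SAMRecordPair pair;
    while ((pair = pairReader.getNextReadPair()) != null) {
      // process the completed, properly mated pair here
    }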
Example #7
  // we need to pad ref by at least the bandwidth / 2 on either side
  public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
    // todo -- need to handle the case where the cigar sum of lengths doesn't cover the whole read
    Pair<Integer, Integer> queryRange = calculateQueryRange(read);
    if (queryRange == null) return null; // read has Ns, or is completely clipped away

    int queryStart = queryRange.getFirst();
    int queryEnd = queryRange.getSecond();

    BAQCalculationResult baqResult =
        calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);

    // cap quals
    int readI = 0, refI = 0;
    for (CigarElement elt : read.getCigar().getCigarElements()) {
      int l = elt.getLength();
      switch (elt.getOperator()) {
        case N: // cannot handle these
          return null;
        case H:
        case P: // ignore pads and hard clips
          break;
        case S:
          refI += l; // move the reference index too, then fall through to the I case
        case I:
          // todo -- is it really the case that we want to treat I and S the same?
          for (int i = readI; i < readI + l; i++) baqResult.bq[i] = baqResult.rawQuals[i];
          readI += l;
          break;
        case D:
          refI += l;
          break;
        case M:
          for (int i = readI; i < readI + l; i++) {
            int expectedPos = refI - refOffset + (i - readI);
            baqResult.bq[i] =
                capBaseByBAQ(
                    baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos);
          }
          readI += l;
          refI += l;
          break;
        default:
          throw new ReviewedGATKException(
              "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
      }
    }
    if (readI != read.getReadLength()) {
      // odd cigar string: fall back to the raw quality scores
      System.arraycopy(baqResult.rawQuals, 0, baqResult.bq, 0, baqResult.bq.length);
    }

    return baqResult;
  }
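
The comment at the top asks callers to pad the reference window by at least half the HMM bandwidth on each side. A hedged sketch of that caller-side preparation; bandWidth, contigLength, and refReader are assumed to be available, and the clamping is illustrative:

    int pad = bandWidth / 2;
    int start = Math.max(1, read.getAlignmentStart() - pad);
    int stop = Math.min(contigLength, read.getAlignmentEnd() + pad);
    byte[] ref = refReader.getSubsequenceAt(read.getReferenceName(), start, stop).getBases();
    // refOffset is negative when the window starts before the alignment, so that
    // expectedPos = refI - refOffset above indexes correctly into ref.
    int refOffset = start - read.getAlignmentStart();
    BAQCalculationResult result = calcBAQFromHMM(read, ref, refOffset);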
Example #8
  /**
   * Modifies the read in place so that the base quality scores are capped by the BAQ calculation.
   * Uses the BAQ tag if it is already present and recalculation is not forced; otherwise fires up
   * the HMM and does the BAQ on the fly, using the refReader to obtain the reference bases as
   * needed.
   *
   * @param read the read whose qualities should be capped
   * @param refReader source of reference bases for the on-the-fly BAQ calculation
   * @param calculationType whether BAQ is off, taken from the tag when available, or recalculated
   * @param qmode how to apply the result: add a tag, overwrite the quals, or leave the read alone
   * @return BQ qualities for use, in case qmode is DONT_MODIFY
   */
  public byte[] baqRead(
      SAMRecord read,
      IndexedFastaSequenceFile refReader,
      CalculationMode calculationType,
      QualityMode qmode) {
    if (DEBUG) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());

    // In general we are overwriting quals, so just get a pointer to them.
    byte[] BAQQuals = read.getBaseQualities();
    if (calculationType == CalculationMode.OFF || excludeReadFromBAQ(read)) {
      // we don't want to do anything: just fall through and return the raw quals
    } else {
      final boolean readHasBAQTag = hasBAQTag(read);

      if (calculationType == CalculationMode.RECALCULATE || !readHasBAQTag) {
        if (DEBUG) System.out.printf("  Calculating BAQ on the fly%n");
        BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader);
        if (hmmResult != null) {
          switch (qmode) {
            case ADD_TAG:
              addBAQTag(read, hmmResult.bq);
              break;
            case OVERWRITE_QUALS:
              System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
              break;
            case DONT_MODIFY:
              BAQQuals = hmmResult.bq;
              break;
            default:
              throw new ReviewedGATKException("BUG: unexpected qmode " + qmode);
          }
        } else if (readHasBAQTag) {
          // remove the BAQ tag if it's there because we cannot trust it
          read.setAttribute(BAQ_TAG, null);
        }
      } else if (qmode == QualityMode.OVERWRITE_QUALS) {
        // Taking BAQ from the tag only makes sense when we are overwriting the quals;
        // this overwrites the original qualities.
        if (DEBUG) System.out.printf("  Taking BAQ from tag%n");
        calcBAQFromTag(read, true, false);
      }
    }

    return BAQQuals;
  }
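
A hedged usage sketch: recalculating BAQ on the fly and overwriting the read's qualities. The fasta path is illustrative, and the surrounding instance (baq) is assumed:

    // Requires a .fai index next to the fasta; the constructor throws
    // FileNotFoundException if either file is missing.
    IndexedFastaSequenceFile refReader =
        new IndexedFastaSequenceFile(new File("/path/to/reference.fasta"));
    byte[] capped =
        baq.baqRead(read, refReader, CalculationMode.RECALCULATE, QualityMode.OVERWRITE_QUALS);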
Example #9
  /**
   * Determine the appropriate start and stop offsets in the read's bases, given the cigar string.
   *
   * @param read the read to compute the query range for
   * @return a pair of (inclusive start, exclusive stop) offsets of the bases to include in the
   *     BAQ calculation, or null if the read contains N operators or is completely clipped away
   */
  private final Pair<Integer, Integer> calculateQueryRange(SAMRecord read) {
    int queryStart = -1, queryStop = -1;
    int readI = 0;

    // Iterate over the cigar elements to determine the start and stop of the read bases for the
    // BAQ calculation.
    for (CigarElement elt : read.getCigar().getCigarElements()) {
      switch (elt.getOperator()) {
        case N:
          return null; // cannot handle these
        case H:
        case P:
        case D:
          break; // ignore pads, hard clips, and deletions
        case I:
        case S:
        case M:
        case EQ:
        case X:
          int prev = readI;
          readI += elt.getLength();
          if (includeClippedBases || elt.getOperator() != CigarOperator.S) {
            if (queryStart == -1) queryStart = prev;
            queryStop = readI;
          }
          // in the else case we aren't including soft clipped bases, so we don't update
          // queryStart or queryStop
          break;
        default:
          throw new ReviewedGATKException(
              "BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
      }
    }

    if (queryStop == queryStart) {
      // This read is completely clipped away, and yet is present in the file for some reason.
      // Usually such reads are flagged as non-PF, but it's possible to push them through the BAM.
      return null;
    }

    return new Pair<Integer, Integer>(queryStart, queryStop);
  }
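
To make the loop concrete: for a read with CIGAR 5S20M3S and includeClippedBases disabled, only the 20 M bases count, so the method returns Pair(5, 25); with includeClippedBases enabled, the soft clips count as well and the result is Pair(0, 28).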
Example #10
  /**
   * Returns the BAQ adjusted quality score for this read at this offset. Does not support
   * on-the-fly BAQ calculation.
   *
   * @param read the SAMRecord to operate on
   * @param offset the offset to operate on
   * @param useRawQualsIfNoBAQTag if true and there is no BAQ annotation, just use the raw quality
   *     score; if false and no BAQ tag is present, an IllegalStateException is thrown
   * @return the BAQ-adjusted quality at this offset, or the raw quality when there is no BAQ tag
   *     and useRawQualsIfNoBAQTag is true
   */
  public static byte calcBAQFromTag(SAMRecord read, int offset, boolean useRawQualsIfNoBAQTag) {
    byte rawQual = read.getBaseQualities()[offset];
    byte newQual = rawQual;
    byte[] baq = getBAQTag(read);

    if (baq != null) {
      // Offset to base alignment quality (BAQ), of the same length as the read sequence.
      // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
      int baqDelta = (int) baq[offset] - 64;
      int newval = rawQual - baqDelta;
      if (newval < 0)
        throw new UserException.MalformedBAM(
            read, "BAQ tag error: the BAQ value is larger than the base quality");
      newQual = (byte) newval;

    } else if (!useRawQualsIfNoBAQTag) {
      throw new IllegalStateException(
          "Required BAQ tag to be present, but none was on read " + read.getReadName());
    }

    return newQual;
  }
  public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {

    long numStartClippedBases = 0;
    long numEndClippedBases = 0;

    byte[] unclippedReadQuals = read.getBaseQualities();
    byte[] unclippedReadBases = read.getReadBases();

    // Do a stricter base clipping than provided by the CIGAR string, since that one may be too
    // conservative and may leave a string of Q2 bases still hanging off the reads.
    for (int i = 0; i < read.getReadLength(); i++) {
      if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numStartClippedBases++;
      else break;
    }
    for (int i = read.getReadLength() - 1; i >= 0; i--) {
      if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) numEndClippedBases++;
      else break;
    }
    if (numStartClippedBases + numEndClippedBases >= read.getReadLength()) {
      return 0; // the entire read is clipped away
    }
    byte[] readBases =
        Arrays.copyOfRange(
            unclippedReadBases,
            (int) numStartClippedBases,
            (int) (unclippedReadBases.length - numEndClippedBases));

    byte[] readQuals =
        Arrays.copyOfRange(
            unclippedReadQuals,
            (int) numStartClippedBases,
            (int) (unclippedReadQuals.length - numEndClippedBases));

    int readLength = readBases.length;

    // initialize path metric and traceback memories for Viterbi computation
    pathMetricArray = new double[readLength + 1][PATH_METRIC_TABLE_LENGTH];
    bestStateIndexArray = new int[readLength + 1][PATH_METRIC_TABLE_LENGTH];

    for (int k = 1; k < PATH_METRIC_TABLE_LENGTH; k++) pathMetricArray[0][k] = 0;

    // Update path metric computations based on the branch metric (Add/Compare/Select operations).
    // Do the forward direction first, i.e. from the anchor to the end of the read.
    for (int indR = 0; indR < readLength; indR++) {
      byte readBase = readBases[indR];
      byte readQual = readQuals[indR];

      for (int indX = LEFT_ALIGN_INDEX; indX <= RIGHT_ALIGN_INDEX; indX++) {

        byte haplotypeBase;
        if (indX > LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX)
          haplotypeBase = haplotype.getBases()[indX - 1];
        else haplotypeBase = readBase;

        updatePathMetrics(haplotypeBase, indX, indR, readBase, readQual);
      }
    }

    // For debugging only: compute the backtracking to find the optimal route through the trellis.
    // Since we're only interested in the log-likelihood of the best state, this isn't really
    // necessary.
    double bestMetric = MathUtils.arrayMin(pathMetricArray[readLength]);

    if (DEBUG) {

      System.out.println(read.getReadName());
      System.out.print("Haplotype:");

      for (int k = 0; k < haplotype.getBases().length; k++) {
        System.out.format("%c ", haplotype.getBases()[k]);
      }
      System.out.println();

      System.out.print("Read bases: ");
      for (int k = 0; k < readBases.length; k++) {
        System.out.format("%c ", readBases[k]);
      }
      System.out.println();

      System.out.print("Read quals: ");
      for (int k = 0; k < readQuals.length; k++) {
        System.out.format("%d ", (int) readQuals[k]);
      }
      System.out.println();

      // start from last position of read, go backwards to find optimal alignment
      int[] bestIndexArray = new int[readLength];
      int bestIndex = MathUtils.minElementIndex(pathMetricArray[readLength]);
      bestIndexArray[readLength - 1] = bestIndex;

      for (int k = readLength - 2; k >= 0; k--) {
        bestIndex = bestStateIndexArray[k][bestIndex];
        bestIndexArray[k] = bestIndex;
      }

      System.out.print("Alignment: ");
      for (int k = 0; k < readBases.length; k++) {
        System.out.format("%d ", bestIndexArray[k]);
      }
      System.out.println();
    }
    // now just take optimum along all path metrics: that's the log likelihood of best alignment
    if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric);
    return bestMetric;
  }
Example #12
  /**
   * Test that PG header records are created & chained appropriately (or not created), and that the
   * PG record chains are as expected. MarkDuplicates is used both to merge and to mark dupes in
   * this case.
   *
   * @param suppressPg If true, do not create PG header record.
   * @param expectedPnVnByReadName For each read, info about the expected chain of PG records.
   */
  @Test(dataProvider = "pgRecordChainingTest")
  public void pgRecordChainingTest(
      final boolean suppressPg, final Map<String, List<ExpectedPnAndVn>> expectedPnVnByReadName) {
    final File outputDir = IOUtil.createTempDir(TEST_BASE_NAME + ".", ".tmp");
    outputDir.deleteOnExit();
    try {
      // Run MarkDuplicates, merging the 3 input files, and either enabling or suppressing PG header
      // record creation according to suppressPg.
      final MarkDuplicates markDuplicates = new MarkDuplicates();
      final ArrayList<String> args = new ArrayList<String>();
      for (int i = 1; i <= 3; ++i) {
        args.add("INPUT=" + new File(TEST_DATA_DIR, "merge" + i + ".sam").getAbsolutePath());
      }
      final File outputSam = new File(outputDir, TEST_BASE_NAME + ".sam");
      args.add("OUTPUT=" + outputSam.getAbsolutePath());
      args.add(
          "METRICS_FILE="
              + new File(outputDir, TEST_BASE_NAME + ".duplicate_metrics").getAbsolutePath());
      if (suppressPg) args.add("PROGRAM_RECORD_ID=null");

      // I generally prefer to call doWork rather than invoking the argument parser, but it is
      // necessary in this case to initialize the command line. Note that for the unit test, the
      // version won't come through, because it is obtained from the jar manifest and the unit
      // test doesn't run code from a jar.
      Assert.assertEquals(markDuplicates.instanceMain(args.toArray(new String[args.size()])), 0);

      // Read the MarkDuplicates output file, and get the PG ID for each read. In this particular
      // test, the PG ID should be the same for both ends of a pair.
      final SamReader reader = SamReaderFactory.makeDefault().open(outputSam);

      final Map<String, String> pgIdForReadName = new HashMap<String, String>();
      for (final SAMRecord rec : reader) {
        final String existingPgId = pgIdForReadName.get(rec.getReadName());
        final String thisPgId = rec.getStringAttribute(SAMTag.PG.name());
        if (existingPgId != null) {
          Assert.assertEquals(thisPgId, existingPgId);
        } else {
          pgIdForReadName.put(rec.getReadName(), thisPgId);
        }
      }
      final SAMFileHeader header = reader.getFileHeader();
      CloserUtil.close(reader);

      // Confirm that for each read name, the chain of PG records contains exactly the number of
      // records expected, and that the values in the PG chain are as expected.
      for (final Map.Entry<String, List<ExpectedPnAndVn>> entry :
          expectedPnVnByReadName.entrySet()) {
        final String readName = entry.getKey();
        final List<ExpectedPnAndVn> expectedList = entry.getValue();
        String pgId = pgIdForReadName.get(readName);
        for (final ExpectedPnAndVn expected : expectedList) {
          final SAMProgramRecord programRecord = header.getProgramRecord(pgId);
          if (expected.expectedPn != null)
            Assert.assertEquals(programRecord.getProgramName(), expected.expectedPn);
          if (expected.expectedVn != null)
            Assert.assertEquals(programRecord.getProgramVersion(), expected.expectedVn);
          pgId = programRecord.getPreviousProgramGroupId();
        }
        Assert.assertNull(pgId);
      }

    } finally {
      TestUtil.recursiveDelete(outputDir);
    }
  }
Example #13
  @Test
  public void testWithIndividualReadBarcodes() {
    final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
    final String readNameOne = "RUNID:1:1:15993:13361";
    final String readNameTwo = "RUNID:2:2:15993:13362";
    final String readNameThree = "RUNID:3:3:15993:13362";

    // All three pairs share the main barcode; the first two pairs also share the second-end
    // barcode, while the third pair has a different barcode on its second end.
    tester.addMatePair(
        readNameOne,
        2,
        41212324,
        41212310,
        false,
        false,
        false,
        false,
        "33S35M",
        "19S49M",
        true,
        true,
        false,
        false,
        false,
        DEFAULT_BASE_QUALITY);
    tester.addMatePair(
        readNameTwo,
        2,
        41212324,
        41212310,
        false,
        false,
        true,
        true,
        "33S35M",
        "19S49M",
        true,
        true,
        false,
        false,
        false,
        DEFAULT_BASE_QUALITY); // same barcode as the first
    tester.addMatePair(
        readNameThree,
        2,
        41212324,
        41212310,
        false,
        false,
        false,
        false,
        "33S35M",
        "19S49M",
        true,
        true,
        false,
        false,
        false,
        DEFAULT_BASE_QUALITY);

    final String barcodeTag = "BC";
    final String readOneBarcodeTag = "BX"; // using the same tag for both ends is allowed
    final String readTwoBarcodeTag = "BX";
    for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) {
      record.setAttribute(barcodeTag, "Barcode1"); // same barcode
      if (record.getFirstOfPairFlag()) { // always the same value for the first end
        record.setAttribute(readOneBarcodeTag, "readOne1");
      } else { // second end
        if (record.getReadName().equals(readNameOne) || record.getReadName().equals(readNameTwo)) {
          record.setAttribute(readTwoBarcodeTag, "readTwo1");
        } else if (record.getReadName().equals(readNameThree)) {
          record.setAttribute(readTwoBarcodeTag, "readTwo2");
        }
      }
    }
    tester.addArg("BARCODE_TAG=" + barcodeTag);
    tester.addArg("READ_ONE_BARCODE_TAG=" + readOneBarcodeTag);
    tester.addArg("READ_TWO_BARCODE_TAG=" + readTwoBarcodeTag);

    tester.runTest();
  }
Example #14
  @Test
  public void testWithBarcodeComplex() {
    final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester();
    final String readNameOne = "RUNID:1:1:15993:13361";
    final String readNameTwo = "RUNID:2:2:15993:13362";
    final String readNameThree = "RUNID:3:3:15993:13362";

    // first two reads have the same barcode, third read has a different barcode
    tester.addMatePair(
        readNameOne,
        2,
        41212324,
        41212310,
        false,
        false,
        false,
        false,
        "33S35M",
        "19S49M",
        true,
        true,
        false,
        false,
        false,
        DEFAULT_BASE_QUALITY);
    tester.addMatePair(
        readNameTwo,
        2,
        41212324,
        41212310,
        false,
        false,
        true,
        true,
        "33S35M",
        "19S49M",
        true,
        true,
        false,
        false,
        false,
        DEFAULT_BASE_QUALITY); // same barcode as the first
    tester.addMatePair(
        readNameThree,
        2,
        41212324,
        41212310,
        false,
        false,
        false,
        false,
        "33S35M",
        "19S49M",
        true,
        true,
        false,
        false,
        false,
        DEFAULT_BASE_QUALITY);

    final String barcodeTag = "BC";
    for (final SAMRecord record : new IterableAdapter<SAMRecord>(tester.getRecordIterator())) {
      if (record.getReadName().equals(readNameOne) || record.getReadName().equals(readNameTwo)) {
        record.setAttribute(barcodeTag, "Barcode1");
      } else if (record.getReadName().equals(readNameThree)) {
        record.setAttribute(barcodeTag, "Barcode2");
      }
    }
    tester.addArg("BARCODE_TAG=" + barcodeTag);
    tester.runTest();
  }
  @Override
  public void execute() {
    log.info("Initializing kmer code map...");
    Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
    kmerCodeIndices.put('0', 1);
    kmerCodeIndices.put('A', 3);
    kmerCodeIndices.put('B', 4);
    kmerCodeIndices.put('C', 5);
    kmerCodeIndices.put('_', 6);
    kmerCodeIndices.put('.', 7);
    kmerCodeIndices.put('1', 9);

    Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
    kmerCodeNames.put('0', "ref0");
    kmerCodeNames.put('A', "repetitive");
    kmerCodeNames.put('B', "both");
    kmerCodeNames.put('C', "lowcoverage");
    kmerCodeNames.put('_', "lowconfidence");
    kmerCodeNames.put('.', "novel");
    kmerCodeNames.put('1', "ref1");

    if (KMER_CODE_NAMES != null) {
      for (Character c : kmerCodeNames.keySet()) {
        String cStr = String.valueOf(c);
        if (KMER_CODE_NAMES.containsKey(cStr)) {
          kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
        }
      }
    }

    for (Character c : kmerCodeNames.keySet()) {
      log.info("  {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
    }

    log.info("Loading annotated contigs...");
    Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
    int kmerSize = 0;

    if (ANN.length() > 0) {
      TableReader tr = new TableReader(ANN);
      for (Map<String, String> te : tr) {
        String contigName = te.get("contigName");

        if (kmerSize == 0) {
          kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
        }

        annotatedContigs.put(contigName, te);

        String[] ref0ToCanonicalExact =
            (te.get("ref0ToCanonicalExact").equals("NA")
                        || te.get("ref0ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref0ToCanonicalExact"))
                .split("[:-]");
        String[] ref1ToCanonicalExact =
            (te.get("ref1ToCanonicalExact").equals("NA")
                        || te.get("ref1ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref1ToCanonicalExact"))
                .split("[:-]");

        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref0ToCanonicalExact[0]
                + " "
                + ref0ToCanonicalExact[1]
                + " "
                + ref0ToCanonicalExact[2]
                + " radius1=0.8r");
        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref1ToCanonicalExact[0]
                + " "
                + ref1ToCanonicalExact[1]
                + " "
                + ref1ToCanonicalExact[2]
                + " radius2=0.6r");
      }
    }

    log.info("    contigs: {}", annotatedContigs.size());
    log.info("  kmer size: {}", kmerSize);

    log.info("Computing kmer inheritance information...");

    SAMFileHeader sfh = CONTIGS.getFileHeader();
    for (Character c : kmerCodeNames.keySet()) {
      SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
      rgr.setSample(kmerCodeNames.get(c));
      sfh.addReadGroup(rgr);
    }

    SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
    sfwf.setCreateIndex(true);
    SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

    TableWriter tw = new TableWriter(sout);

    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;
    for (SAMRecord contig : CONTIGS) {
      if (CONTIG_NAMES == null
          || CONTIG_NAMES.isEmpty()
          || CONTIG_NAMES.contains(contig.getReadName())) {
        Map<String, String> te = annotatedContigs.get(contig.getReadName());

        if (annotatedContigs.containsKey(contig.getReadName())) {
          String seq = contig.getReadString();

          // log.debug("  te: {}", te);

          String annSeq = te.get("seq");
          String kmerOrigin = te.get("kmerOrigin");

          Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
          for (int i = 0; i < kmerOrigin.length(); i++) {
            CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
            Character code = kmerOrigin.charAt(i);

            kmerCodes.put(kmer, code);
          }

          Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
          for (Character c : kmerCodeNames.keySet()) {
            kmerStats.put(c, 0);
          }

          boolean changed = false;

          // We want to be able to examine soft-clipped regions as well.
          List<CigarElement> ces = new ArrayList<CigarElement>();
          for (CigarElement ce : contig.getCigar().getCigarElements()) {
            if (ce.getOperator().equals(CigarOperator.S)) {
              ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
              changed = true;
            } else {
              ces.add(ce);
            }
          }

          if (changed) {
            CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

            if (firstCe.getOperator().equals(CigarOperator.S)) {
              contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
            }

            contig.setCigar(new Cigar(ces));
          }

          for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
            for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
              if (i + kmerSize < seq.length()) {
                CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

                SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
                skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

                List<CigarElement> cigarElements = new ArrayList<CigarElement>();
                cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
                Cigar cigar = new Cigar(cigarElements);

                skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
                skmer.setReferenceName(contig.getReferenceName());
                skmer.setCigar(cigar);
                skmer.setReadPairedFlag(false);
                skmer.setDuplicateReadFlag(false);
                skmer.setMateNegativeStrandFlag(false);
                skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);

                Character c = kmerCodes.get(kmer);
                String codeName = kmerCodeNames.get(c);

                String parentReadGroupId = null;
                String sampleReadGroupId = null;
                for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                  if (rgr.getSample().equals(codeName)) {
                    parentReadGroupId = rgr.getReadGroupId();
                  }

                  if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                    sampleReadGroupId = rgr.getReadGroupId();
                  }
                }

                skmer.setAttribute(
                    "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
                skmer.setMappingQuality(99);

                sfw.addAlignment(skmer);

                kmerStats.put(c, kmerStats.get(c) + 1);

                IGVEntry igvEntry = new IGVEntry();
                igvEntry.chromosome = contig.getReferenceName();
                igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
                igvEntry.parentageName = kmerCodeNames.get(c);
                igvEntry.parentage = kmerCodeIndices.get(c);
                igvEntries.add(igvEntry);
              }
            }
          }

          if (!contig.isSecondaryOrSupplementary()) {
            beout.println(
                contig.getReferenceName()
                    + "\t"
                    + contig.getAlignmentStart()
                    + "\t"
                    + contig.getAlignmentEnd()
                    + "\t"
                    + contig.getReadName()
                    + "."
                    + contig.getReadGroup().getSample());

            if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
              log.info("  processed {}/{} contigs", numContigs, annotatedContigs.size());
            }
            numContigs++;
          }

          Map<String, String> stats = new LinkedHashMap<String, String>();
          stats.put("contigName", contig.getReadName());
          stats.put("sampleName", contig.getReadGroup().getSample());
          for (Character c : kmerCodeNames.keySet()) {
            stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
          }
          tw.addEntry(stats);
        }
      }
    }

    log.info("Writing kmer inheritance information...");
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry igvEntry : igvEntries) {
      out.printf(
          "%s\t%d\t%d\t%s\t%d\n",
          igvEntry.chromosome,
          igvEntry.start,
          igvEntry.start + 1,
          igvEntry.parentageName,
          igvEntry.parentage);
    }

    sfw.close();
  }