Example #1
  /**
   * Calculates the reference coordinate for the end of the read taking into account soft clips but
   * not hard clips.
   *
   * <p>Note: getUnclippedEnd() adds soft and hard clips, whereas this function only adds soft
   * clips.
   *
   * @param read the read
   * @param cigar the read's cigar
   *     <p>Note: this overload of the function takes the cigar as input for speed because getCigar
   *     is an expensive operation. Most callers should use the overload that does not take the
   *     cigar.
   * @return the unclipped end of the read taking soft clips (but not hard clips) into account
   */
  public static int getSoftEnd(final GATKRead read, final Cigar cigar) {
    Utils.nonNull(read, "read");
    Utils.nonNull(cigar, "cigar");

    boolean foundAlignedBase = false;
    int softEnd = read.getEnd();
    final List<CigarElement> cigs = cigar.getCigarElements();
    for (int i = cigs.size() - 1; i >= 0; --i) {
      final CigarElement cig = cigs.get(i);
      final CigarOperator op = cig.getOperator();

      // assumes the soft clip that we found is at the end of the aligned read
      if (op == CigarOperator.SOFT_CLIP) {
        softEnd += cig.getLength();
      } else if (op != CigarOperator.HARD_CLIP) {
        foundAlignedBase = true;
        break;
      }
    }
    if (!foundAlignedBase) {
      // for example 64H14S: the soft end is actually the same as the alignment end
      softEnd = read.getEnd();
    }
    return softEnd;
  }
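For illustration, here is a minimal, self-contained sketch of the same trailing-soft-clip arithmetic using only htsjdk's Cigar classes; the GATKRead is replaced by a plain alignment-end coordinate, and the cigar is built explicitly rather than taken from a read.

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import java.util.Arrays;
import java.util.List;

public final class SoftEndSketch {

  // Mirrors getSoftEnd: extend the alignment end by any trailing soft clip,
  // falling back to the alignment end when no aligned base is found (e.g. 64H14S).
  static int softEnd(final int alignmentEnd, final Cigar cigar) {
    int softEnd = alignmentEnd;
    boolean foundAlignedBase = false;
    final List<CigarElement> elements = cigar.getCigarElements();
    for (int i = elements.size() - 1; i >= 0; --i) {
      final CigarOperator op = elements.get(i).getOperator();
      if (op == CigarOperator.SOFT_CLIP) {
        softEnd += elements.get(i).getLength();
      } else if (op != CigarOperator.HARD_CLIP) {
        foundAlignedBase = true;
        break;
      }
    }
    return foundAlignedBase ? softEnd : alignmentEnd;
  }

  public static void main(final String[] args) {
    // A read whose alignment ends at 109 (10M) with 4 soft-clipped bases after it: 10M4S.
    final Cigar cigar =
        new Cigar(
            Arrays.asList(new CigarElement(10, CigarOperator.M), new CigarElement(4, CigarOperator.S)));
    System.out.println(softEnd(109, cigar)); // prints 113
  }
}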
Example #2
  /**
   * Combines the right table into the left table, in-place (without making a copy)
   *
   * @param left first table to combine
   * @param right second table to combine
   * @return modified version of left with the contents of right incorporated into it
   */
  public static RecalibrationTables inPlaceCombine(
      final RecalibrationTables left, final RecalibrationTables right) {
    Utils.nonNull(left);
    Utils.nonNull(right);

    return left.combine(right);
  }
 /**
  * Do the full tangent normalization process given proportional coverage data.
  *
  * <p>This includes:
  *
  * <ul>
  *   <li>normalization by target factors
  *   <li>projection of the normalized coverage profile into the hyperplane from the PoN
  * </ul>
  *
  * @param pon -- never {@code null}
  * @param pcov -- never {@code null}. Must contain data for at least one sample.
  * @param ctx spark context. Use {@code null} if no context is available
  * @return never {@code null}
  */
 public static TangentNormalizationResult tangentNormalizePcov(
     final PoN pon, final ReadCountCollection pcov, final JavaSparkContext ctx) {
   Utils.nonNull(pon, "PoN cannot be null.");
   Utils.nonNull(pcov, "input pcov read counts cannot be null when creating a coverage profile.");
   ParamUtils.isPositive(
       pcov.columnNames().size(), "input cov profile column names cannot be an empty list.");
   final ReadCountCollection coverageProfile = createCoverageProfile(pon, pcov);
   return TangentNormalizer.tangentNormalize(pon, coverageProfile, ctx);
 }
Example #4
  /**
   * Set the base qualities from a string of ASCII encoded values
   *
   * @param read read whose base qualities should be set
   * @param baseQualityString ASCII encoded (encoded as a FASTQ string) values of base qualities.
   */
  public static void setBaseQualityString(final GATKRead read, final String baseQualityString) {
    Utils.nonNull(read);
    Utils.nonNull(baseQualityString);

    if (SAMRecord.NULL_QUALS_STRING.equals(baseQualityString)) {
      read.setBaseQualities(SAMRecord.NULL_QUALS);
    } else {
      read.setBaseQualities(SAMUtils.fastqToPhred(baseQualityString));
    }
  }
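As a small illustration of the FASTQ-style encoding these helpers rely on, the following standalone sketch round-trips a quality string through htsjdk's SAMUtils; the read itself is omitted, so this demonstrates only the encoding, not the GATKRead setter.

import htsjdk.samtools.SAMUtils;
import java.util.Arrays;

public final class BaseQualityEncodingSketch {
  public static void main(final String[] args) {
    // Each character encodes Phred quality + 33, so 'I' -> 40, 'F' -> 37, '#' -> 2.
    final String fastq = "IIIIFFFF##";
    final byte[] phred = SAMUtils.fastqToPhred(fastq);
    final String roundTrip = SAMUtils.phredToFastq(phred);

    System.out.println(Arrays.toString(phred)); // [40, 40, 40, 40, 37, 37, 37, 37, 2, 2]
    System.out.println(roundTrip.equals(fastq)); // true
  }
}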
 /**
  * Creates a new target-out-info enum value given the composer function reference and the list of
  * output column names.
  *
  * @param composer the composer lambda reference.
  * @param headerNames list of info column names in the order in which they are going to be laid
  *     out in output files.
  * @throws IllegalArgumentException if {@code composer} or {@code headerNames} are {@code null},
  *     or {@code headerNames} contains a {@code null}.
  */
 TargetOutInfo(final Composer composer, final String... headerNames) {
   this.composer = Utils.nonNull(composer, "the info string composer cannot be null");
   this.headerNames =
       Collections.unmodifiableList(
           Arrays.asList(
               Utils.nonNull(headerNames, "the header name list provided cannot be null")));
   if (this.headerNames.stream().anyMatch(Objects::isNull)) {
     throw new IllegalArgumentException("the input header-name cannot contain nulls");
   }
 }
Example #6
  /**
   * Combines the two tables into a new table (allocating a new table in the process)
   *
   * @param left first table to combine
   * @param right second table to combine
   * @return a new table with the merged contents of left and right
   */
  public static RecalibrationTables safeCombine(
      final RecalibrationTables left, final RecalibrationTables right) {
    Utils.nonNull(left);
    Utils.nonNull(right);

    final RecalibrationTables newTable =
        new RecalibrationTables(left.covariates, left.numReadGroups);
    newTable.combine(left);
    newTable.combine(right);
    return newTable;
  }
Example #7
  /**
   * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the
   * factory has been set to create an index, the header has been set to coordinate sorted.
   *
   * @param outputFile if this file has a .cram extension then a reference is required. Cannot be
   *     null.
   * @param referenceFile the reference source to use. Cannot be null if the output file has a
   *     .cram extension.
   * @param header header to be used for the output writer
   * @param preSorted if true then records must already be sorted to match the header sort order
   * @param factory SAMFileWriterFactory factory to use
   * @return SAMFileWriter
   */
  public static SAMFileWriter createCommonSAMWriterFromFactory(
      final SAMFileWriterFactory factory,
      final File outputFile,
      final File referenceFile,
      final SAMFileHeader header,
      final boolean preSorted) {
    Utils.nonNull(outputFile);
    Utils.nonNull(header);

    if (null == referenceFile && outputFile.getName().endsWith(CramIO.CRAM_FILE_EXTENSION)) {
      throw new UserException("A reference file is required for writing CRAM files");
    }

    return factory.makeWriter(header.clone(), preSorted, outputFile, referenceFile);
  }
Example #8
  /**
   * Calculates the reference coordinate for the beginning of the read taking into account soft
   * clips but not hard clips.
   *
   * <p>Note: getUnclippedStart() adds soft and hard clips, whereas this function only adds soft
   * clips.
   *
   * @param read the read
   * @param cigar the read's cigar
   *     <p>Note: this overload of the function takes the cigar as input for speed because getCigar
   *     is an expensive operation. Most callers should use the overload that does not take the
   *     cigar.
   * @return the unclipped start of the read taking soft clips (but not hard clips) into account
   */
  public static int getSoftStart(final GATKRead read, final Cigar cigar) {
    Utils.nonNull(read, "read");
    Utils.nonNull(cigar, "cigar");

    int softStart = read.getStart();
    for (final CigarElement cig : cigar.getCigarElements()) {
      final CigarOperator op = cig.getOperator();

      if (op == CigarOperator.SOFT_CLIP) {
        softStart -= cig.getLength();
      } else if (op != CigarOperator.HARD_CLIP) {
        break;
      }
    }
    return softStart;
  }
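The mirror-image arithmetic for leading soft clips can be exercised the same way; this short companion to the soft-end sketch above walks the cigar from the front and subtracts leading soft-clip lengths from an assumed alignment start of 105.

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import java.util.Arrays;

public final class SoftStartSketch {
  public static void main(final String[] args) {
    // A read with 5 leading soft-clipped bases whose alignment starts at 105: 5S10M.
    final Cigar cigar =
        new Cigar(
            Arrays.asList(new CigarElement(5, CigarOperator.S), new CigarElement(10, CigarOperator.M)));

    int softStart = 105; // stands in for read.getStart()
    for (final CigarElement cig : cigar.getCigarElements()) {
      final CigarOperator op = cig.getOperator();
      if (op == CigarOperator.SOFT_CLIP) {
        softStart -= cig.getLength();
      } else if (op != CigarOperator.HARD_CLIP) {
        break;
      }
    }
    System.out.println(softStart); // prints 100
  }
}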
Example #9
 /**
  * Returns the base qualities for the read as a string.
  *
  * @param read read whose base qualities should be returned
  * @return Base qualities string as printable ASCII values (encoded as a FASTQ string).
  */
 public static String getBaseQualityString(final GATKRead read) {
   Utils.nonNull(read);
   if (Arrays.equals(SAMRecord.NULL_QUALS, read.getBaseQualities())) {
     return SAMRecord.NULL_QUALS_STRING;
   }
   return SAMUtils.phredToFastq(read.getBaseQualities());
 }
  private static ReadCountCollection createCoverageProfile(
      final PoN pon, final ReadCountCollection inputReadCounts) {
    Utils.nonNull(pon, "PoN cannot be null.");
    Utils.nonNull(
        inputReadCounts, "input read counts cannot be null when creating a coverage profile.");
    ParamUtils.isPositive(
        inputReadCounts.columnNames().size(),
        "inputReadCounts column names cannot be an empty list.");
    final Case2PoNTargetMapper targetMapper =
        new Case2PoNTargetMapper(inputReadCounts.targets(), pon.getTargetNames());
    final RealMatrix inputCounts = targetMapper.fromCaseToPoNCounts(inputReadCounts.counts());
    final RealMatrix targetNormalizedCounts = pon.factorNormalization(inputCounts);

    return targetMapper.fromPoNtoCaseCountCollection(
        targetNormalizedCounts, inputReadCounts.columnNames());
  }
  @Override
  protected AFCalculationResult computeLog10PNonRef(
      final VariantContext vc,
      final int defaultPloidy,
      final double[] log10AlleleFrequencyPriors,
      final StateTracker stateTracker) {
    Utils.nonNull(vc, "vc is null");
    Utils.nonNull(log10AlleleFrequencyPriors, "log10AlleleFrequencyPriors is null");
    Utils.nonNull(stateTracker, "stateTracker is null");
    final int numAlternateAlleles = vc.getNAlleles() - 1;

    final List<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
    final int numSamples = genotypeLikelihoods.size() - 1;
    final int numChr = 2 * numSamples;

    // queue of AC conformations to process
    final Deque<ExactACset> ACqueue = new LinkedList<>();

    // mapping of ExactACset indexes to the objects
    final Map<ExactACcounts, ExactACset> indexesToACset = new HashMap<>(numChr + 1);

    // add AC=0 to the queue
    final int[] zeroCounts = new int[numAlternateAlleles];
    final ExactACset zeroSet = new ExactACset(numSamples + 1, new ExactACcounts(zeroCounts));
    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.getACcounts(), zeroSet);

    while (!ACqueue.isEmpty()) {

      // compute log10Likelihoods
      final ExactACset set = ACqueue.remove();

      calculateAlleleCountConformation(
          set,
          genotypeLikelihoods,
          numChr,
          ACqueue,
          indexesToACset,
          log10AlleleFrequencyPriors,
          stateTracker);

      // clean up memory
      indexesToACset.remove(set.getACcounts());
    }

    return getResultFromFinalState(vc, log10AlleleFrequencyPriors, stateTracker);
  }
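The method above walks allele-count conformations with a work queue plus an index map that is pruned as soon as a conformation has been processed. The sketch below illustrates only that bookkeeping pattern on toy integer-count states; the actual likelihood math (calculateAlleleCountConformation, getResultFromFinalState) is deliberately replaced by a print statement, so this is a structural illustration, not GATK's AF calculation.

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

public final class ConformationQueueSketch {

  // Value-equality wrapper so int[] allele-count vectors can live in a HashSet.
  private static final class Counts {
    private final int[] counts;
    Counts(final int[] counts) { this.counts = counts.clone(); }
    @Override public boolean equals(final Object o) {
      return o instanceof Counts && Arrays.equals(counts, ((Counts) o).counts);
    }
    @Override public int hashCode() { return Arrays.hashCode(counts); }
    @Override public String toString() { return Arrays.toString(counts); }
  }

  public static void main(final String[] args) {
    final int numAlternateAlleles = 2;
    final int numChr = 4; // upper bound on the total alternate allele count

    final Deque<Counts> acQueue = new ArrayDeque<>();
    // The real code keeps a Map from counts to the richer ExactACset; a Set suffices here.
    final Set<Counts> inFlight = new HashSet<>();

    // Seed with the AC=0 conformation, as computeLog10PNonRef does.
    final Counts zero = new Counts(new int[numAlternateAlleles]);
    acQueue.add(zero);
    inFlight.add(zero);

    while (!acQueue.isEmpty()) {
      final Counts set = acQueue.remove();

      // Stand-in for calculateAlleleCountConformation: do the per-state work
      // and enqueue any successor conformations that have not been seen yet.
      System.out.println("processing " + set);
      for (int allele = 0; allele < numAlternateAlleles; allele++) {
        final int[] next = set.counts.clone();
        next[allele]++;
        if (Arrays.stream(next).sum() <= numChr) {
          final Counts candidate = new Counts(next);
          if (inFlight.add(candidate)) {
            acQueue.add(candidate);
          }
        }
      }

      inFlight.remove(set); // clean up memory once the conformation is fully processed
    }
  }
}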
  /**
   * Returns the enum value that corresponds to a given calculator implementation class.
   *
   * @param clazz the target class.
   * @throws IllegalArgumentException if {@code clazz} is {@code null} or if it is abstract.
   * @return never {@code null}.
   */
  public static AFCalculatorImplementation fromCalculatorClass(
      final Class<? extends AFCalculator> clazz) {
    Utils.nonNull(clazz, "input class cannot be null");
    Utils.validateArg(
        !Modifier.isAbstract(clazz.getModifiers()),
        "class " + clazz.getCanonicalName() + " should not be abstract");

    // Using iteration instead of a static map to avoid static state.
    for (final AFCalculatorImplementation impl : AFCalculatorImplementation.values()) {
      if (clazz.equals(impl.newInstance().getClass())) {
        return impl;
      }
    }
    throw new IllegalArgumentException(
        "Attempt to retrieve AFCalculatorImplementation instance from a non-registered calculator class "
            + clazz.getName());
  }
 /**
  * Constructs the target collection from a target file passed by the user.
  *
  * @return never {@code null}.
  */
 private TargetCollection<Target> resolveTargetsFromFile() {
   Utils.regularReadableUserFile(targetsFile);
   logger.log(
       Level.INFO,
       String.format(
           "Reading target intervals from targets file '%s' ...", targetsFile.getAbsolutePath()));
   final List<Target> targets = TargetTableReader.readTargetFile(targetsFile);
   return new HashedListTargetCollection<>(targets);
 }
 /**
  * Constructs a new instance given all its properties.
  *
  * @param afCalculatorSupplier supplier of the calculator that realizes this implementation.
  * @param requiredPloidy the required ploidy; zero or greater or {@link #UNBOUND_PLOIDY} to
  *     indicate that any ploidy is supported.
  * @param maxAltAlleles the maximum alternative alleles; zero or greater or {@link
  *     #UNBOUND_ALTERNATIVE_ALLELE_COUNT} to indicate that any maximum number of alternative
  *     alleles is supported.
  */
 AFCalculatorImplementation(
     final Supplier<AFCalculator> afCalculatorSupplier,
     final int requiredPloidy,
     final int maxAltAlleles) {
   Utils.nonNull(afCalculatorSupplier);
   this.afCalculatorSupplier = afCalculatorSupplier;
   this.requiredPloidy = requiredPloidy;
   this.maxAltAlleles = maxAltAlleles;
 }
Example #15
  /**
   * Create a common SAMFileWriter for use with GATK tools.
   *
   * @param outputFile - if this file has a .cram extension then a reference is required. Cannot
   *     be null.
   * @param referenceFile - the reference source to use. Cannot be null if the output file has a
   *     .cram extension.
   * @param header - header to be used for the output writer
   * @param preSorted - if true then the records must already be sorted to match the header sort
   *     order
   * @param createOutputBamIndex - if true an index will be created for .BAM and .CRAM files
   * @param createMD5 - if true an MD5 file will be created
   * @return SAMFileWriter
   */
  public static SAMFileWriter createCommonSAMWriter(
      final File outputFile,
      final File referenceFile,
      final SAMFileHeader header,
      final boolean preSorted,
      boolean createOutputBamIndex,
      final boolean createMD5) {
    Utils.nonNull(outputFile);
    Utils.nonNull(header);

    if (createOutputBamIndex && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
      logger.warn(
          "Skipping index file creation for: "
              + outputFile.getAbsolutePath()
              + ". Index file creation requires reads in coordinate sorted order.");
      createOutputBamIndex = false;
    }

    final SAMFileWriterFactory factory =
        new SAMFileWriterFactory().setCreateIndex(createOutputBamIndex).setCreateMd5File(createMD5);
    return ReadUtils.createCommonSAMWriterFromFactory(
        factory, outputFile, referenceFile, header, preSorted);
  }
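A hypothetical usage sketch: writing a coordinate-sorted BAM with an index. The output path is made up, and the import assumes GATK4's org.broadinstitute.hellbender.utils.read.ReadUtils is on the classpath; everything else is plain htsjdk.

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import java.io.File;
import java.util.Collections;
import org.broadinstitute.hellbender.utils.read.ReadUtils; // assumed GATK4 package

public final class SamWriterSketch {
  public static void main(final String[] args) throws Exception {
    // Minimal coordinate-sorted header with a single contig.
    final SAMFileHeader header = new SAMFileHeader();
    header.setSequenceDictionary(
        new SAMSequenceDictionary(
            Collections.singletonList(new SAMSequenceRecord("chr1", 1_000_000))));
    header.setSortOrder(SAMFileHeader.SortOrder.coordinate);

    // BAM output needs no reference, so referenceFile may be null; the index is only
    // created because the header is coordinate sorted (otherwise it is silently skipped).
    try (SAMFileWriter writer =
        ReadUtils.createCommonSAMWriter(
            new File("output.bam"), null, header, true, true, false)) {
      // writer.addAlignment(record) would be called here for each SAMRecord.
    }
  }
}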
 /**
  * Merges the argument into this histogram generator. Returns the modified 'this' object. Note:
  * you can only merge HistogramGenerators if they have the same 'useOriginalQualities' value.
  */
 public HistogramGenerator merge(final HistogramGenerator hg2) {
   Utils.nonNull(hg2);
   if (this.useOriginalQualities != hg2.useOriginalQualities) {
     throw new IllegalArgumentException(
         "unequal useOriginalQualities. This has " + this.useOriginalQualities);
   }
   ensureArraysBigEnough(hg2.maxLengthSoFar);
   for (int i = 0; i < hg2.firstReadTotalsByCycle.length; i++) {
     this.firstReadTotalsByCycle[i] += hg2.firstReadTotalsByCycle[i];
   }
   for (int i = 0; i < hg2.secondReadTotalsByCycle.length; i++) {
     this.secondReadTotalsByCycle[i] += hg2.secondReadTotalsByCycle[i];
   }
   for (int i = 0; i < hg2.firstReadCountsByCycle.length; i++) {
     this.firstReadCountsByCycle[i] += hg2.firstReadCountsByCycle[i];
   }
   for (int i = 0; i < hg2.secondReadCountsByCycle.length; i++) {
     this.secondReadCountsByCycle[i] += hg2.secondReadCountsByCycle[i];
   }
   return this;
 }
 public Pulldown(final SAMFileHeader header) {
   this.header = Utils.nonNull(header, "SAMFileHeader must be supplied.");
 }
Example #18
 /**
  * Calculates the reference coordinate for the beginning of the read taking into account soft
  * clips but not hard clips.
  *
  * <p>Note: getUnclippedStart() adds soft and hard clips, whereas this function only adds soft
  * clips.
  *
  * @return the unclipped start of the read taking soft clips (but not hard clips) into account
  */
 public static int getSoftStart(final GATKRead read) {
   Utils.nonNull(read);
   return getSoftStart(read, read.getCigar());
 }
 /**
  * Transforms and composes the string representation of an individual count.
  *
  * <p>The output string must be a fully formatted, human-friendly representation of the
  * transformed value.
  *
  * @param count the individual count value.
  * @param columnTotal the corresponding column total sum.
  * @return never {@code null}.
  * @throws IllegalArgumentException if {@code count} is less than 0 or greater than {@code
  *     columnTotal}.
  */
 protected String apply(final int count, final long columnTotal) {
   ParamUtils.isPositiveOrZero(count, "the count cannot be less than 0");
   Utils.validateArg(count <= columnTotal, "the count cannot be larger than the column total");
   return operator.apply(count, columnTotal);
 }
 /**
  * Composes the target information output string.
  *
  * @param index the index of the target in the collection.
  * @param collection the target-containing collection.
  * @throws IllegalArgumentException if {@code collection} is {@code null} or {@code index} is not
  *     a valid index into {@code collection}.
  */
 protected String composeTargetOutInfoString(
     final int index, final TargetCollection<Target> collection) {
   Utils.nonNull(collection, "the collection cannot be null");
   Utils.validIndex(index, collection.targetCount());
   return composer.apply(index, collection);
 }
 /**
  * Constructor that reads (sequence, position, reference count, alternate count) from the
  * specified file and uses an external SAMFile header to construct the Pulldown.
  *
  * @param inputFile file to read from
  * @param header SAMFile header for IntervalList
  */
 public Pulldown(final File inputFile, final SAMFileHeader header) {
   super(inputFile);
   this.header = Utils.nonNull(header, "SAMFileHeader must be supplied.");
 }
 public VariantTypesVariantFilter(Set<VariantContext.Type> includeTypes) {
   Utils.nonNull(includeTypes);
   sampleTypes = includeTypes;
 }
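A possible usage sketch, assuming the filter behaves as a Predicate over VariantContext (as GATK's variant filters generally do); the VariantContext is built with plain htsjdk, and the source and contig names are arbitrary.

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import java.util.Arrays;
import java.util.EnumSet;

public final class VariantTypeFilterSketch {
  public static void main(final String[] args) {
    // A single-site SNP record: A -> T at chr1:100.
    final VariantContext snp =
        new VariantContextBuilder(
                "sketch", "chr1", 100, 100,
                Arrays.asList(Allele.create("A", true), Allele.create("T", false)))
            .make();

    // Keep only SNPs and indels.
    final VariantTypesVariantFilter filter =
        new VariantTypesVariantFilter(
            EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.INDEL));

    System.out.println(filter.test(snp)); // expected: true, assuming a Predicate-style test method
  }
}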
  /**
   * Tangent normalize a coverage profile.
   *
   * <p>Notes about the Spark tangent normalization can be found in docs/PoN/
   *
   * @param pon Not {@code null}
   * @param targetFactorNormalizedCounts ReadCountCollection of counts that have already been fully
   *     normalized (typically, including the target factor normalization), i.e. a coverage
   *     profile. The column names should be intact. Not {@code null}. See {@link
   *     TangentNormalizer#createCoverageProfile}.
   * @param ctx spark context. Use {@code null} if no context is available.
   * @return never {@code null}
   */
  private static TangentNormalizationResult tangentNormalize(
      final PoN pon, final ReadCountCollection targetFactorNormalizedCounts, JavaSparkContext ctx) {

    Utils.nonNull(pon, "PoN cannot be null.");
    Utils.nonNull(targetFactorNormalizedCounts, "targetFactorNormalizedCounts cannot be null.");
    Utils.nonNull(
        targetFactorNormalizedCounts.columnNames(),
        "targetFactorNormalizedCounts column names cannot be null.");
    ParamUtils.isPositive(
        targetFactorNormalizedCounts.columnNames().size(),
        "targetFactorNormalizedCounts column names cannot be an empty list.");

    final Case2PoNTargetMapper targetMapper =
        new Case2PoNTargetMapper(targetFactorNormalizedCounts.targets(), pon.getPanelTargetNames());

    // The input counts with rows (targets) sorted so that they match the PoN's order.
    final RealMatrix tangentNormalizationRawInputCounts =
        targetMapper.fromCaseToPoNCounts(targetFactorNormalizedCounts.counts());

    // We prepare the counts for tangent normalization.
    final RealMatrix tangentNormalizationInputCounts =
        composeTangentNormalizationInputMatrix(tangentNormalizationRawInputCounts);

    if (ctx == null) {

      // Calculate the beta-hats for the input read count columns (samples).
      logger.info("Calculating beta hats...");
      final RealMatrix tangentBetaHats =
          pon.betaHats(tangentNormalizationInputCounts, true, EPSILON);

      // Actual tangent normalization step.
      logger.info(
          "Performing actual tangent normalization ("
              + tangentNormalizationInputCounts.getColumnDimension()
              + " columns)...");
      final RealMatrix tangentNormalizedCounts =
          pon.tangentNormalization(tangentNormalizationInputCounts, tangentBetaHats, true);

      // Output the tangent normalized counts.
      logger.info("Post-processing tangent normalization results...");
      final ReadCountCollection tangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
      final ReadCountCollection preTangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());

      return new TangentNormalizationResult(
          tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts);

    } else {

      /*
      Using Spark:  the code here is a little more complex for optimization purposes.

      Please see notes in docs/PoN ...

      Ahat^T = (C^T P^T) A^T
      Therefore, C^T is the RowMatrix

      pinv: P
      panel: A
      projection: Ahat
      cases: C
      betahat: C^T P^T
      tangentNormalizedCounts: C - Ahat
       */
      final RealMatrix pinv = pon.getReducedPanelPInverseCounts();
      final RealMatrix panel = pon.getReducedPanelCounts();

      // Make the C^T a distributed matrix (RowMatrix)
      final RowMatrix caseTDistMat =
          SparkConverter.convertRealMatrixToSparkRowMatrix(
              ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

      // Spark local matrices (transposed)
      final Matrix pinvTLocalMat =
          new DenseMatrix(
                  pinv.getRowDimension(),
                  pinv.getColumnDimension(),
                  Doubles.concat(pinv.getData()),
                  true)
              .transpose();
      final Matrix panelTLocalMat =
          new DenseMatrix(
                  panel.getRowDimension(),
                  panel.getColumnDimension(),
                  Doubles.concat(panel.getData()),
                  true)
              .transpose();

      // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons
      // matrix (not transposed)
      final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
      final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
      final RealMatrix projection =
          SparkConverter.convertSparkRowMatrixToRealMatrix(
                  projectionTDistMat, tangentNormalizationInputCounts.transpose().getRowDimension())
              .transpose();

      // Subtract the cases from the projection
      final RealMatrix tangentNormalizedCounts =
          tangentNormalizationInputCounts.subtract(projection);

      // Construct the result object and return it with the correct targets.
      final ReadCountCollection tangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
      final ReadCountCollection preTangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
      final RealMatrix tangentBetaHats =
          SparkConverter.convertSparkRowMatrixToRealMatrix(
              betahatDistMat, tangentNormalizedCounts.getColumnDimension());
      return new TangentNormalizationResult(
          tangentNormalized,
          preTangentNormalized,
          tangentBetaHats.transpose(),
          targetFactorNormalizedCounts);
    }
  }
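To make the matrix algebra in the Spark comment concrete, here is a toy, non-Spark sketch of the projection step using Apache Commons Math; the numbers are arbitrary and the PoN pseudo-inverse is supplied directly rather than computed, so it only traces the shapes and the order of the multiplications.

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;

public final class TangentProjectionSketch {
  public static void main(final String[] args) {
    // Toy shapes: 3 targets, 2 case samples, 2 reduced-panel columns.
    // cases C: targets x samples; panel A: targets x panel columns; pinv P: panel columns x targets.
    final RealMatrix cases = new Array2DRowRealMatrix(new double[][] {{1, 2}, {3, 4}, {5, 6}});
    final RealMatrix panel = new Array2DRowRealMatrix(new double[][] {{1, 0}, {0, 1}, {1, 1}});
    final RealMatrix pinv =
        new Array2DRowRealMatrix(new double[][] {{0.5, 0.0, 0.25}, {0.0, 0.5, 0.25}});

    // betahat^T = P C   (the comment's betahat = C^T P^T, just untransposed)
    final RealMatrix betaHatsT = pinv.multiply(cases);
    // Ahat = A betahat^T   (the comment's Ahat^T = (C^T P^T) A^T)
    final RealMatrix projection = panel.multiply(betaHatsT);
    // tangent-normalized counts = C - Ahat
    final RealMatrix tangentNormalized = cases.subtract(projection);

    System.out.println(tangentNormalized);
  }
}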