Exemplo n.º 1
0
  /**
   * Builds a coverage profile by applying the PoN's factor normalization to the input counts.
   *
   * <p>The input counts are first converted to the PoN's target layout through a {@code
   * Case2PoNTargetMapper}, then passed to {@code pon.factorNormalization}, and finally converted
   * back into a {@code ReadCountCollection} that keeps the input's column names.
   *
   * @param pon panel of normals providing the target names and the factor normalization; checked
   *     with {@code Utils.nonNull}.
   * @param inputReadCounts the read counts to normalize; checked with {@code Utils.nonNull}, and
   *     its column-name list must contain at least one entry.
   * @return the read count collection produced by the mapper from the factor-normalized counts.
   */
  private static ReadCountCollection createCoverageProfile(
      final PoN pon, final ReadCountCollection inputReadCounts) {
    // Argument validation: both inputs must be present and at least one column must exist.
    Utils.nonNull(pon, "PoN cannot be null.");
    Utils.nonNull(
        inputReadCounts, "input read counts cannot be null when creating a coverage profile.");
    ParamUtils.isPositive(
        inputReadCounts.columnNames().size(),
        "inputReadCounts column names cannot be an empty list.");

    // Translates between the case's target layout and the PoN's target layout.
    final Case2PoNTargetMapper caseToPoN =
        new Case2PoNTargetMapper(inputReadCounts.targets(), pon.getTargetNames());

    // Map into PoN space and factor-normalize in a single expression.
    final RealMatrix factorNormalized =
        pon.factorNormalization(caseToPoN.fromCaseToPoNCounts(inputReadCounts.counts()));

    // Map back to the case layout, re-attaching the original column names.
    return caseToPoN.fromPoNtoCaseCountCollection(factorNormalized, inputReadCounts.columnNames());
  }
Exemplo n.º 2
0
  /**
   * Tangent normalizes the normals stored in the PoN against that same PoN.
   *
   * <p>The PoN's targets, sample names and normalized counts are packaged into a {@code
   * ReadCountCollection}, which is then handed to {@code TangentNormalizer.tangentNormalize}
   * together with the PoN and the (possibly absent) Spark context.
   *
   * @param pon the panel of normals whose samples are projected; must not be {@code null}.
   * @param ctx the Spark context to use. {@code null} is supported and indicates that no Spark
   *     context is available.
   * @return never {@code null}. The ReadCountCollection attributes of the result will contain
   *     multiple columns.
   */
  public static TangentNormalizationResult tangentNormalizeNormalsInPoN(
      final PoN pon, final JavaSparkContext ctx) {
    // Work on a private, modifiable copy of the PoN's sample names.
    final List<String> modifiableSampleNames = new ArrayList<>(pon.getSampleNames());

    logger.info("Loading normalized counts...");
    final ReadCountCollection normalsProfile =
        new ReadCountCollection(
            SetUniqueList.setUniqueList(pon.getTargets()),
            SetUniqueList.setUniqueList(modifiableSampleNames),
            pon.getNormalizedCounts());

    logger.info("Tangent normalizing the normals (normalized by target factors) ...");

    // Every sample in the PoN is tangent normalized against the PoN itself.
    final TangentNormalizationResult normalsResult =
        TangentNormalizer.tangentNormalize(pon, normalsProfile, ctx);
    return normalsResult;
  }
Exemplo n.º 3
0
  /**
   * Tangent normalize a coverage profile.
   *
   * <p>Two code paths produce the result: a local one (used when {@code ctx} is {@code null}) that
   * delegates to {@code pon.betaHats} and {@code pon.tangentNormalization}, and a Spark one that
   * performs the projection with distributed matrix multiplications.
   *
   * <p>Notes about the Spark tangent normalization can be found in docs/PoN/
   *
   * @param pon Not {@code null}
   * @param targetFactorNormalizedCounts ReadCountCollection of counts that have already been
   *     normalized fully (typically, including the target factor normalization), i.e. a coverage
   *     profile. The column names should be intact. Not {@code null}. See {@link
   *     TangentNormalizer#createCoverageProfile}
   * @param ctx Spark context to run the projection on; {@code null} selects the local (non-Spark)
   *     code path.
   * @return never {@code null}
   */
  private static TangentNormalizationResult tangentNormalize(
      final PoN pon,
      final ReadCountCollection targetFactorNormalizedCounts,
      final JavaSparkContext ctx) {

    Utils.nonNull(pon, "PoN cannot be null.");
    Utils.nonNull(targetFactorNormalizedCounts, "targetFactorNormalizedCounts cannot be null.");
    Utils.nonNull(
        targetFactorNormalizedCounts.columnNames(),
        "targetFactorNormalizedCounts column names cannot be null.");
    ParamUtils.isPositive(
        targetFactorNormalizedCounts.columnNames().size(),
        "targetFactorNormalizedCounts column names cannot be an empty list.");

    final Case2PoNTargetMapper targetMapper =
        new Case2PoNTargetMapper(targetFactorNormalizedCounts.targets(), pon.getPanelTargetNames());

    // The input counts with rows (targets) sorted so that they match the PoN's order.
    final RealMatrix tangentNormalizationRawInputCounts =
        targetMapper.fromCaseToPoNCounts(targetFactorNormalizedCounts.counts());

    // We prepare the counts for tangent normalization.
    final RealMatrix tangentNormalizationInputCounts =
        composeTangentNormalizationInputMatrix(tangentNormalizationRawInputCounts);

    if (ctx == null) {

      // Calculate the beta-hats for the input read count columns (samples).
      logger.info("Calculating beta hats...");
      final RealMatrix tangentBetaHats =
          pon.betaHats(tangentNormalizationInputCounts, true, EPSILON);

      // Actual tangent normalization step.
      logger.info(
          "Performing actual tangent normalization ("
              + tangentNormalizationInputCounts.getColumnDimension()
              + " columns)...");
      final RealMatrix tangentNormalizedCounts =
          pon.tangentNormalization(tangentNormalizationInputCounts, tangentBetaHats, true);

      // Output the tangent normalized counts.
      logger.info("Post-processing tangent normalization results...");
      final ReadCountCollection tangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
      final ReadCountCollection preTangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());

      return new TangentNormalizationResult(
          tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts);

    } else {

      /*
      Using Spark:  the code here is a little more complex for optimization purposes.

      Please see notes in docs/PoN ...

      Ahat^T = (C^T P^T) A^T
      Therefore, C^T is the RowMatrix

      pinv: P
      panel: A
      projection: Ahat
      cases: C
      betahat: C^T P^T
      tangentNormalizedCounts: C - Ahat
       */
      final RealMatrix pinv = pon.getReducedPanelPInverseCounts();
      final RealMatrix panel = pon.getReducedPanelCounts();

      // Number of case columns in C, which is also the number of rows of C^T and of the
      // distributed matrices derived from it. Read it from the un-transposed matrix so that no
      // extra transposed copy has to be built just to obtain a dimension.
      final int numCases = tangentNormalizationInputCounts.getColumnDimension();

      // Make the C^T a distributed matrix (RowMatrix)
      final RowMatrix caseTDistMat =
          SparkConverter.convertRealMatrixToSparkRowMatrix(
              ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

      // Spark local matrices (transposed)
      final Matrix pinvTLocalMat =
          new DenseMatrix(
                  pinv.getRowDimension(),
                  pinv.getColumnDimension(),
                  Doubles.concat(pinv.getData()),
                  true)
              .transpose();
      final Matrix panelTLocalMat =
          new DenseMatrix(
                  panel.getRowDimension(),
                  panel.getColumnDimension(),
                  Doubles.concat(panel.getData()),
                  true)
              .transpose();

      // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons
      // matrix (not transposed)
      final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
      final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
      final RealMatrix projection =
          SparkConverter.convertSparkRowMatrixToRealMatrix(projectionTDistMat, numCases)
              .transpose();

      // Subtract the projection from the cases (C - Ahat).
      final RealMatrix tangentNormalizedCounts =
          tangentNormalizationInputCounts.subtract(projection);

      // Construct the result object and return it with the correct targets.
      final ReadCountCollection tangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
      final ReadCountCollection preTangentNormalized =
          targetMapper.fromPoNtoCaseCountCollection(
              tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
      final RealMatrix tangentBetaHats =
          SparkConverter.convertSparkRowMatrixToRealMatrix(betahatDistMat, numCases);
      // The distributed beta-hats are C^T P^T; transpose them before handing them to the result.
      return new TangentNormalizationResult(
          tangentNormalized,
          preTangentNormalized,
          tangentBetaHats.transpose(),
          targetFactorNormalizedCounts);
    }
  }