private static ReadCountCollection createCoverageProfile( final PoN pon, final ReadCountCollection inputReadCounts) { Utils.nonNull(pon, "PoN cannot be null."); Utils.nonNull( inputReadCounts, "input read counts cannot be null when creating a coverage profile."); ParamUtils.isPositive( inputReadCounts.columnNames().size(), "inputReadCounts column names cannot be an empty list."); final Case2PoNTargetMapper targetMapper = new Case2PoNTargetMapper(inputReadCounts.targets(), pon.getTargetNames()); final RealMatrix inputCounts = targetMapper.fromCaseToPoNCounts(inputReadCounts.counts()); final RealMatrix targetNormalizedCounts = pon.factorNormalization(inputCounts); return targetMapper.fromPoNtoCaseCountCollection( targetNormalizedCounts, inputReadCounts.columnNames()); }
/** * Project all of the normals used to create the PoN into the reduced panel. * * @param pon Not {@code null}. * @param ctx Spark context, {@code null} indicates no spark context available, which is * supported. * @return never {@code null}. Result will contain multiple columns in each of the * ReadCountCollection attributes. */ public static TangentNormalizationResult tangentNormalizeNormalsInPoN( final PoN pon, final JavaSparkContext ctx) { // Get the list of sample names in a modifiable List final List<String> sampleNamesCopy = new ArrayList<>(pon.getSampleNames()); logger.info("Loading normalized counts..."); final ReadCountCollection coverageProfile = new ReadCountCollection( SetUniqueList.setUniqueList(pon.getTargets()), SetUniqueList.setUniqueList(sampleNamesCopy), pon.getNormalizedCounts()); logger.info("Tangent normalizing the normals (normalized by target factors) ..."); // For each sample in the PoN, tangent normalize against the qc reduced PoN. return TangentNormalizer.tangentNormalize(pon, coverageProfile, ctx); }
/** * Tangent normalize a coverage profile. * * <p>Notes about the Spark tangent normalization can be found in docs/PoN/ * * @param pon Not {@code null} * @param targetFactorNormalizedCounts ReadCountCollection of counts that have already been * normalized fully (typically, including the target factor normalization). I.e. a coverage * profile The column names should be intact. Not {@code null} See {@link * TangentNormalizer::createCoverageProfile} * @return never {@code null} */ private static TangentNormalizationResult tangentNormalize( final PoN pon, final ReadCountCollection targetFactorNormalizedCounts, JavaSparkContext ctx) { Utils.nonNull(pon, "PoN cannot be null."); Utils.nonNull(targetFactorNormalizedCounts, "targetFactorNormalizedCounts cannot be null."); Utils.nonNull( targetFactorNormalizedCounts.columnNames(), "targetFactorNormalizedCounts column names cannot be null."); ParamUtils.isPositive( targetFactorNormalizedCounts.columnNames().size(), "targetFactorNormalizedCounts column names cannot be an empty list."); final Case2PoNTargetMapper targetMapper = new Case2PoNTargetMapper(targetFactorNormalizedCounts.targets(), pon.getPanelTargetNames()); // The input counts with rows (targets) sorted so that they match the PoN's order. final RealMatrix tangentNormalizationRawInputCounts = targetMapper.fromCaseToPoNCounts(targetFactorNormalizedCounts.counts()); // We prepare the counts for tangent normalization. final RealMatrix tangentNormalizationInputCounts = composeTangentNormalizationInputMatrix(tangentNormalizationRawInputCounts); if (ctx == null) { // Calculate the beta-hats for the input read count columns (samples). logger.info("Calculating beta hats..."); final RealMatrix tangentBetaHats = pon.betaHats(tangentNormalizationInputCounts, true, EPSILON); // Actual tangent normalization step. logger.info( "Performing actual tangent normalization (" + tangentNormalizationInputCounts.getColumnDimension() + " columns)..."); final RealMatrix tangentNormalizedCounts = pon.tangentNormalization(tangentNormalizationInputCounts, tangentBetaHats, true); // Output the tangent normalized counts. logger.info("Post-processing tangent normalization results..."); final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection( tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames()); final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection( tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames()); return new TangentNormalizationResult( tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts); } else { /* Using Spark: the code here is a little more complex for optimization purposes. Please see notes in docs/PoN ... Ahat^T = (C^T P^T) A^T Therefore, C^T is the RowMatrix pinv: P panel: A projection: Ahat cases: C betahat: C^T P^T tangentNormalizedCounts: C - Ahat */ final RealMatrix pinv = pon.getReducedPanelPInverseCounts(); final RealMatrix panel = pon.getReducedPanelCounts(); // Make the C^T a distributed matrix (RowMatrix) final RowMatrix caseTDistMat = SparkConverter.convertRealMatrixToSparkRowMatrix( ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK); // Spark local matrices (transposed) final Matrix pinvTLocalMat = new DenseMatrix( pinv.getRowDimension(), pinv.getColumnDimension(), Doubles.concat(pinv.getData()), true) .transpose(); final Matrix panelTLocalMat = new DenseMatrix( panel.getRowDimension(), panel.getColumnDimension(), Doubles.concat(panel.getData()), true) .transpose(); // Calculate the projection transpose in a distributed matrix, then convert to Apache Commons // matrix (not transposed) final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat); final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat); final RealMatrix projection = SparkConverter.convertSparkRowMatrixToRealMatrix( projectionTDistMat, tangentNormalizationInputCounts.transpose().getRowDimension()) .transpose(); // Subtract the cases from the projection final RealMatrix tangentNormalizedCounts = tangentNormalizationInputCounts.subtract(projection); // Construct the result object and return it with the correct targets. final ReadCountCollection tangentNormalized = targetMapper.fromPoNtoCaseCountCollection( tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames()); final ReadCountCollection preTangentNormalized = targetMapper.fromPoNtoCaseCountCollection( tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames()); final RealMatrix tangentBetaHats = SparkConverter.convertSparkRowMatrixToRealMatrix( betahatDistMat, tangentNormalizedCounts.getColumnDimension()); return new TangentNormalizationResult( tangentNormalized, preTangentNormalized, tangentBetaHats.transpose(), targetFactorNormalizedCounts); } }