/**
 * Demonstrates the three Spark MLlib distributed matrix types — {@code RowMatrix},
 * {@code IndexedRowMatrix} and {@code CoordinateMatrix} — plus converting a
 * {@code RowMatrix} into an {@code IndexedRowMatrix}.
 *
 * @param args unused
 * @throws IOException declared for API compatibility; nothing in this body visibly throws it
 */
public static void main(String[] args) throws IOException {
  SparkConf config =
      new SparkConf().setAppName("003-distributed-matrices").setMaster("local[*]");
  try (JavaSparkContext sc = new JavaSparkContext(config)) {
    /* Create a RowMatrix: 10 dense row vectors spread over 4 partitions. */
    List<Vector> vectors = new ArrayList<>(10);
    for (int i = 0; i < 10; i++) {
      vectors.add(Vectors.dense(getVectorElements()));
    }
    JavaRDD<Vector> rowsRDD = sc.parallelize(vectors, 4);
    RowMatrix rowMatrix = new RowMatrix(rowsRDD.rdd());
    // println(Object) already invokes toString(); no explicit call needed.
    System.out.println(rowMatrix);

    /* Create an IndexedRowMatrix from explicitly indexed rows. */
    JavaRDD<IndexedRow> indexedRows =
        sc.parallelize(
            Arrays.asList(
                new IndexedRow(0, vectors.get(0)), new IndexedRow(1, vectors.get(1))));
    IndexedRowMatrix indexedRowMatrix = new IndexedRowMatrix(indexedRows.rdd());
    System.out.println(indexedRowMatrix);

    /* Convert RowMatrix -> IndexedRowMatrix by zipping each row with its RDD index. */
    JavaRDD<IndexedRow> indexedRowsFromRowMatrix =
        rowMatrix
            .rows()
            .toJavaRDD()
            .zipWithIndex()
            .map((Tuple2<Vector, Long> t) -> new IndexedRow(t._2(), t._1()));
    IndexedRowMatrix indexedRowMatrixFromRowMatrix =
        new IndexedRowMatrix(indexedRowsFromRowMatrix.rdd());
    System.out.println(indexedRowMatrixFromRowMatrix);

    /* Create a CoordinateMatrix from (row, col, value) entries.
     * NOTE: the entries below build the 3x2 matrix
     *   M = [ 5 0
     *         0 3
     *         1 4 ]
     * (the original comment described its transpose [5 0 1; 0 3 4],
     * which does not match the row/column indices used). */
    JavaRDD<MatrixEntry> matrixEntries =
        sc.parallelize(
            Arrays.asList(
                new MatrixEntry(0, 0, 5.),
                new MatrixEntry(1, 1, 3.),
                new MatrixEntry(2, 0, 1.),
                new MatrixEntry(2, 1, 4.)));
    CoordinateMatrix coordMatrix = new CoordinateMatrix(matrixEntries.rdd());
    System.out.println(coordMatrix);

    printSeparator();
  }
}
/**
 * Tangent normalize a coverage profile.
 *
 * <p>Notes about the Spark tangent normalization can be found in docs/PoN/
 *
 * @param pon Not {@code null}
 * @param targetFactorNormalizedCounts ReadCountCollection of counts that have already been
 *     normalized fully (typically, including the target factor normalization). I.e. a coverage
 *     profile. The column names should be intact. Not {@code null}. See {@link
 *     TangentNormalizer#createCoverageProfile}
 * @param ctx Spark context; when {@code null} the normalization is performed locally with
 *     Apache Commons Math instead of Spark
 * @return never {@code null}
 */
private static TangentNormalizationResult tangentNormalize(
    final PoN pon, final ReadCountCollection targetFactorNormalizedCounts, JavaSparkContext ctx) {
  Utils.nonNull(pon, "PoN cannot be null.");
  Utils.nonNull(targetFactorNormalizedCounts, "targetFactorNormalizedCounts cannot be null.");
  Utils.nonNull(
      targetFactorNormalizedCounts.columnNames(),
      "targetFactorNormalizedCounts column names cannot be null.");
  ParamUtils.isPositive(
      targetFactorNormalizedCounts.columnNames().size(),
      "targetFactorNormalizedCounts column names cannot be an empty list.");

  // Maps case-sample targets onto the PoN's target space (and back again below).
  final Case2PoNTargetMapper targetMapper =
      new Case2PoNTargetMapper(targetFactorNormalizedCounts.targets(), pon.getPanelTargetNames());

  // The input counts with rows (targets) sorted so that they match the PoN's order.
  final RealMatrix tangentNormalizationRawInputCounts =
      targetMapper.fromCaseToPoNCounts(targetFactorNormalizedCounts.counts());

  // We prepare the counts for tangent normalization.
  final RealMatrix tangentNormalizationInputCounts =
      composeTangentNormalizationInputMatrix(tangentNormalizationRawInputCounts);

  if (ctx == null) {
    // Local (non-Spark) path: everything is done with Apache Commons Math matrices.

    // Calculate the beta-hats for the input read count columns (samples).
    logger.info("Calculating beta hats...");
    final RealMatrix tangentBetaHats =
        pon.betaHats(tangentNormalizationInputCounts, true, EPSILON);

    // Actual tangent normalization step.
    logger.info(
        "Performing actual tangent normalization ("
            + tangentNormalizationInputCounts.getColumnDimension()
            + " columns)...");
    final RealMatrix tangentNormalizedCounts =
        pon.tangentNormalization(tangentNormalizationInputCounts, tangentBetaHats, true);

    // Output the tangent normalized counts, mapped back to the case sample's target space.
    logger.info("Post-processing tangent normalization results...");
    final ReadCountCollection tangentNormalized =
        targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized =
        targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());

    return new TangentNormalizationResult(
        tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts);
  } else {
    /* Using Spark: the code here is a little more complex for optimization purposes.

       Please see notes in docs/PoN ...

       Ahat^T = (C^T P^T) A^T

       Therefore, C^T is the RowMatrix

       pinv: P
       panel: A
       projection: Ahat
       cases: C
       betahat: C^T P^T
       tangentNormalizedCounts: C - Ahat
    */
    final RealMatrix pinv = pon.getReducedPanelPInverseCounts();
    final RealMatrix panel = pon.getReducedPanelCounts();

    // Make the C^T a distributed matrix (RowMatrix).
    final RowMatrix caseTDistMat =
        SparkConverter.convertRealMatrixToSparkRowMatrix(
            ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

    // Spark local matrices (transposed).
    // Doubles.concat flattens the 2-D data row-major; the 'true' flag and the trailing
    // transpose() yield the transposed local matrix that RowMatrix.multiply expects.
    final Matrix pinvTLocalMat =
        new DenseMatrix(
                pinv.getRowDimension(),
                pinv.getColumnDimension(),
                Doubles.concat(pinv.getData()),
                true)
            .transpose();
    final Matrix panelTLocalMat =
        new DenseMatrix(
                panel.getRowDimension(),
                panel.getColumnDimension(),
                Doubles.concat(panel.getData()),
                true)
            .transpose();

    // Calculate the projection transpose in a distributed matrix, then convert to Apache
    // Commons matrix (not transposed).
    final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
    final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
    final RealMatrix projection =
        SparkConverter.convertSparkRowMatrixToRealMatrix(
                projectionTDistMat, tangentNormalizationInputCounts.transpose().getRowDimension())
            .transpose();

    // Subtract the cases from the projection: tangentNormalizedCounts = C - Ahat.
    final RealMatrix tangentNormalizedCounts =
        tangentNormalizationInputCounts.subtract(projection);

    // Construct the result object and return it with the correct targets.
    final ReadCountCollection tangentNormalized =
        targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
    final ReadCountCollection preTangentNormalized =
        targetMapper.fromPoNtoCaseCountCollection(
            tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
    // betahatDistMat holds C^T P^T; convert back and transpose to get the beta-hats in the
    // same orientation as the local (non-Spark) path returns.
    final RealMatrix tangentBetaHats =
        SparkConverter.convertSparkRowMatrixToRealMatrix(
            betahatDistMat, tangentNormalizedCounts.getColumnDimension());

    return new TangentNormalizationResult(
        tangentNormalized,
        preTangentNormalized,
        tangentBetaHats.transpose(),
        targetFactorNormalizedCounts);
  }
}