/**
 * Loads the in-memory operands used by this task before any record is processed.
 *
 * <p>Reads the matrix/vector locations, the matrix dimensions and the error sample rate from the
 * job configuration, converts both distributed matrices with {@code PCACommon.toDenseMatrix},
 * loads the two vectors with {@code PCACommon.toDenseVector}, and allocates the accumulator
 * vectors sized by {@code matrixC.numRows()}.
 *
 * @param context the task context supplying the job {@link Configuration}
 * @throws IOException if loading the {@code zm} or {@code ym} vector fails; the failure is
 *     propagated so the task aborts instead of continuing with unset vectors
 */
@Override
public void setup(Context context) throws IOException {
  Configuration conf = context.getConfiguration();
  Path cMemMatrixPath = new Path(conf.get(RECONSTRUCTIONMATRIX));
  Path dMemMatrixPath = new Path(conf.get(MATRIXY2X));
  Path zmPath = new Path(conf.get(ZMPATH));
  Path meanPath = new Path(conf.get(YMPATH));
  // Dimensions default to 0 and the sample rate to 1 when the keys are absent.
  int inMemMatrixNumRows = conf.getInt(YCOLS, 0);
  int inMemMatrixNumCols = conf.getInt(XCOLS, 0);
  ERR_SAMPLE_RATE = conf.getFloat(ERRSAMPLERATE, 1);

  // Both matrices share the parent directory of the reconstruction matrix as their temp path.
  Path tmpPath = cMemMatrixPath.getParent();
  DistributedRowMatrix distMatrix =
      new DistributedRowMatrix(cMemMatrixPath, tmpPath, inMemMatrixNumRows, inMemMatrixNumCols);
  distMatrix.setConf(conf);
  matrixC = PCACommon.toDenseMatrix(distMatrix);

  distMatrix =
      new DistributedRowMatrix(dMemMatrixPath, tmpPath, inMemMatrixNumRows, inMemMatrixNumCols);
  distMatrix.setConf(conf);
  matrixY2X = PCACommon.toDenseMatrix(distMatrix);

  // Previously an IOException here was only printed, leaving zm/ym unset; let it propagate.
  zm = PCACommon.toDenseVector(zmPath, conf);
  ym = PCACommon.toDenseVector(meanPath, conf);

  xiCt = new DenseVector(matrixC.numRows());
  sumOfErr = new DenseVector(matrixC.numRows());
  sumOfyi = new DenseVector(matrixC.numRows());
  sumOfyc = new DenseVector(matrixC.numRows());
}
/** * Progammatic invocation of run() * * @param eigenInput Output of LanczosSolver * @param corpusInput Input of LanczosSolver */ public void runJob( Configuration conf, Path eigenInput, Path corpusInput, Path output, boolean inMemory, double maxError, int maxEigens) throws IOException { // no need to handle command line arguments outPath = output; tmpOut = new Path(outPath, "tmp"); maxEigensToKeep = maxEigens; this.maxError = maxError; if (eigenInput != null && eigensToVerify == null) { prepareEigens(new Configuration(conf), eigenInput, inMemory); } DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1); c.setConf(new Configuration(conf)); corpus = c; eigenVerifier = new SimpleEigenVerifier(); Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens(); List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData); saveCleanEigens(conf, prunedEigenMeta); }
/**
 * Runs the solver and writes the raw eigenvectors beneath {@code outputPath}.
 *
 * @param inputPath the Path to the input corpus
 * @param outputPath the Path to the output; results go to its {@code RAW_EIGENVECTORS} child
 * @param outputTmpPath a Path to a temporary working directory
 * @param workingDirPath when non-null, solver state is kept in an {@code HdfsBackedLanczosState}
 *     rooted at this path; when null a plain {@code LanczosState} is used
 * @param numRows the int number of rows
 * @param numCols the int number of columns
 * @param isSymmetric true if the input matrix is symmetric
 * @param desiredRank the int desired rank of eigenvectors to produce
 * @return 0 on completion; failures surface as exceptions
 * @throws Exception if solving or serializing the output fails
 */
public int run(
    Path inputPath,
    Path outputPath,
    Path outputTmpPath,
    Path workingDirPath,
    int numRows,
    int numCols,
    boolean isSymmetric,
    int desiredRank)
    throws Exception {
  Configuration baseConf = getConf();
  if (baseConf == null) {
    baseConf = new Configuration();
  }

  DistributedRowMatrix matrix =
      new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
  // The matrix always receives its own copy of the configuration.
  matrix.setConf(new Configuration(baseConf));

  LanczosState state;
  if (workingDirPath != null) {
    HdfsBackedLanczosState hdfsState =
        new HdfsBackedLanczosState(
            matrix, desiredRank, getInitialVector(matrix), workingDirPath);
    hdfsState.setConf(matrix.getConf());
    state = hdfsState;
  } else {
    state = new LanczosState(matrix, desiredRank, getInitialVector(matrix));
  }

  solve(state, desiredRank, isSymmetric);
  serializeOutput(state, new Path(outputPath, RAW_EIGENVECTORS));
  return 0;
}
/**
 * Runs the distributed conjugate gradient solver through its command-line job on a random
 * 10x10 matrix and a random vector, then checks that multiplying the matrix by the solver's
 * output reproduces the vector to within {@code EPSILON}.
 */
@Test
public void testSolver() throws Exception {
  Configuration conf = getConfiguration();

  // Matrix first, then the vector: keeps the order of random generation unchanged.
  Path testData = getTestTempDirPath("testdata");
  DistributedRowMatrix a =
      new TestDistributedRowMatrix()
          .randomDistributedMatrix(10, 10, 10, 10, 10.0, true, testData.toString());
  a.setConf(conf);

  Path output = getTestTempFilePath("output");
  Path vectorPath = getTestTempFilePath("vector");
  Path tempPath = getTestTempDirPath("tmp");

  Vector b = randomVector(a.numCols(), 10.0);
  saveVector(conf, vectorPath, b);

  String[] args =
      new String[] {
        "-i", a.getRowPath().toString(),
        "-o", output.toString(),
        "--tempDir", tempPath.toString(),
        "--vector", vectorPath.toString(),
        "--numRows", "10",
        "--numCols", "10",
        "--symmetric", "true"
      };

  DistributedConjugateGradientSolver solver = new DistributedConjugateGradientSolver();
  ToolRunner.run(getConfiguration(), solver.job(), args);

  Vector x = loadVector(conf, output);
  Vector reconstructed = a.times(x);
  double residual = Math.sqrt(b.getDistanceSquared(reconstructed));
  assertEquals(0.0, residual, EPSILON);
}
/** * Progammatic invocation of run() * * @param eigenInput Output of LanczosSolver * @param corpusInput Input of LanczosSolver */ public void runJob( Configuration conf, Path eigenInput, Path corpusInput, Path output, boolean inMemory, double maxError, double minEigenValue, int maxEigens) throws IOException { // no need to handle command line arguments outPath = output; tmpOut = new Path(outPath, "tmp"); maxEigensToKeep = maxEigens; this.maxError = maxError; if (eigenInput != null && eigensToVerify == null) { prepareEigens(new Configuration(conf), eigenInput, inMemory); } DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1); c.setConf(new Configuration(conf)); corpus = c; eigenVerifier = new SimpleEigenVerifier(); // OrthonormalityVerifier orthoVerifier = new OrthonormalityVerifier(); // VectorIterable pairwiseInnerProducts = computePairwiseInnerProducts(); // FIXME: Why is the above vector computed if it is never used? Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens(); List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData); saveCleanEigens(conf, prunedEigenMeta); }
/**
 * Runs a {@link TransposeJob} over this matrix's rows and wraps the job output as a new
 * {@code DistributedRowMatrix} with the row and column counts swapped.
 *
 * <p>The output is written to a sibling of {@code rowPath} named {@code transpose-<n>}.
 *
 * @return a matrix backed by the job's output path
 * @throws IOException if the job fails
 */
public DistributedRowMatrix transpose() throws IOException {
  // NOTE(review): the suffix keeps only the low 8 bits of nanoTime (256 distinct names), so
  // repeated calls can pick the same output directory - confirm whether that is acceptable.
  Path outputPath = new Path(rowPath.getParent(), "transpose-" + (System.nanoTime() & 0xFF));
  Configuration initialConf = getConf() == null ? new Configuration() : getConf();
  Configuration conf =
      TransposeJob.buildTransposeJobConf(initialConf, rowPath, outputPath, numRows);
  JobClient.runJob(new JobConf(conf));
  DistributedRowMatrix m = new DistributedRowMatrix(outputPath, outputTmpPath, numCols, numRows);
  // Hand over the null-safe configuration resolved above rather than the raw field, so the
  // result never receives a null configuration.
  m.setConf(initialConf);
  return m;
}
/**
 * Refer to {@link ReconstructionErrJob} for explanation of the job. In short:
 *
 * <p>X = Y * Y2X
 *
 * <p>Err = (X - Xm) * C' - (Y - Ym)
 *
 * <p>The job is skipped when its result path already exists; results are then loaded from that
 * path either way.
 *
 * @param matrixY the input matrix Y
 * @param matrixY2X the in-memory matrix to generate X
 * @param matrixC the in-memory matrix to reconstruct Y
 * @param C_central the central version of matrixC; cast to {@code DenseMatrix} here
 * @param Ym the mean vector of Y
 * @param Xm = Ym * matrixY2X
 * @param ERR_SAMPLE_RATE sample rate forwarded unchanged to the job
 * @param conf the configuration
 * @param tmpPath the temporary path
 * @param id the unique id to name the files in HDFS
 * @return {@code reconstructionError / centralizedYNorm} as read after {@code loadResults}
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public double reconstructionErr(
    DistributedRowMatrix matrixY,
    DistributedRowMatrix matrixY2X,
    DistributedRowMatrix matrixC,
    Matrix C_central,
    Vector Ym,
    DenseVector Xm,
    final float ERR_SAMPLE_RATE,
    Configuration conf,
    Path tmpPath,
    String id)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Presumably Xm * C_central' is written into the buffer; Ym is then subtracted from it.
  DenseVector product = new DenseVector(C_central.numRows());
  PCACommon.vectorTimesMatrixTranspose(Xm, (DenseMatrix) C_central, product);
  DenseVector Zm = (DenseVector) product.minus(Ym);

  Path resPath = new Path(tmpPath, "reconstructionErr" + id);
  FileSystem fs = FileSystem.get(resPath.toUri(), conf);
  if (fs.exists(resPath)) {
    log.warn("---------- Skip ReconstructionErrJob - already exists: " + resPath);
  } else {
    Path ZmPath = PCACommon.toDistributedVector(Zm, tmpPath, "Zm" + id, conf);
    Path YmPath = PCACommon.toDistributedVector(Ym, tmpPath, "Ymforerr" + id, conf);
    run(
        conf,
        matrixY.getRowPath(),
        matrixY2X.getRowPath(),
        matrixY2X.numRows(),
        matrixY2X.numCols(),
        matrixC.getRowPath(),
        ZmPath.toString(),
        YmPath.toString(),
        resPath,
        ERR_SAMPLE_RATE);
  }

  loadResults(resPath, conf);

  log.info("0 is reconstruction err, 1 is Y norm (err/norm), 2 is Y-Ym norm (err/norm)");
  log.info("The error of 0 is " + reconstructionError);
  log.info("The error of 1 is " + yNorm + " (" + reconstructionError / yNorm + ")");
  log.info(
      "The error of 2 is "
          + centralizedYNorm
          + " ("
          + reconstructionError / centralizedYNorm
          + ")");
  return reconstructionError / centralizedYNorm;
}
/**
 * This implements matrix this.transpose().times(other), writing the product to a
 * caller-chosen location.
 *
 * @param other a DistributedRowMatrix; must have the same number of rows as this matrix
 * @param outPath path to write result to
 * @return a DistributedRowMatrix containing the product, configured with the job configuration
 * @throws CardinalityException if the row counts differ
 * @throws IOException if the multiplication job fails
 */
public DistributedRowMatrix times(DistributedRowMatrix other, Path outPath) throws IOException {
  int otherRows = other.numRows();
  if (numRows != otherRows) {
    throw new CardinalityException(numRows, otherRows);
  }

  Configuration baseConf = getConf();
  if (baseConf == null) {
    baseConf = new Configuration();
  }

  Configuration jobConf =
      MatrixMultiplicationJob.createMatrixMultiplyJobConf(
          baseConf, rowPath, other.rowPath, outPath, other.numCols);
  JobClient.runJob(new JobConf(jobConf));

  DistributedRowMatrix product =
      new DistributedRowMatrix(outPath, outputTmpPath, numCols, other.numCols());
  product.setConf(jobConf);
  return product;
}
/**
 * Factored-out LanczosSolver for the purpose of invoking it programmatically.
 *
 * <p>Wraps the input in a {@code DistributedRowMatrix} that gets its own copy of the
 * configuration, builds a fresh {@code LanczosState} for it, and delegates to the state-based
 * {@code runJob} overload.
 *
 * @param originalConfig configuration copied for the matrix and passed on to the overload
 * @param inputPath location of the input matrix rows
 * @param outputTmpPath temporary path handed to the matrix
 * @param numRows number of rows of the input matrix
 * @param numCols number of columns of the input matrix
 * @param isSymmetric forwarded to the overload
 * @param desiredRank rank used for the state and forwarded to the overload
 * @param outputEigenVectorPathString forwarded to the overload
 * @return whatever the state-based overload returns
 * @throws IOException if raised while running the job
 */
public LanczosState runJob(
    Configuration originalConfig,
    Path inputPath,
    Path outputTmpPath,
    int numRows,
    int numCols,
    boolean isSymmetric,
    int desiredRank,
    String outputEigenVectorPathString)
    throws IOException {
  Configuration matrixConf = new Configuration(originalConfig);
  DistributedRowMatrix input =
      new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
  input.setConf(matrixConf);

  LanczosState initialState = new LanczosState(input, desiredRank, getInitialVector(input));
  return runJob(
      originalConfig, initialState, desiredRank, isSymmetric, outputEigenVectorPathString);
}
/**
 * This implements matrix this.transpose().times(other).
 *
 * <p>The product is written to a {@code productWith-<n>} directory next to the temporary base
 * path.
 *
 * @param other a DistributedRowMatrix; must have the same number of rows as this matrix
 * @return a DistributedRowMatrix containing the product, configured with the job configuration
 * @throws CardinalityException if the row counts differ
 * @throws IOException if the multiplication job fails
 */
public DistributedRowMatrix times(DistributedRowMatrix other) throws IOException {
  int otherRows = other.numRows();
  if (numRows != otherRows) {
    throw new CardinalityException(numRows, otherRows);
  }

  long suffix = System.nanoTime() & 0xFF;
  Path outPath = new Path(outputTmpBasePath.getParent(), "productWith-" + suffix);

  Configuration baseConf = getConf();
  if (baseConf == null) {
    baseConf = new Configuration();
  }

  Configuration jobConf =
      MatrixMultiplicationJob.createMatrixMultiplyJobConf(
          baseConf, rowPath, other.rowPath, outPath, other.numCols);
  JobClient.runJob(new JobConf(jobConf));

  DistributedRowMatrix product =
      new DistributedRowMatrix(outPath, outputTmpPath, numCols, other.numCols());
  product.setConf(jobConf);
  return product;
}
/**
 * Sets {@code eigensToVerify} from the eigenvector matrix stored at {@code eigenInput}.
 *
 * <p>When {@code inMemory} is false the distributed matrix itself is used. Otherwise every row
 * is read into a list and wrapped in a {@code SparseRowMatrix} whose column count is taken from
 * the first vector.
 *
 * @param conf configuration applied to the distributed eigen matrix
 * @param eigenInput location of the eigenvectors
 * @param inMemory whether to copy the eigenvectors into an in-memory matrix
 * @throws IllegalStateException if {@code inMemory} is true and no eigenvector could be read
 */
private void prepareEigens(Configuration conf, Path eigenInput, boolean inMemory) {
  DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
  eigens.setConf(conf);
  if (!inMemory) {
    eigensToVerify = eigens;
    return;
  }

  List<Vector> eigenVectors = Lists.newArrayList();
  for (MatrixSlice slice : eigens) {
    eigenVectors.add(slice.vector());
  }
  // The column count comes from the first vector, so an empty input has to be rejected
  // explicitly instead of failing with a bare IndexOutOfBoundsException.
  if (eigenVectors.isEmpty()) {
    throw new IllegalStateException("No eigenvectors found at " + eigenInput);
  }
  eigensToVerify =
      new SparseRowMatrix(
          eigenVectors.size(),
          eigenVectors.get(0).size(),
          eigenVectors.toArray(new Vector[eigenVectors.size()]),
          true,
          true);
}
/** * Run the job with the given arguments * * @param corpusInput the corpus input Path * @param eigenInput the eigenvector input Path * @param output the output Path * @param tempOut temporary output Path * @param maxError a double representing the maximum error * @param minEigenValue a double representing the minimum eigenvalue * @param inMemory a boolean requesting in-memory preparation * @param conf the Configuration to use, or null if a default is ok (saves referencing * Configuration in calling classes unless needed) */ public int run( Path corpusInput, Path eigenInput, Path output, Path tempOut, double maxError, double minEigenValue, boolean inMemory, Configuration conf) throws IOException { this.outPath = output; this.tmpOut = tempOut; this.maxError = maxError; this.minEigenValue = minEigenValue; if (eigenInput != null && eigensToVerify == null) { prepareEigens(conf, eigenInput, inMemory); } DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tempOut, 1, 1); c.setConf(conf); corpus = c; // set up eigenverifier and orthoverifier TODO: allow multithreaded execution eigenVerifier = new SimpleEigenVerifier(); // we don't currently verify orthonormality here. // VectorIterable pairwiseInnerProducts = computePairwiseInnerProducts(); Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens(); List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData); saveCleanEigens(new Configuration(), prunedEigenMeta); return 0; }