Exemple #1
0
 /**
  * Loads the in-memory state this task needs from the job configuration.
  *
  * <p>Reads the matrix locations ({@code RECONSTRUCTIONMATRIX}, {@code MATRIXY2X}), the vector
  * locations ({@code ZMPATH}, {@code YMPATH}), the matrix dimensions ({@code YCOLS}, {@code XCOLS},
  * both defaulting to 0) and the error sample rate ({@code ERRSAMPLERATE}, defaulting to 1), then
  * materializes both matrices and both vectors in memory and allocates the work vectors, each
  * sized to the row count of {@code matrixC}.
  *
  * @param context the task context supplying the job configuration
  * @throws IOException if a matrix or vector cannot be read
  */
 @Override
 public void setup(Context context) throws IOException {
   Configuration conf = context.getConfiguration();
   Path cMemMatrixPath = new Path(conf.get(RECONSTRUCTIONMATRIX));
   Path dMemMatrixPath = new Path(conf.get(MATRIXY2X));
   Path zmPath = new Path(conf.get(ZMPATH));
   Path meanPath = new Path(conf.get(YMPATH));
   int inMemMatrixNumRows = conf.getInt(YCOLS, 0);
   int inMemMatrixNumCols = conf.getInt(XCOLS, 0);
   ERR_SAMPLE_RATE = conf.getFloat(ERRSAMPLERATE, 1);

   // Both matrices share the same temporary directory and the same declared dimensions.
   Path tmpPath = cMemMatrixPath.getParent();
   DistributedRowMatrix distMatrix =
       new DistributedRowMatrix(cMemMatrixPath, tmpPath, inMemMatrixNumRows, inMemMatrixNumCols);
   distMatrix.setConf(conf);
   matrixC = PCACommon.toDenseMatrix(distMatrix);
   distMatrix =
       new DistributedRowMatrix(dMemMatrixPath, tmpPath, inMemMatrixNumRows, inMemMatrixNumCols);
   distMatrix.setConf(conf);
   matrixY2X = PCACommon.toDenseMatrix(distMatrix);

   // Let read failures propagate: swallowing them here would leave zm/ym unset and only
   // surface later as an unrelated NullPointerException.
   zm = PCACommon.toDenseVector(zmPath, conf);
   ym = PCACommon.toDenseVector(meanPath, conf);

   xiCt = new DenseVector(matrixC.numRows());
   sumOfErr = new DenseVector(matrixC.numRows());
   sumOfyi = new DenseVector(matrixC.numRows());
   sumOfyc = new DenseVector(matrixC.numRows());
 }
  /**
   * Progammatic invocation of run()
   *
   * @param eigenInput Output of LanczosSolver
   * @param corpusInput Input of LanczosSolver
   */
  public void runJob(
      Configuration conf,
      Path eigenInput,
      Path corpusInput,
      Path output,
      boolean inMemory,
      double maxError,
      int maxEigens)
      throws IOException {
    // no need to handle command line arguments
    outPath = output;
    tmpOut = new Path(outPath, "tmp");
    maxEigensToKeep = maxEigens;
    this.maxError = maxError;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(new Configuration(conf), eigenInput, inMemory);
    }

    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    c.setConf(new Configuration(conf));
    corpus = c;

    eigenVerifier = new SimpleEigenVerifier();

    Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();
    List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);
    saveCleanEigens(conf, prunedEigenMeta);
  }
  /**
   * Run the solver to produce the raw eigenvectors
   *
   * @param inputPath the Path to the input corpus
   * @param outputPath the Path to the output
   * @param outputTmpPath a Path to a temporary working directory
   * @param numRows the int number of rows
   * @param numCols the int number of columns
   * @param isSymmetric true if the input matrix is symmetric
   * @param desiredRank the int desired rank of eigenvectors to produce
   * @return an int indicating success (0) or otherwise
   */
  public int run(
      Path inputPath,
      Path outputPath,
      Path outputTmpPath,
      Path workingDirPath,
      int numRows,
      int numCols,
      boolean isSymmetric,
      int desiredRank)
      throws Exception {
    DistributedRowMatrix matrix =
        new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.setConf(new Configuration(getConf() != null ? getConf() : new Configuration()));

    LanczosState state;
    if (workingDirPath == null) {
      state = new LanczosState(matrix, desiredRank, getInitialVector(matrix));
    } else {
      HdfsBackedLanczosState hState =
          new HdfsBackedLanczosState(matrix, desiredRank, getInitialVector(matrix), workingDirPath);
      hState.setConf(matrix.getConf());
      state = hState;
    }
    solve(state, desiredRank, isSymmetric);

    Path outputEigenVectorPath = new Path(outputPath, RAW_EIGENVECTORS);
    serializeOutput(state, outputEigenVectorPath);
    return 0;
  }
  @Test
  public void testSolver() throws Exception {
    Configuration conf = getConfiguration();
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix matrix =
        new TestDistributedRowMatrix()
            .randomDistributedMatrix(10, 10, 10, 10, 10.0, true, testData.toString());
    matrix.setConf(conf);
    Path output = getTestTempFilePath("output");
    Path vectorPath = getTestTempFilePath("vector");
    Path tempPath = getTestTempDirPath("tmp");

    Vector vector = randomVector(matrix.numCols(), 10.0);
    saveVector(conf, vectorPath, vector);

    String[] args = {
      "-i", matrix.getRowPath().toString(),
      "-o", output.toString(),
      "--tempDir", tempPath.toString(),
      "--vector", vectorPath.toString(),
      "--numRows", "10",
      "--numCols", "10",
      "--symmetric", "true"
    };

    DistributedConjugateGradientSolver solver = new DistributedConjugateGradientSolver();
    ToolRunner.run(getConfiguration(), solver.job(), args);

    Vector x = loadVector(conf, output);

    Vector solvedVector = matrix.times(x);
    double distance = Math.sqrt(vector.getDistanceSquared(solvedVector));
    assertEquals(0.0, distance, EPSILON);
  }
  /**
   * Progammatic invocation of run()
   *
   * @param eigenInput Output of LanczosSolver
   * @param corpusInput Input of LanczosSolver
   */
  public void runJob(
      Configuration conf,
      Path eigenInput,
      Path corpusInput,
      Path output,
      boolean inMemory,
      double maxError,
      double minEigenValue,
      int maxEigens)
      throws IOException {
    // no need to handle command line arguments
    outPath = output;
    tmpOut = new Path(outPath, "tmp");
    maxEigensToKeep = maxEigens;
    this.maxError = maxError;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(new Configuration(conf), eigenInput, inMemory);
    }

    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    c.setConf(new Configuration(conf));
    corpus = c;

    eigenVerifier = new SimpleEigenVerifier();
    // OrthonormalityVerifier orthoVerifier = new OrthonormalityVerifier();
    // VectorIterable pairwiseInnerProducts = computePairwiseInnerProducts();
    // FIXME: Why is the above vector computed if it is never used?

    Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();
    List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);
    saveCleanEigens(conf, prunedEigenMeta);
  }
 /**
  * Runs a transpose job over this matrix and returns the result as a new matrix.
  *
  * <p>The output is written to a fresh {@code transpose-<nanoTime>} directory next to the row
  * path.
  *
  * @return a matrix over the job output with this matrix's row and column counts swapped
  * @throws IOException if the transpose job fails
  */
 public DistributedRowMatrix transpose() throws IOException {
   // Use the full nanoTime value: masking it to 8 bits left only 256 possible directory names,
   // so repeated transposes could collide with an existing output directory.
   Path outputPath = new Path(rowPath.getParent(), "transpose-" + System.nanoTime());
   Configuration initialConf = getConf() == null ? new Configuration() : getConf();
   Configuration conf =
       TransposeJob.buildTransposeJobConf(initialConf, rowPath, outputPath, numRows);
   JobClient.runJob(new JobConf(conf));
   DistributedRowMatrix m = new DistributedRowMatrix(outputPath, outputTmpPath, numCols, numRows);
   // Hand over the resolved configuration so the result never receives a null one.
   m.setConf(initialConf);
   return m;
 }
Exemple #7
0
  /**
   * Refer to {@link ReconstructionErrJob} for explanation of the job. In short:
   *
   * <p>X = Y * Y2X
   *
   * <p>Err = (X - Xm) * C' - (Y - Ym)
   *
   * @param matrixY the input matrix Y
   * @param matrixY2X the in-memory matrix to generate X
   * @param matrixC the in-memory matrix to reconstruct Y
   * @param C_central the central version of matrixC
   * @param Ym the mean vector of Y
   * @param Xm = Ym * matrixY2X
   * @param conf the configuration
   * @param tmpPath the temporary path
   * @param id the unique id to name the files in HDFS
   * @return the norm-2 of the the Err matrix
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  public double reconstructionErr(
      DistributedRowMatrix matrixY,
      DistributedRowMatrix matrixY2X,
      DistributedRowMatrix matrixC,
      Matrix C_central,
      Vector Ym,
      DenseVector Xm,
      final float ERR_SAMPLE_RATE,
      Configuration conf,
      Path tmpPath,
      String id)
      throws IOException, InterruptedException, ClassNotFoundException {
    DenseVector Zm = new DenseVector(C_central.numRows());
    PCACommon.vectorTimesMatrixTranspose(Xm, (DenseMatrix) C_central, Zm);
    Zm = (DenseVector) Zm.minus(Ym);

    Path resPath = new Path(tmpPath, "reconstructionErr" + id);
    FileSystem fs = FileSystem.get(resPath.toUri(), conf);
    if (!fs.exists(resPath)) {
      Path ZmPath = PCACommon.toDistributedVector(Zm, tmpPath, "Zm" + id, conf);
      Path YmPath = PCACommon.toDistributedVector(Ym, tmpPath, "Ymforerr" + id, conf);
      run(
          conf,
          matrixY.getRowPath(),
          matrixY2X.getRowPath(),
          matrixY2X.numRows(),
          matrixY2X.numCols(),
          matrixC.getRowPath(),
          ZmPath.toString(),
          YmPath.toString(),
          resPath,
          ERR_SAMPLE_RATE);
    } else {
      log.warn("---------- Skip ReconstructionErrJob - already exists: " + resPath);
    }
    loadResults(resPath, conf);

    log.info("0 is reconstruction err, 1 is Y norm (err/norm), " + "2 is Y-Ym norm (err/norm)");
    log.info("The error of 0 is " + reconstructionError);
    log.info("The error of 1 is " + yNorm + " (" + reconstructionError / yNorm + ")");
    log.info(
        "The error of 2 is "
            + centralizedYNorm
            + " ("
            + reconstructionError / centralizedYNorm
            + ")");
    double error = reconstructionError / centralizedYNorm;
    return error;
  }
  /**
   * This implements matrix this.transpose().times(other)
   *
   * @param other a DistributedRowMatrix
   * @param outPath path to write result to
   * @return a DistributedRowMatrix containing the product
   */
  public DistributedRowMatrix times(DistributedRowMatrix other, Path outPath) throws IOException {
    if (numRows != other.numRows()) {
      throw new CardinalityException(numRows, other.numRows());
    }

    Configuration initialConf = getConf() == null ? new Configuration() : getConf();
    Configuration conf =
        MatrixMultiplicationJob.createMatrixMultiplyJobConf(
            initialConf, rowPath, other.rowPath, outPath, other.numCols);
    JobClient.runJob(new JobConf(conf));
    DistributedRowMatrix out =
        new DistributedRowMatrix(outPath, outputTmpPath, numCols, other.numCols());
    out.setConf(conf);
    return out;
  }
 /**
  * Factored-out LanczosSolver for the purpose of invoking it programmatically.
  *
  * <p>Wraps the input as a {@link DistributedRowMatrix} configured with a copy of {@code
  * originalConfig}, builds a fresh {@link LanczosState} from it, and delegates to the
  * state-based {@code runJob} overload.
  *
  * @param originalConfig the configuration to copy for the matrix and to pass on unchanged
  * @param inputPath the path of the input matrix rows
  * @param outputTmpPath the temporary path handed to the matrix
  * @param numRows the number of rows of the input matrix
  * @param numCols the number of columns of the input matrix
  * @param isSymmetric whether the input matrix is symmetric
  * @param desiredRank the desired rank
  * @param outputEigenVectorPathString forwarded to the state-based overload
  * @return whatever the state-based overload returns
  * @throws IOException if the delegated run fails
  */
 public LanczosState runJob(
     Configuration originalConfig,
     Path inputPath,
     Path outputTmpPath,
     int numRows,
     int numCols,
     boolean isSymmetric,
     int desiredRank,
     String outputEigenVectorPathString)
     throws IOException {
   DistributedRowMatrix corpusMatrix =
       new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
   corpusMatrix.setConf(new Configuration(originalConfig));

   LanczosState freshState =
       new LanczosState(corpusMatrix, desiredRank, getInitialVector(corpusMatrix));
   return runJob(
       originalConfig, freshState, desiredRank, isSymmetric, outputEigenVectorPathString);
 }
  /**
   * This implements matrix this.transpose().times(other)
   *
   * @param other a DistributedRowMatrix
   * @return a DistributedRowMatrix containing the product
   */
  public DistributedRowMatrix times(DistributedRowMatrix other) throws IOException {
    if (numRows != other.numRows()) {
      throw new CardinalityException(numRows, other.numRows());
    }
    Path outPath =
        new Path(outputTmpBasePath.getParent(), "productWith-" + (System.nanoTime() & 0xFF));

    Configuration initialConf = getConf() == null ? new Configuration() : getConf();
    Configuration conf =
        MatrixMultiplicationJob.createMatrixMultiplyJobConf(
            initialConf, rowPath, other.rowPath, outPath, other.numCols);
    JobClient.runJob(new JobConf(conf));
    DistributedRowMatrix out =
        new DistributedRowMatrix(outPath, outputTmpPath, numCols, other.numCols());
    out.setConf(conf);
    return out;
  }
  private void prepareEigens(Configuration conf, Path eigenInput, boolean inMemory) {
    DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
    eigens.setConf(conf);
    if (inMemory) {
      List<Vector> eigenVectors = Lists.newArrayList();
      for (MatrixSlice slice : eigens) {
        eigenVectors.add(slice.vector());
      }
      eigensToVerify =
          new SparseRowMatrix(
              eigenVectors.size(),
              eigenVectors.get(0).size(),
              eigenVectors.toArray(new Vector[eigenVectors.size()]),
              true,
              true);

    } else {
      eigensToVerify = eigens;
    }
  }
  /**
   * Run the job with the given arguments
   *
   * @param corpusInput the corpus input Path
   * @param eigenInput the eigenvector input Path
   * @param output the output Path
   * @param tempOut temporary output Path
   * @param maxError a double representing the maximum error
   * @param minEigenValue a double representing the minimum eigenvalue
   * @param inMemory a boolean requesting in-memory preparation
   * @param conf the Configuration to use, or null if a default is ok (saves referencing
   *     Configuration in calling classes unless needed)
   */
  public int run(
      Path corpusInput,
      Path eigenInput,
      Path output,
      Path tempOut,
      double maxError,
      double minEigenValue,
      boolean inMemory,
      Configuration conf)
      throws IOException {
    this.outPath = output;
    this.tmpOut = tempOut;
    this.maxError = maxError;
    this.minEigenValue = minEigenValue;

    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(conf, eigenInput, inMemory);
    }
    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tempOut, 1, 1);
    c.setConf(conf);
    corpus = c;

    // set up eigenverifier and orthoverifier TODO: allow multithreaded execution

    eigenVerifier = new SimpleEigenVerifier();

    // we don't currently verify orthonormality here.
    // VectorIterable pairwiseInnerProducts = computePairwiseInnerProducts();

    Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();

    List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);

    saveCleanEigens(new Configuration(), prunedEigenMeta);
    return 0;
  }