Пример #1
0
  /**
   * Launches the reconstruction-error job unless its output is already present, then loads the
   * results, logs them, and returns the error relative to the centralized norm of Y. Refer to
   * {@link ReconstructionErrJob} for an explanation of the job itself. In short:
   *
   * <p>X = Y * Y2X
   *
   * <p>Err = (X - Xm) * C' - (Y - Ym)
   *
   * <p>The output location is {@code tmpPath/"reconstructionErr" + id}. When that path already
   * exists the job is skipped (a warning is logged) and the existing output is loaded instead.
   *
   * @param matrixY the input matrix Y
   * @param matrixY2X the in-memory matrix to generate X
   * @param matrixC the in-memory matrix to reconstruct Y
   * @param C_central the central version of matrixC; it is cast to {@code DenseMatrix} here
   * @param Ym the mean vector of Y
   * @param Xm the mean vector of X, i.e. Ym * matrixY2X
   * @param ERR_SAMPLE_RATE value forwarded unchanged to the job (presumably the sampling rate used
   *     when estimating the error; confirm against ReconstructionErrJob)
   * @param conf the configuration
   * @param tmpPath the temporary path
   * @param id the unique id to name the files in HDFS
   * @return {@code reconstructionError / centralizedYNorm}, read after {@code loadResults} has run
   *     (presumably that call is what populates both values; verify)
   * @throws IOException declared by this method; see the file-system and job calls below
   * @throws InterruptedException declared by this method
   * @throws ClassNotFoundException declared by this method
   */
  public double reconstructionErr(
      DistributedRowMatrix matrixY,
      DistributedRowMatrix matrixY2X,
      DistributedRowMatrix matrixC,
      Matrix C_central,
      Vector Ym,
      DenseVector Xm,
      final float ERR_SAMPLE_RATE,
      Configuration conf,
      Path tmpPath,
      String id)
      throws IOException, InterruptedException, ClassNotFoundException {
    // Zm = Xm * C_central' - Ym. This is computed up front, even if the job is skipped below.
    DenseVector xmTimesCentralT = new DenseVector(C_central.numRows());
    PCACommon.vectorTimesMatrixTranspose(Xm, (DenseMatrix) C_central, xmTimesCentralT);
    DenseVector zm = (DenseVector) xmTimesCentralT.minus(Ym);

    Path resultPath = new Path(tmpPath, "reconstructionErr" + id);
    boolean alreadyComputed = FileSystem.get(resultPath.toUri(), conf).exists(resultPath);
    if (alreadyComputed) {
      log.warn("---------- Skip ReconstructionErrJob - already exists: " + resultPath);
    } else {
      // The job reads both vectors from files, so write them under tmpPath first.
      Path zmFile = PCACommon.toDistributedVector(zm, tmpPath, "Zm" + id, conf);
      Path ymFile = PCACommon.toDistributedVector(Ym, tmpPath, "Ymforerr" + id, conf);
      run(
          conf,
          matrixY.getRowPath(),
          matrixY2X.getRowPath(),
          matrixY2X.numRows(),
          matrixY2X.numCols(),
          matrixC.getRowPath(),
          zmFile.toString(),
          ymFile.toString(),
          resultPath,
          ERR_SAMPLE_RATE);
    }
    loadResults(resultPath, conf);

    log.info("0 is reconstruction err, 1 is Y norm (err/norm), 2 is Y-Ym norm (err/norm)");
    log.info("The error of 0 is " + reconstructionError);
    log.info("The error of 1 is " + yNorm + " (" + (reconstructionError / yNorm) + ")");
    log.info(
        "The error of 2 is " + centralizedYNorm + " (" + (reconstructionError / centralizedYNorm) + ")");
    return reconstructionError / centralizedYNorm;
  }
  /**
   * End-to-end check of the distributed conjugate gradient solver driven through its command-line
   * job: a random matrix A and a random vector b are written out, the solver is run on them, and
   * the loaded solution x must satisfy A * x = b to within {@code EPSILON} (Euclidean distance).
   */
  @Test
  public void testSolver() throws Exception {
    Configuration conf = getConfiguration();

    // Random 10x10 test matrix; the `true` flag presumably requests a symmetric matrix, which
    // would match the --symmetric option passed to the solver below (confirm against the helper).
    Path dataDir = getTestTempDirPath("testdata");
    DistributedRowMatrix a =
        new TestDistributedRowMatrix()
            .randomDistributedMatrix(10, 10, 10, 10, 10.0, true, dataDir.toString());
    a.setConf(conf);

    Path solutionPath = getTestTempFilePath("output");
    Path rhsPath = getTestTempFilePath("vector");
    Path scratchDir = getTestTempDirPath("tmp");

    // Right-hand side b, saved so the job can read it via --vector.
    Vector b = randomVector(a.numCols(), 10.0);
    saveVector(conf, rhsPath, b);

    String inputArg = a.getRowPath().toString();
    String outputArg = solutionPath.toString();
    String tempDirArg = scratchDir.toString();
    String vectorArg = rhsPath.toString();
    String[] solverArgs =
        new String[] {
          "-i", inputArg,
          "-o", outputArg,
          "--tempDir", tempDirArg,
          "--vector", vectorArg,
          "--numRows", "10",
          "--numCols", "10",
          "--symmetric", "true"
        };

    DistributedConjugateGradientSolver solver = new DistributedConjugateGradientSolver();
    ToolRunner.run(getConfiguration(), solver.job(), solverArgs);

    // Verify the solution by substituting it back: ||b - A * x|| should be (numerically) zero.
    Vector x = loadVector(conf, solutionPath);
    double residualSquared = b.getDistanceSquared(a.times(x));
    assertEquals(0.0, Math.sqrt(residualSquared), EPSILON);
  }