Beispiel #1
0
 /**
  * Loads the side data this task works with from the locations named in the job configuration.
  *
  * <p>Two distributed matrices are converted to dense in-memory matrices ({@code matrixC} and
  * {@code matrixY2X}), two vectors are read into {@code zm} and {@code ym}, and the accumulator
  * vectors are allocated with one entry per row of {@code matrixC}.
  *
  * @param context the task context supplying the job {@link Configuration}
  * @throws IOException if reading the side data fails
  */
 @Override
 public void setup(Context context) throws IOException {
   Configuration conf = context.getConfiguration();
   Path cMemMatrixPath = new Path(conf.get(RECONSTRUCTIONMATRIX));
   Path dMemMatrixPath = new Path(conf.get(MATRIXY2X));
   Path zmPath = new Path(conf.get(ZMPATH));
   Path meanPath = new Path(conf.get(YMPATH));
   int inMemMatrixNumRows = conf.getInt(YCOLS, 0);
   int inMemMatrixNumCols = conf.getInt(XCOLS, 0);
   ERR_SAMPLE_RATE = conf.getFloat(ERRSAMPLERATE, 1);

   // Temporary files of the distributed matrices go next to the reconstruction matrix.
   Path tmpPath = cMemMatrixPath.getParent();

   // NOTE(review): both matrices are declared with the same YCOLS x XCOLS dimensions -
   // confirm this is intended.
   DistributedRowMatrix distMatrix =
       new DistributedRowMatrix(cMemMatrixPath, tmpPath, inMemMatrixNumRows, inMemMatrixNumCols);
   distMatrix.setConf(conf);
   matrixC = PCACommon.toDenseMatrix(distMatrix);

   distMatrix =
       new DistributedRowMatrix(dMemMatrixPath, tmpPath, inMemMatrixNumRows, inMemMatrixNumCols);
   distMatrix.setConf(conf);
   matrixY2X = PCACommon.toDenseMatrix(distMatrix);

   // A failed vector load must fail the task: the method already declares IOException, and
   // carrying on after merely printing the stack trace would leave zm/ym possibly unassigned.
   zm = PCACommon.toDenseVector(zmPath, conf);
   ym = PCACommon.toDenseVector(meanPath, conf);

   xiCt = new DenseVector(matrixC.numRows());
   sumOfErr = new DenseVector(matrixC.numRows());
   sumOfyi = new DenseVector(matrixC.numRows());
   sumOfyc = new DenseVector(matrixC.numRows());
 }
  /**
   * Programmatic entry point: verifies, prunes and saves eigenvectors without any command-line
   * argument handling.
   *
   * <p>Temporary data is placed under {@code output/tmp}. Eigenvectors are prepared from {@code
   * eigenInput} only when an input is supplied and none have been prepared yet.
   *
   * @param conf configuration copied for the matrices and used when saving the result
   * @param eigenInput Output of LanczosSolver
   * @param corpusInput Input of LanczosSolver
   * @param output the output Path
   * @param inMemory forwarded to the eigenvector preparation step
   * @param maxError stored as this job's maximum error
   * @param maxEigens stored as the maximum number of eigens to keep
   * @throws IOException declared for the verification and save steps
   */
  public void runJob(
      Configuration conf,
      Path eigenInput,
      Path corpusInput,
      Path output,
      boolean inMemory,
      double maxError,
      int maxEigens)
      throws IOException {
    this.outPath = output;
    this.tmpOut = new Path(output, "tmp");
    this.maxEigensToKeep = maxEigens;
    this.maxError = maxError;

    boolean eigensAlreadyPrepared = eigensToVerify != null;
    if (!eigensAlreadyPrepared && eigenInput != null) {
      prepareEigens(new Configuration(conf), eigenInput, inMemory);
    }

    // The corpus gets its own copy of the configuration.
    DistributedRowMatrix corpusMatrix = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    corpusMatrix.setConf(new Configuration(conf));
    this.corpus = corpusMatrix;

    this.eigenVerifier = new SimpleEigenVerifier();

    Map<MatrixSlice, EigenStatus> verified = verifyEigens();
    List<Map.Entry<MatrixSlice, EigenStatus>> kept = pruneEigens(verified);
    saveCleanEigens(conf, kept);
  }
  /**
   * Runs the solver and writes the raw eigenvectors below {@code outputPath}.
   *
   * @param inputPath the Path to the input corpus
   * @param outputPath the Path to the output; results go to its {@code RAW_EIGENVECTORS} child
   * @param outputTmpPath a Path to a temporary working directory
   * @param workingDirPath when non-null, solver state is kept in an {@code
   *     HdfsBackedLanczosState} rooted at this path; when null a plain {@code LanczosState} is
   *     used
   * @param numRows the int number of rows
   * @param numCols the int number of columns
   * @param isSymmetric true if the input matrix is symmetric
   * @param desiredRank the int desired rank of eigenvectors to produce
   * @return 0 once the output has been serialized
   */
  public int run(
      Path inputPath,
      Path outputPath,
      Path outputTmpPath,
      Path workingDirPath,
      int numRows,
      int numCols,
      boolean isSymmetric,
      int desiredRank)
      throws Exception {
    DistributedRowMatrix input =
        new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);

    // Work on a copy of this tool's configuration, or of a default one if none is set.
    Configuration baseConf = getConf();
    if (baseConf == null) {
      baseConf = new Configuration();
    }
    input.setConf(new Configuration(baseConf));

    final LanczosState state;
    if (workingDirPath != null) {
      HdfsBackedLanczosState hdfsState =
          new HdfsBackedLanczosState(input, desiredRank, getInitialVector(input), workingDirPath);
      hdfsState.setConf(input.getConf());
      state = hdfsState;
    } else {
      state = new LanczosState(input, desiredRank, getInitialVector(input));
    }

    solve(state, desiredRank, isSymmetric);
    serializeOutput(state, new Path(outputPath, RAW_EIGENVECTORS));
    return 0;
  }
  /**
   * Runs the distributed conjugate gradient solver through {@code ToolRunner} on a 10x10 matrix
   * produced by {@code TestDistributedRowMatrix.randomDistributedMatrix} and checks that
   * multiplying the matrix by the computed solution reproduces the right-hand side vector to
   * within {@code EPSILON}.
   */
  @Test
  public void testSolver() throws Exception {
    Configuration conf = getConfiguration();
    Path dataDir = getTestTempDirPath("testdata");
    DistributedRowMatrix a =
        new TestDistributedRowMatrix()
            .randomDistributedMatrix(10, 10, 10, 10, 10.0, true, dataDir.toString());
    a.setConf(conf);

    Path solutionPath = getTestTempFilePath("output");
    Path rhsPath = getTestTempFilePath("vector");
    Path scratchDir = getTestTempDirPath("tmp");

    // Right-hand side b of the system A x = b, persisted so the job can read it.
    Vector b = randomVector(a.numCols(), 10.0);
    saveVector(conf, rhsPath, b);

    String[] jobArgs =
        new String[] {
          "-i", a.getRowPath().toString(),
          "-o", solutionPath.toString(),
          "--tempDir", scratchDir.toString(),
          "--vector", rhsPath.toString(),
          "--numRows", "10",
          "--numCols", "10",
          "--symmetric", "true"
        };

    DistributedConjugateGradientSolver cgSolver = new DistributedConjugateGradientSolver();
    ToolRunner.run(getConfiguration(), cgSolver.job(), jobArgs);

    // A times the loaded solution should land on b again.
    Vector solution = loadVector(conf, solutionPath);
    Vector reconstructed = a.times(solution);
    double residual = Math.sqrt(b.getDistanceSquared(reconstructed));
    assertEquals(0.0, residual, EPSILON);
  }
  /**
   * Programmatic invocation of run(): verifies, prunes and saves eigenvectors without any
   * command-line argument handling.
   *
   * <p>Temporary data is placed under {@code output/tmp}. Eigenvectors are prepared from {@code
   * eigenInput} only when an input is supplied and none have been prepared yet.
   *
   * @param conf configuration copied for the matrices and used when saving the result
   * @param eigenInput Output of LanczosSolver
   * @param corpusInput Input of LanczosSolver
   * @param output the output Path
   * @param inMemory forwarded to the eigenvector preparation step
   * @param maxError stored as this job's maximum error
   * @param minEigenValue stored as this job's minimum eigenvalue
   * @param maxEigens stored as the maximum number of eigens to keep
   * @throws IOException declared for the verification and save steps
   */
  public void runJob(
      Configuration conf,
      Path eigenInput,
      Path corpusInput,
      Path output,
      boolean inMemory,
      double maxError,
      double minEigenValue,
      int maxEigens)
      throws IOException {
    // no need to handle command line arguments
    outPath = output;
    tmpOut = new Path(outPath, "tmp");
    maxEigensToKeep = maxEigens;
    this.maxError = maxError;
    // Store the caller's threshold; it used to be accepted and then silently dropped here,
    // unlike in run(...), which assigns it.
    this.minEigenValue = minEigenValue;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(new Configuration(conf), eigenInput, inMemory);
    }

    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    c.setConf(new Configuration(conf));
    corpus = c;

    eigenVerifier = new SimpleEigenVerifier();
    // Orthonormality of the eigenvectors is not verified here.

    Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();
    List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);
    saveCleanEigens(conf, prunedEigenMeta);
  }
 /**
  * Runs a transpose job over this matrix's rows and wraps the job output in a new matrix.
  *
  * <p>The output is written to a sibling of the row path named {@code transpose-} followed by
  * the low eight bits of {@link System#nanoTime()}.
  *
  * @return a DistributedRowMatrix over the job output, with row and column counts swapped and
  *     carrying this matrix's configuration
  * @throws IOException if the job fails
  */
 public DistributedRowMatrix transpose() throws IOException {
   long suffix = System.nanoTime() & 0xFF;
   Path transposedPath = new Path(rowPath.getParent(), "transpose-" + suffix);

   // Fall back to a default configuration when none has been set on this matrix.
   Configuration baseConf = getConf();
   if (baseConf == null) {
     baseConf = new Configuration();
   }

   Configuration jobConf =
       TransposeJob.buildTransposeJobConf(baseConf, rowPath, transposedPath, numRows);
   JobClient.runJob(new JobConf(jobConf));

   DistributedRowMatrix transposed =
       new DistributedRowMatrix(transposedPath, outputTmpPath, numCols, numRows);
   transposed.setConf(this.conf);
   return transposed;
 }
  /**
   * This implements matrix this.transpose().times(other), writing the product to a
   * caller-chosen location.
   *
   * @param other a DistributedRowMatrix with the same number of rows as this one
   * @param outPath path to write result to
   * @return a DistributedRowMatrix containing the product, configured with the job configuration
   * @throws CardinalityException if the row counts of the two matrices differ
   * @throws IOException if the multiplication job fails
   */
  public DistributedRowMatrix times(DistributedRowMatrix other, Path outPath) throws IOException {
    int otherRows = other.numRows();
    if (numRows != otherRows) {
      throw new CardinalityException(numRows, other.numRows());
    }

    // Fall back to a default configuration when none has been set on this matrix.
    Configuration baseConf = getConf();
    if (baseConf == null) {
      baseConf = new Configuration();
    }

    Configuration jobConf =
        MatrixMultiplicationJob.createMatrixMultiplyJobConf(
            baseConf, rowPath, other.rowPath, outPath, other.numCols);
    JobClient.runJob(new JobConf(jobConf));

    DistributedRowMatrix product =
        new DistributedRowMatrix(outPath, outputTmpPath, numCols, other.numCols());
    product.setConf(jobConf);
    return product;
  }
 /**
  * Factored-out LanczosSolver for the purpose of invoking it programmatically.
  *
  * <p>Wraps the input in a DistributedRowMatrix configured with a copy of {@code
  * originalConfig}, builds a fresh LanczosState for it and delegates to the state-based
  * {@code runJob} overload.
  *
  * @return whatever the state-based overload returns
  * @throws IOException declared for the delegated run
  */
 public LanczosState runJob(
     Configuration originalConfig,
     Path inputPath,
     Path outputTmpPath,
     int numRows,
     int numCols,
     boolean isSymmetric,
     int desiredRank,
     String outputEigenVectorPathString)
     throws IOException {
   DistributedRowMatrix input =
       new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
   Configuration matrixConf = new Configuration(originalConfig);
   input.setConf(matrixConf);

   LanczosState initialState = new LanczosState(input, desiredRank, getInitialVector(input));
   return runJob(
       originalConfig, initialState, desiredRank, isSymmetric, outputEigenVectorPathString);
 }
  /**
   * This implements matrix this.transpose().times(other), writing the product to a generated
   * location.
   *
   * <p>The output goes next to the temporary base path, in a directory named {@code
   * productWith-} followed by the low eight bits of {@link System#nanoTime()}.
   *
   * @param other a DistributedRowMatrix with the same number of rows as this one
   * @return a DistributedRowMatrix containing the product, configured with the job configuration
   * @throws CardinalityException if the row counts of the two matrices differ
   * @throws IOException if the multiplication job fails
   */
  public DistributedRowMatrix times(DistributedRowMatrix other) throws IOException {
    int otherRows = other.numRows();
    if (numRows != otherRows) {
      throw new CardinalityException(numRows, other.numRows());
    }

    long suffix = System.nanoTime() & 0xFF;
    Path productPath = new Path(outputTmpBasePath.getParent(), "productWith-" + suffix);

    // Fall back to a default configuration when none has been set on this matrix.
    Configuration baseConf = getConf();
    if (baseConf == null) {
      baseConf = new Configuration();
    }

    Configuration jobConf =
        MatrixMultiplicationJob.createMatrixMultiplyJobConf(
            baseConf, rowPath, other.rowPath, productPath, other.numCols);
    JobClient.runJob(new JobConf(jobConf));

    DistributedRowMatrix product =
        new DistributedRowMatrix(productPath, outputTmpPath, numCols, other.numCols());
    product.setConf(jobConf);
    return product;
  }
  /**
   * Sets {@code eigensToVerify} from the eigenvectors stored at {@code eigenInput}.
   *
   * <p>With {@code inMemory} set, every row of the distributed matrix is read and the rows are
   * collected into a {@link SparseRowMatrix}; otherwise the distributed matrix itself is used.
   *
   * @param conf configuration applied to the distributed eigenvector matrix
   * @param eigenInput location of the eigenvectors
   * @param inMemory whether to copy the eigenvectors into an in-memory matrix
   * @throws IllegalStateException if in-memory preparation is requested but the input has no rows
   */
  private void prepareEigens(Configuration conf, Path eigenInput, boolean inMemory) {
    DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
    eigens.setConf(conf);
    if (!inMemory) {
      eigensToVerify = eigens;
      return;
    }

    List<Vector> eigenVectors = Lists.newArrayList();
    for (MatrixSlice slice : eigens) {
      eigenVectors.add(slice.vector());
    }

    // The column count is taken from the first row, so an empty input cannot be turned into a
    // matrix; report that clearly instead of failing with a bare IndexOutOfBoundsException.
    if (eigenVectors.isEmpty()) {
      throw new IllegalStateException("No eigenvectors found in " + eigenInput);
    }

    eigensToVerify =
        new SparseRowMatrix(
            eigenVectors.size(),
            eigenVectors.get(0).size(),
            eigenVectors.toArray(new Vector[eigenVectors.size()]),
            true,
            true);
  }
  /**
   * Run the job with the given arguments: verifies the eigenvectors against the corpus, prunes
   * them and saves the result.
   *
   * <p>Eigenvectors are prepared from {@code eigenInput} only when an input is supplied and none
   * have been prepared yet. Orthonormality is not verified.
   *
   * @param corpusInput the corpus input Path
   * @param eigenInput the eigenvector input Path
   * @param output the output Path
   * @param tempOut temporary output Path
   * @param maxError a double representing the maximum error
   * @param minEigenValue a double representing the minimum eigenvalue
   * @param inMemory a boolean requesting in-memory preparation
   * @param conf the Configuration to use, or null if a default is ok (saves referencing
   *     Configuration in calling classes unless needed)
   * @return 0 once the cleaned eigenvectors have been saved
   * @throws IOException declared for the verification and save steps
   */
  public int run(
      Path corpusInput,
      Path eigenInput,
      Path output,
      Path tempOut,
      double maxError,
      double minEigenValue,
      boolean inMemory,
      Configuration conf)
      throws IOException {
    this.outPath = output;
    this.tmpOut = tempOut;
    this.maxError = maxError;
    this.minEigenValue = minEigenValue;

    // Honour the documented contract that conf may be null, and use the same configuration for
    // every step (including the final save) so caller-supplied settings are not dropped.
    Configuration effectiveConf = conf == null ? new Configuration() : conf;

    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(effectiveConf, eigenInput, inMemory);
    }
    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tempOut, 1, 1);
    c.setConf(effectiveConf);
    corpus = c;

    // set up eigenverifier and orthoverifier TODO: allow multithreaded execution
    eigenVerifier = new SimpleEigenVerifier();

    // we don't currently verify orthonormality here.

    Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();

    List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);

    saveCleanEigens(effectiveConf, prunedEigenMeta);
    return 0;
  }