public void releaseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException {
   MatrixObject mo = getMatrixObject(varName);
   if (mo.getGPUObject() == null || !mo.getGPUObject().isAllocated()) {
     throw new DMLRuntimeException("No output is allocated on GPU");
   }
   mo.getGPUObject().releaseOutput();
 }
 public MatrixObject getDenseMatrixOutputForGPUInstruction(String varName)
     throws DMLRuntimeException {
   MatrixObject mo = allocateGPUMatrixObject(varName);
   mo.getGPUObject().acquireDeviceModifyDense();
    mo.getMatrixCharacteristics().setNonZeros(-1); // mark nnz as unknown, since the output will be modified on the device
   return mo;
 }
  /**
    * Pin a given list of variables, i.e., disable the "clean up" flag in the corresponding matrix
    * objects, so that the cached data inside these objects is not cleared and the corresponding
    * HDFS files are not deleted (through rmvar instructions).
   *
   * <p>This is necessary for: function input variables, parfor result variables, parfor shared
   * inputs that are passed to functions.
   *
   * <p>The function returns the OLD "clean up" state of matrix objects.
   *
   * @param varList variable list
   * @return map of old cleanup state of matrix objects
   */
  public HashMap<String, Boolean> pinVariables(ArrayList<String> varList) {
     // 2-pass approach since multiple vars might refer to the same matrix object: record all
     // old cleanup flags first, then pin; otherwise an aliased variable would observe the
     // already-pinned state as its old state
    HashMap<String, Boolean> varsState = new HashMap<String, Boolean>();

    // step 1) get current information
    for (String var : varList) {
      Data dat = _variables.get(var);
      if (dat instanceof MatrixObject) {
        MatrixObject mo = (MatrixObject) dat;
        varsState.put(var, mo.isCleanupEnabled());
        // System.out.println("pre-pin "+var+" ("+mo.isCleanupEnabled()+")");
      }
    }

    // step 2) pin variables
    for (String var : varList) {
      Data dat = _variables.get(var);
      if (dat instanceof MatrixObject) {
        MatrixObject mo = (MatrixObject) dat;
        mo.enableCleanup(false);
        // System.out.println("pin "+var);
      }
    }

    return varsState;
  }
   /**
    * @param c candidate hop
    * @param vars local variable map
    * @return literal op if the aggregate was replaced, null otherwise
    * @throws DMLRuntimeException if DMLRuntimeException occurs
    */
  private static LiteralOp replaceLiteralFullUnaryAggregate(Hop c, LocalVariableMap vars)
      throws DMLRuntimeException {
    LiteralOp ret = null;

    // full unary aggregate w/ matrix less than 10^6 cells
    if (c instanceof AggUnaryOp
        && isReplaceableUnaryAggregate((AggUnaryOp) c)
        && c.getInput().get(0) instanceof DataOp
        && vars.keySet().contains(c.getInput().get(0).getName())) {
      Hop data = c.getInput().get(0);
      MatrixObject mo = (MatrixObject) vars.get(data.getName());

      // get the dimension information from the matrix object because the hop
      // dimensions might not have been updated during recompile
      if (mo.getNumRows() * mo.getNumColumns() < REPLACE_LITERALS_MAX_MATRIX_SIZE) {
        MatrixBlock mBlock = mo.acquireRead();
        double value = replaceUnaryAggregate((AggUnaryOp) c, mBlock);
        mo.release();

        // literal substitution (always double)
        ret = new LiteralOp(value);
      }
    }

    return ret;
  }
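  /*
   * Illustrative DML-level effect of the rewrite above (example script, not from this
   * codebase): for a small matrix X bound in vars, a size-limited aggregate such as
   *
   *   while( sum(X) > 0 ) { ... }
   *
   * gets sum(X) folded into a double LiteralOp during recompilation, so the predicate
   * can subsequently be evaluated as a constant.
   */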
   /**
    * @param hop literal, nrow/ncol, or scalar variable hop
    * @param vars local variable map
    * @return extracted long value
    * @throws DMLRuntimeException if DMLRuntimeException occurs
    */
  private static long getIntValueDataLiteral(Hop hop, LocalVariableMap vars)
      throws DMLRuntimeException {
    long value = -1;

    try {
      if (hop instanceof LiteralOp) {
        value = HopRewriteUtils.getIntValue((LiteralOp) hop);
      } else if (hop instanceof UnaryOp && ((UnaryOp) hop).getOp() == OpOp1.NROW) {
        // get the dimension information from the matrix object because the hop
        // dimensions might not have been updated during recompile
        MatrixObject mo = (MatrixObject) vars.get(hop.getInput().get(0).getName());
        value = mo.getNumRows();
      } else if (hop instanceof UnaryOp && ((UnaryOp) hop).getOp() == OpOp1.NCOL) {
        // get the dimension information from the matrix object because the hop
        // dimensions might not have been updated during recompile
        MatrixObject mo = (MatrixObject) vars.get(hop.getInput().get(0).getName());
        value = mo.getNumColumns();
      } else {
        ScalarObject sdat = (ScalarObject) vars.get(hop.getName());
        value = sdat.getLongValue();
      }
    } catch (HopsException ex) {
      throw new DMLRuntimeException("Failed to get int value for literal replacement", ex);
    }

    return value;
  }
   /**
    * @param c candidate hop
    * @param vars local variable map
    * @return literal op if the cast was replaced, null otherwise
    * @throws DMLRuntimeException if DMLRuntimeException occurs
    */
  private static LiteralOp replaceLiteralDataTypeCastMatrixRead(Hop c, LocalVariableMap vars)
      throws DMLRuntimeException {
    LiteralOp ret = null;

    // as.scalar/matrix read - literal replacement
    if (c instanceof UnaryOp
        && ((UnaryOp) c).getOp() == OpOp1.CAST_AS_SCALAR
        && c.getInput().get(0) instanceof DataOp
        && c.getInput().get(0).getDataType() == DataType.MATRIX) {
      Data dat = vars.get(c.getInput().get(0).getName());
      if (dat != null) // required for selective constant propagation
      {
        // cast as scalar (see VariableCPInstruction)
        MatrixObject mo = (MatrixObject) dat;
        MatrixBlock mBlock = mo.acquireRead();
        if (mBlock.getNumRows() != 1 || mBlock.getNumColumns() != 1)
          throw new DMLRuntimeException(
              "Dimension mismatch - unable to cast matrix of dimension ("
                  + mBlock.getNumRows()
                  + " x "
                  + mBlock.getNumColumns()
                  + ") to scalar.");
        double value = mBlock.getValue(0, 0);
        mo.release();

        // literal substitution (always double)
        ret = new LiteralOp(value);
      }
    }

    return ret;
  }
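  /*
   * Illustrative DML-level effect of the rewrite above (example script, not from this
   * codebase): for a 1x1 matrix Z bound in vars,
   *
   *   s = as.scalar(Z);
   *
   * is replaced by a double LiteralOp holding the single cell value of Z.
   */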
 /**
  * Allocates the {@link GPUObject} for a given LOPS variable (e.g., _mVar3)
  *
  * @param varName variable name
  * @return matrix object
  * @throws DMLRuntimeException if DMLRuntimeException occurs
  */
 public MatrixObject allocateGPUMatrixObject(String varName) throws DMLRuntimeException {
   MatrixObject mo = getMatrixObject(varName);
   if (mo.getGPUObject() == null) {
     mo.setGPUObject(GPUContext.createGPUObject(mo));
   }
   return mo;
 }
  /**
   * Function to perform LU decomposition on a given matrix.
   *
   * @param in matrix object
   * @return array of matrix blocks
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  private static MatrixBlock[] computeLU(MatrixObject in) throws DMLRuntimeException {
    if (in.getNumRows() != in.getNumColumns()) {
      throw new DMLRuntimeException(
          "LU Decomposition can only be done on a square matrix. Input matrix is rectangular (rows="
              + in.getNumRows()
              + ", cols="
              + in.getNumColumns()
              + ")");
    }

    Array2DRowRealMatrix matrixInput = DataConverter.convertToArray2DRowRealMatrix(in);

    // Perform LUP decomposition
    LUDecomposition ludecompose = new LUDecomposition(matrixInput);
    RealMatrix P = ludecompose.getP();
    RealMatrix L = ludecompose.getL();
    RealMatrix U = ludecompose.getU();

    // Read the results into native format
    MatrixBlock mbP = DataConverter.convertToMatrixBlock(P.getData());
    MatrixBlock mbL = DataConverter.convertToMatrixBlock(L.getData());
    MatrixBlock mbU = DataConverter.convertToMatrixBlock(U.getData());

    return new MatrixBlock[] {mbP, mbL, mbU};
  }
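  /*
   * Sanity check of the contract above (hypothetical helper, not part of the original
   * class): commons-math's LUDecomposition yields a row-permutation matrix P with
   * P x A = L x U, so the returned blocks satisfy mbP %*% in = mbL %*% mbU.
   */
  private static boolean isValidLU(RealMatrix A, RealMatrix P, RealMatrix L, RealMatrix U) {
    // compare P*A and L*U via the norm of their entry-wise difference
    return P.multiply(A).subtract(L.multiply(U)).getNorm() < 1e-9;
  }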
  public void setMatrixOutput(String varName, MatrixBlock outputData, UpdateType flag)
      throws DMLRuntimeException {
    if (flag.isInPlace()) {
      // modify metadata to carry update status
      MatrixObject mo = getMatrixObject(varName);
      mo.setUpdateType(flag);
    }

    // default case
    setMatrixOutput(varName, outputData);
  }
   /**
    * @param c candidate hop
    * @param vars local variable map
    * @return literal op if the aggregate was replaced, null otherwise
    * @throws DMLRuntimeException if DMLRuntimeException occurs
    */
  private static LiteralOp replaceLiteralFullUnaryAggregateRightIndexing(
      Hop c, LocalVariableMap vars) throws DMLRuntimeException {
    LiteralOp ret = null;

    // full unary aggregate w/ indexed matrix less than 10^6 cells
    if (c instanceof AggUnaryOp
        && isReplaceableUnaryAggregate((AggUnaryOp) c)
        && c.getInput().get(0) instanceof IndexingOp
        && c.getInput().get(0).getInput().get(0) instanceof DataOp) {
      IndexingOp rix = (IndexingOp) c.getInput().get(0);
      Hop data = rix.getInput().get(0);
      Hop rl = rix.getInput().get(1);
      Hop ru = rix.getInput().get(2);
      Hop cl = rix.getInput().get(3);
      Hop cu = rix.getInput().get(4);

      if (data instanceof DataOp
          && vars.keySet().contains(data.getName())
          && isIntValueDataLiteral(rl, vars)
          && isIntValueDataLiteral(ru, vars)
          && isIntValueDataLiteral(cl, vars)
          && isIntValueDataLiteral(cu, vars)) {
        long rlval = getIntValueDataLiteral(rl, vars);
        long ruval = getIntValueDataLiteral(ru, vars);
        long clval = getIntValueDataLiteral(cl, vars);
        long cuval = getIntValueDataLiteral(cu, vars);

        MatrixObject mo = (MatrixObject) vars.get(data.getName());

        // get the dimension information from the matrix object because the hop
        // dimensions might not have been updated during recompile
        if (mo.getNumRows() * mo.getNumColumns() < REPLACE_LITERALS_MAX_MATRIX_SIZE) {
          MatrixBlock mBlock = mo.acquireRead();
          MatrixBlock mBlock2 =
              mBlock.sliceOperations(
                  (int) (rlval - 1),
                  (int) (ruval - 1),
                  (int) (clval - 1),
                  (int) (cuval - 1),
                  new MatrixBlock());
          double value = replaceUnaryAggregate((AggUnaryOp) c, mBlock2);
          mo.release();

          // literal substitution (always double)
          ret = new LiteralOp(value);
        }
      }
    }

    return ret;
  }
 /**
  * Unpin a given list of variables, i.e., restore their "cleanup" status to the values specified
  * by <code>varsState</code>.
  *
  * <p>Typical usage: <code>
  *    oldStatus = pinVariables(varList);
  *    ...
  *    unpinVariables(varList, oldStatus);
  *    </code> i.e., a call to unpinVariables() is preceded by pinVariables().
  *
  * @param varList variable list
  * @param varsState variable state
  */
 public void unpinVariables(ArrayList<String> varList, HashMap<String, Boolean> varsState) {
   for (String var : varList) {
     // System.out.println("unpin "+var+" ("+varsState.get(var)+")");
     Data dat = _variables.get(var);
     if (dat instanceof MatrixObject) ((MatrixObject) dat).enableCleanup(varsState.get(var));
   }
 }
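  /*
   * Typical pin/unpin lifecycle around function or parfor execution (minimal sketch;
   * `ec` and `resultVars` are assumed to exist in the caller):
   *
   *   HashMap<String, Boolean> oldState = ec.pinVariables(resultVars);
   *   try {
   *     // ... execute body; intermediate rmvar instructions may run here ...
   *   } finally {
   *     ec.unpinVariables(resultVars, oldState); // restore old cleanup flags
   *   }
   */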
   /**
    * @param pfid parfor program block id
    * @param itervar iteration variable name
    * @param matrixvar name of the partitioned input matrix
    * @param program serialized runtime program
    * @param resultFile HDFS file name for result variables
    * @param input partitioned input matrix object
    * @param ec execution context
    * @param dpf data partition format
    * @param oi output info of the partitioned matrix
    * @param tSparseCol sparse column flag passed to the parfor workers
    * @param enableCPCaching true if CP caching enabled
    * @param numReducers number of reducers used to group partitioned blocks
    * @return remote parfor job return object
    * @throws DMLRuntimeException if DMLRuntimeException occurs
    * @throws DMLUnsupportedOperationException if DMLUnsupportedOperationException occurs
    */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String itervar,
      String matrixvar,
      String program,
      String resultFile,
      MatrixObject input,
      ExecutionContext ec,
      PDataPartitionFormat dpf,
      OutputInfo oi,
      boolean tSparseCol, // config params
      boolean enableCPCaching,
      int numReducers) // opt params
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    String jobname = "ParFor-DPESP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    SparkExecutionContext sec = (SparkExecutionContext) ec;
    JavaSparkContext sc = sec.getSparkContext();

    // prepare input parameters
    MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData();
    MatrixCharacteristics mc = md.getMatrixCharacteristics();
    InputInfo ii = InputInfo.BinaryBlockInputInfo;

    // initialize accumulators for tasks/iterations
    Accumulator<Integer> aTasks = sc.accumulator(0);
    Accumulator<Integer> aIters = sc.accumulator(0);

    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
    DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf);
    RemoteDPParForSparkWorker efun =
        new RemoteDPParForSparkWorker(
            program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);
    List<Tuple2<Long, String>> out =
        in.flatMapToPair(dpfun) // partition the input blocks
            .groupByKey(numReducers) // group partition blocks
            .mapPartitionsToPair(efun) // execute parfor tasks, incl cleanup
            .collect(); // get output handles

    // de-serialize results
    LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
    int numTasks = aTasks.value(); // get accumulator value
    int numIters = aIters.value(); // get accumulator value

    // create output symbol table entries
    RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
      Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }

    return ret;
  }
   /**
    * @param c candidate hop
    * @param vars local variable map
    * @return literal op if the indexed read was replaced, null otherwise
    * @throws DMLRuntimeException if DMLRuntimeException occurs
    */
  private static LiteralOp replaceLiteralValueTypeCastRightIndexing(Hop c, LocalVariableMap vars)
      throws DMLRuntimeException {
    LiteralOp ret = null;

    // as.scalar/right indexing w/ literals/vars and matrix less than 10^6 cells
    if (c instanceof UnaryOp
        && ((UnaryOp) c).getOp() == OpOp1.CAST_AS_SCALAR
        && c.getInput().get(0) instanceof IndexingOp
        && c.getInput().get(0).getDataType() == DataType.MATRIX) {
      IndexingOp rix = (IndexingOp) c.getInput().get(0);
      Hop data = rix.getInput().get(0);
      Hop rl = rix.getInput().get(1);
      Hop ru = rix.getInput().get(2);
      Hop cl = rix.getInput().get(3);
      Hop cu = rix.getInput().get(4);
      if (rix.dimsKnown()
          && rix.getDim1() == 1
          && rix.getDim2() == 1
          && data instanceof DataOp
          && vars.keySet().contains(data.getName())
          && isIntValueDataLiteral(rl, vars)
          && isIntValueDataLiteral(ru, vars)
          && isIntValueDataLiteral(cl, vars)
          && isIntValueDataLiteral(cu, vars)) {
        long rlval = getIntValueDataLiteral(rl, vars);
        long clval = getIntValueDataLiteral(cl, vars);

        MatrixObject mo = (MatrixObject) vars.get(data.getName());

        // get the dimension information from the matrix object because the hop
        // dimensions might not have been updated during recompile
        if (mo.getNumRows() * mo.getNumColumns() < REPLACE_LITERALS_MAX_MATRIX_SIZE) {
          MatrixBlock mBlock = mo.acquireRead();
          double value = mBlock.getValue((int) rlval - 1, (int) clval - 1);
          mo.release();

          // literal substitution (always double)
          ret = new LiteralOp(value);
        }
      }
    }

    return ret;
  }
  /**
   * Function to perform Eigen decomposition on a given matrix. Input must be a symmetric matrix.
   *
   * @param in matrix object
   * @return array of matrix blocks
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  private static MatrixBlock[] computeEigen(MatrixObject in) throws DMLRuntimeException {
    if (in.getNumRows() != in.getNumColumns()) {
      throw new DMLRuntimeException(
          "Eigen Decomposition can only be done on a square matrix. Input matrix is rectangular (rows="
              + in.getNumRows()
              + ", cols="
              + in.getNumColumns()
              + ")");
    }

    Array2DRowRealMatrix matrixInput = DataConverter.convertToArray2DRowRealMatrix(in);

    EigenDecomposition eigendecompose = new EigenDecomposition(matrixInput);
    RealMatrix eVectorsMatrix = eigendecompose.getV();
    double[][] eVectors = eVectorsMatrix.getData();
    double[] eValues = eigendecompose.getRealEigenvalues();

     // Sort the eigenvalues (and vectors) in increasing order (to be compatible w/ LAPACK.DSYEVR())
    int n = eValues.length;
    for (int i = 0; i < n; i++) {
      int k = i;
      double p = eValues[i];
      for (int j = i + 1; j < n; j++) {
        if (eValues[j] < p) {
          k = j;
          p = eValues[j];
        }
      }
      if (k != i) {
        eValues[k] = eValues[i];
        eValues[i] = p;
        for (int j = 0; j < n; j++) {
          p = eVectors[j][i];
          eVectors[j][i] = eVectors[j][k];
          eVectors[j][k] = p;
        }
      }
    }

    MatrixBlock mbValues = DataConverter.convertToMatrixBlock(eValues, true);
    MatrixBlock mbVectors = DataConverter.convertToMatrixBlock(eVectors);

    return new MatrixBlock[] {mbValues, mbVectors};
  }
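  /*
   * Post-condition of the selection sort above (hypothetical helper, for illustration
   * only): eigenvalues come out in ascending order, and the column swaps keep
   * eigenvector column j paired with eigenvalue j.
   */
  private static boolean isAscending(double[] values) {
    for (int i = 1; i < values.length; i++)
      if (values[i] < values[i - 1]) return false;
    return true;
  }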
 public void cleanupMatrixObject(MatrixObject mo) throws DMLRuntimeException {
   try {
     if (mo.isCleanupEnabled()) {
       // compute ref count only if matrix cleanup actually necessary
       if (!getVariables().hasReferences(mo)) {
         // clean cached data
         mo.clearData();
         if (mo.isHDFSFileExists()) {
           // clean hdfs data
           String fpath = mo.getFileName();
           if (fpath != null) {
             MapReduceTool.deleteFileIfExistOnHDFS(fpath);
             MapReduceTool.deleteFileIfExistOnHDFS(fpath + ".mtd");
           }
         }
       }
     }
   } catch (Exception ex) {
     throw new DMLRuntimeException(ex);
   }
 }
 public MatrixObject getMatrixInputForGPUInstruction(String varName) throws DMLRuntimeException {
   MatrixObject mo = getMatrixObject(varName);
   if (mo == null) {
     throw new DMLRuntimeException("No matrix object available for variable:" + varName);
   }
   if (mo.getGPUObject() == null) {
     mo.setGPUObject(GPUContext.createGPUObject(mo));
   }
    boolean acquired = false;
    if (!mo.getGPUObject().isAllocated()) {
      // no device copy yet: pin the host matrix block so it can be transferred
      mo.acquireRead();
      acquired = true;
    }
    mo.getGPUObject().acquireDeviceRead();
    if (acquired) {
      // device copy exists now; release the host-side read lock
      mo.release();
    }
   return mo;
 }
  public void setMetaData(String varName, long nrows, long ncols) throws DMLRuntimeException {
    MatrixObject mo = getMatrixObject(varName);
    if (mo.getNumRows() == nrows && mo.getNumColumns() == ncols) return;

    MetaData oldMetaData = mo.getMetaData();
     if (!(oldMetaData instanceof MatrixFormatMetaData))
      throw new DMLRuntimeException("Metadata not available");

     MatrixCharacteristics mc =
         new MatrixCharacteristics(
             nrows, ncols, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock());
    mo.setMetaData(
        new MatrixFormatMetaData(
            mc,
            ((MatrixFormatMetaData) oldMetaData).getOutputInfo(),
            ((MatrixFormatMetaData) oldMetaData).getInputInfo()));
  }
 public static boolean isInSparseFormat(MatrixObject mo) {
   if (mo.getGPUObject() != null && mo.getGPUObject().isAllocated())
     return mo.getGPUObject().isInSparseFormat();
   return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
 }
  public static void matmult(
      MatrixObject left1,
      MatrixObject right1,
      MatrixObject output,
      boolean isLeftTransposed1,
      boolean isRightTransposed1)
      throws DMLRuntimeException {
    if (isInSparseFormat(left1) || isInSparseFormat(right1)) {
      throw new DMLRuntimeException("Sparse GPU matrix multiplication is not implemented");
    }

    // Since CuBLAS expects inputs in column-major format,
    // reverse the order of matrix-multiplication and take care of dimension mismatch.
    MatrixObject left = right1;
    MatrixObject right = left1;
    boolean isLeftTransposed = isRightTransposed1;
    boolean isRightTransposed = isLeftTransposed1;

    char transa = isLeftTransposed ? 'T' : 'N';
    char transb = isRightTransposed ? 'T' : 'N';
    // Note: the dimensions are swapped
    int m = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
    int n = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
    int k = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
    int k1 = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
    if (k != k1) throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);

    if (m == -1 || n == -1 || k == -1) throw new DMLRuntimeException("Incorrect dimensions");

    double alpha = 1;
    double beta = 0;

    int lda = isLeftTransposed ? k : m;
    int ldb = isRightTransposed ? n : k;
    int ldc = m;

    if (!left.getGPUObject().isAllocated() || !right.getGPUObject().isAllocated())
      throw new DMLRuntimeException(
          "One of the inputs is not allocated: "
              + left.getGPUObject().isAllocated()
              + " "
              + right.getGPUObject().isAllocated());
    if (!output.getGPUObject().isAllocated())
      throw new DMLRuntimeException(
          "Output is not allocated: " + output.getGPUObject().isAllocated());

    Pointer A = ((JCudaObject) left.getGPUObject()).jcudaPointer;
    Pointer B = ((JCudaObject) right.getGPUObject()).jcudaPointer;
    Pointer C = ((JCudaObject) output.getGPUObject()).jcudaPointer;

    JCublas.cublasDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
  }
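  /*
   * Why the operands are swapped above: cuBLAS assumes column-major storage while the
   * dense host data is row-major, and a row-major matrix reinterpreted as column-major
   * is its transpose. Computing C = A %*% B is therefore issued as C^T = B^T %*% A^T
   * (dgemm with the inputs swapped); the column-major result C^T is bit-identical to
   * row-major C, so no explicit transposes or copies are needed. A minimal CPU-side
   * sketch of the same identity (hypothetical helper, for illustration only):
   */
  private static double[] multiplyRowMajorViaSwap(double[] a, double[] b, int m, int k, int n) {
    // a is row-major m x k, b is row-major k x n; result is row-major m x n
    double[] c = new double[m * n];
    for (int col = 0; col < m; col++) // columns of C^T, i.e., rows of C
      for (int row = 0; row < n; row++) { // rows of C^T, i.e., columns of C
        double sum = 0;
        for (int i = 0; i < k; i++)
          sum += b[i * n + row] * a[col * k + i]; // B(i,row) * A(col,i)
        c[col * n + row] = sum; // column-major C^T == row-major C
      }
    return c;
  }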
   /**
    * @param pfid parfor program block id
    * @param program serialized runtime program
    * @param taskFile HDFS file name of the task file
    * @param resultFile HDFS file name for result variables
    * @param colocatedDPMatrixObj colocated data-partitioned matrix object, or null
    * @param enableCPCaching true if CP caching enabled
    * @param numMappers number of map tasks
    * @param replication replication factor for result files
    * @param max_retry maximum number of retries per map task (currently not set at job level)
    * @param minMem minimum JVM memory in bytes, 0 to keep the cluster default
    * @param jvmReuse true if JVM reuse across tasks enabled
    * @return remote parfor job return object
    * @throws DMLRuntimeException if DMLRuntimeException occurs
    */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String program,
      String taskFile,
      String resultFile,
      MatrixObject colocatedDPMatrixObj, // inputs
      boolean enableCPCaching,
      int numMappers,
      int replication,
      int max_retry,
      long minMem,
      boolean jvmReuse) // opt params
      throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    // maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
      /////
      // configure the MR job

      // set arbitrary CP program blocks that will be executed in the mapper
      MRJobConfiguration.setProgramBlocks(job, program);

      // enable/disable caching
      MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

      // set mappers, reducers, combiners
      job.setMapperClass(RemoteParWorkerMapper.class); // map-only

      // set input format (one split per row, NLineInputFormat default N=1)
      if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
        job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
        MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
        MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
        MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
        MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
        MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
      } else // default case
      {
        job.setInputFormat(NLineInputFormat.class);
      }

      // set the input path and output path
      FileInputFormat.setInputPaths(job, new Path(taskFile));

      // set output format
      job.setOutputFormat(SequenceFileOutputFormat.class);

      // set output path
      MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
      FileOutputFormat.setOutputPath(job, new Path(resultFile));

      // set the output key, value schema
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(Text.class);
      job.setOutputKeyClass(LongWritable.class);
      job.setOutputValueClass(Text.class);

      //////
      // set optimization parameters

      // set the number of mappers and reducers
      job.setNumMapTasks(numMappers); // numMappers
      job.setNumReduceTasks(0);
      // job.setInt("mapred.map.tasks.maximum", 1); //system property
      // job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
      // job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

      // use FLEX scheduler configuration properties
      if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
        job.setInt("flex.priority", 0); // highest

        job.setInt("flex.map.min", 0);
        job.setInt("flex.map.max", numMappers);
        job.setInt("flex.reduce.min", 0);
        job.setInt("flex.reduce.max", numMappers);
      }

      // set jvm memory size (if required)
      String memKey = "mapred.child.java.opts";
      if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
        InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
        LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
      }

      // disable automatic task timeouts and speculative task execution
      job.setInt("mapred.task.timeout", 0);
      job.setMapSpeculativeExecution(false);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // enable JVM reuse (multiple tasks per JVM)
      if (jvmReuse) job.setNumTasksToExecutePerJvm(-1); // unlimited

      // set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
      job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); // 8MB

      // set the replication factor for the results
      job.setInt("dfs.replication", replication);

      // set the max number of retries per map task
      //  disabled job-level configuration to respect cluster configuration
      //  note: this property refers to hadoop2 and hence never had an effect on mr1
      // job.setInt("mapreduce.map.maxattempts", max_retry);

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      /////
      // execute the MR job
      RunningJob runjob = JobClient.runJob(job);

      // Process different counters
      Statistics.incrementNoOfExecutedMRJobs();
      Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
      int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
      int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
      if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
        Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
        Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
        Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
        Group cgroup =
            runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
        CacheStatistics.incrementMemHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
        CacheStatistics.incrementFSBuffHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
        CacheStatistics.incrementFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
        CacheStatistics.incrementHDFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
        CacheStatistics.incrementFSBuffWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
        CacheStatistics.incrementFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
        CacheStatistics.incrementHDFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
        CacheStatistics.incrementAcquireRTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
        CacheStatistics.incrementAcquireMTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
        CacheStatistics.incrementReleaseTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
        CacheStatistics.incrementExportTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
      }

      // read all files of result variables and prepare for return
      LocalVariableMap[] results = readResultFile(job, resultFile);

      ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
      throw new DMLRuntimeException(ex);
    } finally {
      // remove created files
      try {
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
      } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
      }
    }

    if (DMLScript.STATISTICS) {
      long t1 = System.nanoTime();
      Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
  }
 /**
  * Pins a matrix variable into memory and returns the internal matrix block.
  *
  * @param varName variable name
  * @return matrix block
  * @throws DMLRuntimeException if DMLRuntimeException occurs
  */
 public MatrixBlock getMatrixInput(String varName) throws DMLRuntimeException {
   MatrixObject mo = getMatrixObject(varName);
   return mo.acquireRead();
 }
 public void setMatrixOutput(String varName, MatrixBlock outputData) throws DMLRuntimeException {
   MatrixObject mo = getMatrixObject(varName);
   mo.acquireModify(outputData);
   mo.release();
   setVariable(varName, mo);
 }
 public void releaseMatrixInputForGPUInstruction(String varName) throws DMLRuntimeException {
   MatrixObject mo = getMatrixObject(varName);
   mo.getGPUObject().releaseInput();
 }
 /**
  * Unpins a currently pinned matrix variable.
  *
  * @param varName variable name
  * @throws DMLRuntimeException if DMLRuntimeException occurs
  */
 public void releaseMatrixInput(String varName) throws DMLRuntimeException {
   MatrixObject mo = getMatrixObject(varName);
   mo.release();
 }
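  /*
   * Typical acquire/release pattern in a CP instruction (minimal sketch; the variable
   * names and the compute step are assumptions):
   *
   *   MatrixBlock in = ec.getMatrixInput("A");  // pins A in memory
   *   MatrixBlock out = compute(in);            // hypothetical operation
   *   ec.releaseMatrixInput("A");               // unpins A
   *   ec.setMatrixOutput("B", out);             // binds the result to B
   */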