@Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // get rdd and broadcast inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> inX =
        sec.getBinaryBlockRDDHandleForVariable(_input1.getName());
    PartitionedBroadcastMatrix inV = sec.getBroadcastForVariable(_input2.getName());

    // execute mapmmchain (guaranteed to have single output block)
    MatrixBlock out = null;
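    // XtXv computes t(X) %*% (X %*% v); XtwXv computes t(X) %*% (w * (X %*% v)) with the
    // additional weight vector w provided as a second broadcast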
    if (_chainType == ChainType.XtXv) {
      RDDMapMMChainFunction fmmc = new RDDMapMMChainFunction(inV);
      JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = inX.mapValues(fmmc);
      out = RDDAggregateUtils.sumStable(tmp);
    } else { // ChainType.XtwXv
      PartitionedBroadcastMatrix inW = sec.getBroadcastForVariable(_input3.getName());
      RDDMapMMChainFunction2 fmmc = new RDDMapMMChainFunction2(inV, inW);
      JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = inX.mapToPair(fmmc);
      out = RDDAggregateUtils.sumStable(tmp);
    }

    // put output block into symbol table (no lineage because single block)
    // this also includes implicit maintenance of matrix characteristics
    sec.setMatrixOutput(_output.getName(), out);
  }
  /**
   * @param pfid
   * @param itervar
   * @param matrixvar
   * @param program
   * @param resultFile
   * @param input
   * @param ec
   * @param dpf
   * @param oi
   * @param tSparseCol
   * @param enableCPCaching
   * @param numReducers
   * @return
   * @throws DMLRuntimeException
   * @throws DMLUnsupportedOperationException
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String itervar,
      String matrixvar,
      String program,
      String resultFile,
      MatrixObject input,
      ExecutionContext ec,
      PDataPartitionFormat dpf,
      OutputInfo oi,
      boolean tSparseCol, // config params
      boolean enableCPCaching,
      int numReducers) // opt params
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    String jobname = "ParFor-DPESP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    SparkExecutionContext sec = (SparkExecutionContext) ec;
    JavaSparkContext sc = sec.getSparkContext();

    // prepare input parameters
    MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData();
    MatrixCharacteristics mc = md.getMatrixCharacteristics();
    InputInfo ii = InputInfo.BinaryBlockInputInfo;

    // initialize accumulators for tasks/iterations
    Accumulator<Integer> aTasks = sc.accumulator(0);
    Accumulator<Integer> aIters = sc.accumulator(0);

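    // build and execute the data-partition-execute pipeline: each input block is split into
    // partition fragments, fragments are grouped into partitions, and each partition is
    // processed as a parfor task by the remote workers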
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
    DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf);
    RemoteDPParForSparkWorker efun =
        new RemoteDPParForSparkWorker(
            program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);
    List<Tuple2<Long, String>> out =
        in.flatMapToPair(dpfun) // partition the input blocks
            .groupByKey(numReducers) // group partition blocks
            .mapPartitionsToPair(efun) // execute parfor tasks, incl cleanup
            .collect(); // get output handles

    // de-serialize results
    LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
    int numTasks = aTasks.value(); // get accumulator value
    int numIters = aIters.value(); // get accumulator value

    // create output symbol table entries
    RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
      Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }

    return ret;
  }
  /**
   * @param schema
   * @param containsID
   * @param dense
   * @param unknownDims
   */
  private void testDataFrameConversion(
      ValueType[] schema, boolean containsID, boolean dense, boolean unknownDims) {
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    RUNTIME_PLATFORM oldPlatform = DMLScript.rtplatform;

    SparkExecutionContext sec = null;

    try {
      DMLScript.USE_LOCAL_SPARK_CONFIG = true;
      DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

      // generate input data and setup metadata
      int cols = schema.length + colsVector - 1;
      double sparsity = dense ? sparsity1 : sparsity2;
      double[][] A = TestUtils.round(getRandomMatrix(rows1, cols, -10, 1000, sparsity, 2373));
      MatrixBlock mbA = DataConverter.convertToMatrixBlock(A);
      int blksz = ConfigurationManager.getBlocksize();
      MatrixCharacteristics mc1 =
          new MatrixCharacteristics(rows1, cols, blksz, blksz, mbA.getNonZeros());
      MatrixCharacteristics mc2 =
          unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mc1);

      // setup spark context
      sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
      JavaSparkContext sc = sec.getSparkContext();
      SQLContext sqlctx = new SQLContext(sc);

      // create input data frame
      DataFrame df = createDataFrame(sqlctx, mbA, containsID, schema);

      // dataframe - frame conversion
      JavaPairRDD<Long, FrameBlock> out =
          FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc2, containsID);

      // get output frame block
      FrameBlock fbB =
          SparkExecutionContext.toFrameBlock(
              out, UtilFunctions.nCopies(cols, ValueType.DOUBLE), rows1, cols);

      // compare frame blocks
      MatrixBlock mbB = DataConverter.convertToMatrixBlock(fbB);
      double[][] B = DataConverter.convertToDoubleMatrix(mbB);
      TestUtils.compareMatrices(A, B, rows1, cols, eps);
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    } finally {
      if (sec != null) sec.close(); // guard against failures before context creation
      DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
      DMLScript.rtplatform = oldPlatform;
    }
  }
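  /**
   * Executes a broadcast-based permutation matrix multiply (pmm): the permutation input is
   * broadcast according to the cache type and applied to each block of the rdd input via a
   * flatmap (a single input block can contribute to multiple output row blocks), followed by a
   * stable aggregation of partial output blocks by key.
   */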
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    String rddVar = (_type == CacheType.LEFT) ? input2.getName() : input1.getName();
    String bcastVar = (_type == CacheType.LEFT) ? input1.getName() : input2.getName();
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(output.getName());
    long rlen =
        sec.getScalarInput(_nrow.getName(), _nrow.getValueType(), _nrow.isLiteral()).getLongValue();

    // get inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
    PartitionedBroadcastMatrix in2 = sec.getBroadcastForVariable(bcastVar);

    // execute pmm instruction
    JavaPairRDD<MatrixIndexes, MatrixBlock> out =
        in1.flatMapToPair(new RDDPMMFunction(_type, in2, rlen, mc.getRowsPerBlock()));
    out = RDDAggregateUtils.sumByKeyStable(out);

    // put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);

    // update output statistics if not inferred
    updateBinaryMMOutputMatrixCharacteristics(sec, false);
  }
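  /**
   * Applies a unary builtin operator blockwise to the input rdd via mapValues, which preserves
   * the block indexes and partitioning of the input.
   */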
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in =
        sec.getBinaryBlockRDDHandleForVariable(input1.getName());

    // execute unary builtin operation
    UnaryOperator uop = (UnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new RDDMatrixBuiltinUnaryOp(uop));

    // set output RDD
    updateUnaryOutputMatrixCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
  }
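  /**
   * Executes a broadcast-based matrix multiply (mapmm): the broadcast-side input is multiplied
   * with every block of the rdd input, choosing among flatmap, map-partitions, and map execution
   * depending on the broadcast shape and input partitioning, followed by the configured
   * aggregation (single block, multi block, or none).
   */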
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    String rddVar = (_type == CacheType.LEFT) ? input2.getName() : input1.getName();
    String bcastVar = (_type == CacheType.LEFT) ? input1.getName() : input2.getName();
    MatrixCharacteristics mcRdd = sec.getMatrixCharacteristics(rddVar);
    MatrixCharacteristics mcBc = sec.getMatrixCharacteristics(bcastVar);

    // get inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
    PartitionedBroadcastMatrix in2 = sec.getBroadcastForVariable(bcastVar);

    // empty input block filter
    if (!_outputEmpty) in1 = in1.filter(new FilterNonEmptyBlocksFunction());

    // execute mapmult instruction
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
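    // a flatmap is required whenever a single input block can produce multiple output blocks
    // (i.e., the broadcast spans multiple blocks); otherwise a 1:1 map over blocks or
    // partitions suffices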
    if (requiresFlatMapFunction(_type, mcBc))
      out = in1.flatMapToPair(new RDDFlatMapMMFunction(_type, in2));
    else if (preservesPartitioning(mcRdd, _type))
      out = in1.mapPartitionsToPair(new RDDMapMMPartitionFunction(_type, in2), true);
    else out = in1.mapToPair(new RDDMapMMFunction(_type, in2));

    // empty output block filter
    if (!_outputEmpty) out = out.filter(new FilterNonEmptyBlocksFunction());

    // perform aggregation if necessary and put output into symbol table
    if (_aggtype == SparkAggType.SINGLE_BLOCK) {
      MatrixBlock out2 = RDDAggregateUtils.sumStable(out);

      // put output block into symbol table (no lineage because single block)
      // this also includes implicit maintenance of matrix characteristics
      sec.setMatrixOutput(output.getName(), out2);
    } else { // MULTI_BLOCK or NONE
      if (_aggtype == SparkAggType.MULTI_BLOCK) out = RDDAggregateUtils.sumByKeyStable(out);

      // put output RDD handle into symbol table
      sec.setRDDHandleForVariable(output.getName(), out);
      sec.addLineageRDD(output.getName(), rddVar);
      sec.addLineageBroadcast(output.getName(), bcastVar);

      // update output statistics if not inferred
      updateBinaryMMOutputMatrixCharacteristics(sec, true);
    }
  }
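  /**
   * Computes blockwise partial cumulative aggregates (with implicit drop of corrections) over
   * the input rdd and merges the partial result blocks by key.
   */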
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    long rlen = mc.getRows();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in =
        sec.getBinaryBlockRDDHandleForVariable(input1.getName());

    // execute unary aggregate (w/ implicit drop correction)
    AggregateUnaryOperator auop = (AggregateUnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes, MatrixBlock> out =
        in.mapToPair(new RDDCumAggFunction(auop, rlen, brlen, bclen));
    out = RDDAggregateUtils.mergeByKey(out);

    // put output handle in symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
  }
  /**
   * Checks if there is sufficient memory locally (twice the size of the second matrix, for the
   * original and the sorted data) and remotely (the size of the second matrix, i.e., the sorted
   * data).
   *
   * @return true if there is sufficient memory
   */
  private boolean isUnaryAggregateOuterSPRewriteApplicable() {
    boolean ret = false;
    Hop input = getInput().get(0);

    if (input instanceof BinaryOp && ((BinaryOp) input).isOuterVectorOperator()) {
      // note: both cases (partitioned matrix, and sorted double array) require the broadcast
      // to fit twice into the local memory budget. Also, the memory constraint only needs to
      // take the rhs into account because the output is guaranteed to be an aggregate of <=16KB

      Hop right = input.getInput().get(1);

      double size =
          right.dimsKnown()
              ? OptimizerUtils.estimateSize(right.getDim1(), right.getDim2()) // dims known, exact estimate
              : right.getOutputMemEstimate(); // dims unknown, worst-case estimate

      if (_op == AggOp.MAXINDEX || _op == AggOp.MININDEX) {
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        // basic requirement: the broadcast needs to fit twice into both the remote broadcast
        // memory budget and the local memory budget because we have to create a partitioned
        // broadcast in memory and hand it over to the spark context as an in-memory object
        ret = (2 * size < memBudgetExec && 2 * size < memBudgetLocal);
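        // e.g. (hypothetical sizes): a dense 10^6 x 1 rhs of doubles is estimated at ~8MB, so
        // with a 512MB broadcast budget and a 1GB local budget, 2 * 8MB fits into both budgets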

      } else {
        ret = OptimizerUtils.checkSparkBroadcastMemoryBudget(size);
      }
    }

    return ret;
  }