  /**
   * Executes this instruction on Spark: the RDD-side input is flat-mapped through an
   * {@code RDDPMMFunction} that is given the broadcast input, the result is aggregated with
   * {@code RDDAggregateUtils.sumByKeyStable}, and the resulting RDD is registered under the output
   * variable together with its lineage.
   *
   * @param ec execution context; cast unconditionally to {@code SparkExecutionContext}
   * @throws DMLRuntimeException declared by the signature
   * @throws DMLUnsupportedOperationException declared by the signature
   */
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // For CacheType.LEFT the first input is the broadcast and the second is the RDD;
    // for any other cache type the roles are swapped.
    String rddVar = (_type == CacheType.LEFT) ? input2.getName() : input1.getName();
    String bcastVar = (_type == CacheType.LEFT) ? input1.getName() : input2.getName();
    // Characteristics of the OUTPUT variable; only its rows-per-block value is used below.
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(output.getName());
    // _nrow is resolved as a scalar and read as a long.
    long rlen =
        sec.getScalarInput(_nrow.getName(), _nrow.getValueType(), _nrow.isLiteral()).getLongValue();

    // get inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
    PartitionedBroadcastMatrix in2 = sec.getBroadcastForVariable(bcastVar);

    // execute pmm instruction
    JavaPairRDD<MatrixIndexes, MatrixBlock> out =
        in1.flatMapToPair(new RDDPMMFunction(_type, in2, rlen, mc.getRowsPerBlock()));
    // combine all blocks that were emitted under the same key
    out = RDDAggregateUtils.sumByKeyStable(out);

    // put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    // the output depends on both the RDD input and the broadcast input
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);

    // update output statistics if not inferred
    updateBinaryMMOutputMatrixCharacteristics(sec, false);
  /**
   * Infers the output characteristics from the statistics of the first input.
   *
   * @param memo memo table queried for the statistics of the first input hop
   * @return a three-element array {@code {1, cols, -1}} for {@code Direction.Col} when the input
   *     columns are known, {@code {rows, 1, -1}} for {@code Direction.Row} when the input rows are
   *     known, and {@code null} in every other case (the third entry is presumably the number of
   *     non-zeros, with -1 meaning unknown; confirm against the caller)
   */
  protected long[] inferOutputCharacteristics(MemoTable memo) {
    long[] ret = null;

    Hop input = getInput().get(0);
    MatrixCharacteristics mc = memo.getAllInputStats(input);
    // The aggregated dimension collapses to 1; the other dimension is taken from the input.
    if (_direction == Direction.Col && mc.colsKnown()) ret = new long[] {1, mc.getCols(), -1};
    else if (_direction == Direction.Row && mc.rowsKnown()) ret = new long[] {mc.getRows(), 1, -1};

    return ret;
  /**
   * Tightens the output memory estimate using the non-zero count of the first input when this
   * hop's dimensions are known.
   *
   * @param memo memo table queried for the statistics of the first input hop
   */
  public void computeMemEstimate(MemoTable memo) {
    // default behavior
    // NOTE(review): no statement for the default behavior is visible in this excerpt.

    // try to infer via worst-case input statistics (for the case of dims known
    // but nnz initially unknown)
    MatrixCharacteristics mcM1 = memo.getAllInputStats(getInput().get(0));
    if (dimsKnown() && mcM1.getNonZeros() >= 0) {
      long lnnz = mcM1.getNonZeros(); // worst-case output nnz
      double lOutMemEst = computeOutputMemEstimate(_dim1, _dim2, lnnz);
      // only ever lower the current estimate, never raise it
      if (lOutMemEst < _outputMemEstimate) {
        _outputMemEstimate = lOutMemEst;
        // the total estimate is recomputed after the output estimate changed
        _memEstimate = getInputOutputSize();
  /**
   * Infers the output characteristics of this unary operation from the statistics of its first
   * input.
   *
   * @param memo memo table queried for the statistics of the first input hop
   * @return {@code {rows, cols, nnz}} of the input for the operations listed below as sparsity
   *     preserving, {@code {rows, cols, -1}} for any other operation, and {@code null} when the
   *     input dimensions are not known
   */
  protected long[] inferOutputCharacteristics(MemoTable memo) {
    long[] ret = null;

    Hop input = getInput().get(0);
    MatrixCharacteristics mc = memo.getAllInputStats(input);
    if (mc.dimsKnown()) {
      // For these operations the input non-zero count is carried over to the output.
      if (_op == OpOp1.ABS
          || _op == OpOp1.COS
          || _op == OpOp1.SIN
          || _op == OpOp1.TAN
          || _op == OpOp1.ACOS
          || _op == OpOp1.ASIN
          || _op == OpOp1.ATAN
          || _op == OpOp1.SQRT
          || _op == OpOp1.ROUND
          || _op == OpOp1.SPROP
          || _op == OpOp1.SELP) // sparsity preserving
        ret = new long[] {mc.getRows(), mc.getCols(), mc.getNonZeros()};
      // all other operations: same dimensions, third entry reported as -1
      } else ret = new long[] {mc.getRows(), mc.getCols(), -1};

    return ret;
  /**
   * Infers worst-case output characteristics of this indexing operation from the statistics of
   * the indexed matrix (input 0) and the four bound expressions (inputs 1 to 4).
   *
   * @param memo memo table queried for the statistics of the indexed matrix
   * @return {@code {rows, cols, nnz}}, or {@code null} when no statistics are available for the
   *     indexed matrix
   */
  protected long[] inferOutputCharacteristics(MemoTable memo) {
    long[] ret = null;

    Hop input = getInput().get(0); // original matrix
    MatrixCharacteristics mc = memo.getAllInputStats(input);
    if (mc != null) {
      // The nnz bound is capped at the number of cells; -1 when the dimensions are unknown.
      // NOTE(review): rows * cols is a long product and could overflow for extremely large
      // dimensions - confirm whether such sizes can reach this point.
      long lnnz = mc.dimsKnown() ? Math.min(mc.getRows() * mc.getCols(), mc.getNonZeros()) : -1;
      // worst-case is input size, but dense
      ret = new long[] {mc.getRows(), mc.getCols(), lnnz};

      // exploit column/row indexing information
      if (_rowLowerEqualsUpper) ret[0] = 1;
      if (_colLowerEqualsUpper) ret[1] = 1;

      // infer tight block indexing size
      // inputs 1..4 are read as row-lower, row-upper, column-lower, column-upper
      Hop rl = getInput().get(1);
      Hop ru = getInput().get(2);
      Hop cl = getInput().get(3);
      Hop cu = getInput().get(4);
      // a recognized block indexing expression overrides the dimension chosen above
      if (isBlockIndexingExpression(rl, ru)) ret[0] = getBlockIndexingExpressionSize(rl, ru);
      if (isBlockIndexingExpression(cl, cu)) ret[1] = getBlockIndexingExpressionSize(cl, cu);

    return ret;
  /**
   * Executes this instruction on Spark: every input block is mapped through an
   * {@code RDDCumAggFunction}, the mapped blocks are combined with
   * {@code RDDAggregateUtils.mergeByKey}, and the resulting RDD is registered under the output
   * variable with the first input as its lineage.
   *
   * @param ec execution context; cast unconditionally to {@code SparkExecutionContext}
   * @throws DMLRuntimeException declared by the signature
   * @throws DMLUnsupportedOperationException declared by the signature
   */
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // row count and block sizes are taken from the first input
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    long rlen = mc.getRows();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    // get input
    // NOTE(review): the right-hand side of this assignment is not visible in this excerpt.
    JavaPairRDD<MatrixIndexes, MatrixBlock> in =

    // execute unary aggregate (w/ implicit drop correction)
    AggregateUnaryOperator auop = (AggregateUnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes, MatrixBlock> out =
        in.mapToPair(new RDDCumAggFunction(auop, rlen, brlen, bclen));
    // combine all blocks that were emitted under the same key
    out = RDDAggregateUtils.mergeByKey(out);

    // put output handle in symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
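  * @param type cache type; LEFT is checked against the row dimension, RIGHT against the column dimension
  * @param mcBc matrix characteristics whose dimensions are compared with their block sizes
  * @return true if the checked dimension is larger than a single block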
 // Returns true when the dimension selected by the cache type spans more than one block:
 // rows > rows-per-block for LEFT, columns > columns-per-block for RIGHT.
 // Any other cache type yields false.
 private static boolean requiresFlatMapFunction(CacheType type, MatrixCharacteristics mcBc) {
   return (type == CacheType.LEFT && mcBc.getRows() > mcBc.getRowsPerBlock())
       || (type == CacheType.RIGHT && mcBc.getCols() > mcBc.getColsPerBlock());
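  * @param mcIn matrix characteristics whose dimensions are compared with their block sizes
  * @param type cache type; LEFT checks the row dimension, any other value checks the column dimension
  * @return true if the dimensions are known and the checked dimension fits within a single block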
 // Returns true only when the dimensions of mcIn are known and the dimension selected by the
 // cache type fits into a single block (rows for LEFT, columns otherwise).
 private static boolean preservesPartitioning(MatrixCharacteristics mcIn, CacheType type) {
   // unknown dimensions always yield false in both branches
   if (type == CacheType.LEFT) return mcIn.dimsKnown() && mcIn.getRows() <= mcIn.getRowsPerBlock();
   else // RIGHT
   return mcIn.dimsKnown() && mcIn.getCols() <= mcIn.getColsPerBlock();
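   * @param pfid id appended to the job name
   * @param program program blocks handed to the job configuration for execution in the mapper
   * @param taskFile input path of the job; deleted when the method finishes
   * @param resultFile output path of the job; deleted when the method finishes
   * @param colocatedDPMatrixObj matrix object used for data colocation, may be null
   * @param enableCPCaching flag passed to the parfor caching configuration
   * @param numMappers number of map tasks
   * @param replication value set for "dfs.replication"
   * @param max_retry currently not applied (the job-level retry setting is disabled)
   * @param minMem minimum memory for the child JVMs; ignored if not positive
   * @param jvmReuse if true, JVMs are reused for an unlimited number of tasks
   * @return job return object with success flag, task and iteration counts, and result variables
   * @throws DMLRuntimeException if job configuration, execution, or file cleanup fails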
  // Configures and runs a map-only MapReduce job that executes parfor tasks, then collects the
  // task/iteration counters and the result variables into a RemoteParForJobReturn.
  // The task file and result file are deleted in the finally block regardless of the outcome.
  // NOTE(review): max_retry is only referenced by a commented-out line in the visible code.
  public static RemoteParForJobReturn runJob(
      long pfid,
      String program,
      String taskFile,
      String resultFile,
      MatrixObject colocatedDPMatrixObj, // inputs
      boolean enableCPCaching,
      int numMappers,
      int replication,
      int max_retry,
      long minMem,
      boolean jvmReuse) // opt params
      throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    // the start time is only captured when statistics are enabled
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    // maintain dml script counters
    // NOTE(review): no statement for this step is visible in this excerpt.

    try {
      // configure the MR job

      // set arbitrary CP program blocks that will perform in the mapper
      MRJobConfiguration.setProgramBlocks(job, program);

      // enable/disable caching
      MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

      // set mappers, reducers, combiners
      job.setMapperClass(RemoteParWorkerMapper.class); // map-only

      // set input format (one split per row, NLineInputFormat default N=1)
      // colocation is used only if globally allowed AND a colocated matrix object was passed
      if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
        MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
        MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
        MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
        MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
        MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
      } else // default case
      // NOTE(review): no statement for this else branch is visible in this excerpt.

      // set the input path and output path
      FileInputFormat.setInputPaths(job, new Path(taskFile));

      // set output format

      // set output path
      FileOutputFormat.setOutputPath(job, new Path(resultFile));

      // set the output key, value schema

      // set optimization parameters

      // set the number of mappers and reducers
      job.setNumMapTasks(numMappers); // numMappers
      // job.setInt("mapred.map.tasks.maximum", 1); //system property
      // job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
      // job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

      // use FLEX scheduler configuration properties
      if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
        job.setInt("flex.priority", 0); // highest

        job.setInt("flex.map.min", 0);
        job.setInt("flex.map.max", numMappers);
        job.setInt("flex.reduce.min", 0);
        job.setInt("flex.reduce.max", numMappers);

      // set jvm memory size (if require)
      // the setting is only overridden when minMem exceeds the currently configured maximum
      String memKey = "mapred.child.java.opts";
      if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
        InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
        LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");

      // disable automatic tasks timeouts and speculative task exec
      job.setInt("mapred.task.timeout", 0);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // enables the reuse of JVMs (multiple tasks per MR task)
      if (jvmReuse) job.setNumTasksToExecutePerJvm(-1); // unlimited

      // set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
      job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); // 8MB

      // set the replication factor for the results
      job.setInt("dfs.replication", replication);

      // set the max number of retries per map task
      //  disabled job-level configuration to respect cluster configuration
      //  note: this refers to hadoop2, hence it never had effect on mr1
      // job.setInt("mapreduce.map.maxattempts", max_retry);

      // set unique working dir

      // execute the MR job
      RunningJob runjob = JobClient.runJob(job);

      // Process different counters
      // NOTE(review): the counter values are narrowed to int by the casts below; values above
      // Integer.MAX_VALUE would not be represented correctly.
      Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
      int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
      int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
      // cache counters are only processed with statistics enabled and outside local mode
      // NOTE(review): the statements in this branch are incomplete in this excerpt (the
      // initializer of cgroup and the calls that consume the counter values are not visible).
      if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
        Group cgroup =
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));

      // read all files of result variables and prepare for return
      LocalVariableMap[] results = readResultFile(job, resultFile);

      ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
      // every failure during configuration or execution is wrapped, keeping the cause
      throw new DMLRuntimeException(ex);
    } finally {
      // remove created files
      // NOTE(review): an exception thrown from this finally block replaces any exception that
      // is already propagating from the try/catch above.
      try {
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
      } catch (IOException ex) {
        throw new DMLRuntimeException(ex);

    // record the elapsed time under the key "MR-Job_" + jobname
    if (DMLScript.STATISTICS) {
      long t1 = System.nanoTime();
      Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);

    return ret;
 /**
  * Stores the given matrix characteristics and derives the number of row blocks from them.
  *
  * @param mcIn matrix characteristics to store; its row count and rows-per-block are read
  */
 public void setMatrixCharacteristics(MatrixCharacteristics mcIn) {
   _mcIn = mcIn;
   // ceil(rows / rowsPerBlock), computed in double precision to avoid integer division;
   // presumably the index of the last row block under 1-based block indexing - confirm
   _lastRowBlockIndex = (long) Math.ceil((double) _mcIn.getRows() / _mcIn.getRowsPerBlock());
  /**
   * Sets the output matrix characteristics of this quaternary operation depending on which
   * weighted-operation type is set on the operator. Block sizes are always taken from
   * {@code mc1}.
   *
   * @param mc1 characteristics of the first input; supplies block sizes in every branch
   * @param mc2 characteristics of the second input; read only in the wdivmm branch
   * @param mc3 characteristics of the third input; read only in the wdivmm branch
   * @param dimOut output characteristics, updated in place
   */
  public void computeMatrixCharacteristics(
      MatrixCharacteristics mc1,
      MatrixCharacteristics mc2,
      MatrixCharacteristics mc3,
      MatrixCharacteristics dimOut) {
    QuaternaryOperator qop = (QuaternaryOperator) optr;

    if (qop.wtype1 != null || qop.wtype4 != null) { // wsloss/wcemm
      // output size independent of chain type (scalar)
      dimOut.set(1, 1, mc1.getRowsPerBlock(), mc1.getColsPerBlock());
    } else if (qop.wtype2 != null || qop.wtype5 != null) { // wsigmoid/wumm
      // output size determined by main input
      dimOut.set(mc1.getRows(), mc1.getCols(), mc1.getRowsPerBlock(), mc1.getColsPerBlock());
    } else if (qop.wtype3 != null) { // wdivmm
      // note: cannot directly consume mc2 or mc3 for redwdivmm because rep instruction changed
      // the relevant dimensions; as a workaround the original dims are passed via nnz
      boolean mapwdivmm = _cacheU && _cacheV;
      // With both factors cached the rank is read from the column count, otherwise from the
      // nnz field (see the workaround note above).
      // NOTE(review): the condition of the outer ternary is not visible in this excerpt.
      long rank =
              ? mapwdivmm ? mc3.getCols() : mc3.getNonZeros()
              : mapwdivmm ? mc2.getCols() : mc2.getNonZeros();
      // the wdivmm type computes the output dimensions from the main input and the rank
      MatrixCharacteristics mcTmp =
          qop.wtype3.computeOutputCharacteristics(mc1.getRows(), mc1.getCols(), rank);
      dimOut.set(mcTmp.getRows(), mcTmp.getCols(), mc1.getRowsPerBlock(), mc1.getColsPerBlock());