/**
   * Configures and runs the map-only remote parfor MR job, collects its counters and reads back
   * the result variables written by the workers.
   *
   * <p>The task file and the result file are removed from HDFS when this method exits, whether
   * the job succeeded or not. A cleanup failure is reported as a {@link DMLRuntimeException} only
   * if the job itself succeeded; otherwise it is logged so that it cannot replace the exception
   * that describes the actual job failure.
   *
   * @param pfid parfor id, appended to the job name
   * @param program program blocks (as a string) that are executed in the mappers
   * @param taskFile path of the task file used as job input; deleted on exit
   * @param resultFile output path of the job; deleted before the job starts and again on exit
   * @param colocatedDPMatrixObj partitioned matrix used to configure the colocated input format;
   *     only consulted if non-null and {@code ParForProgramBlock.ALLOW_DATA_COLOCATION} is set,
   *     otherwise {@code NLineInputFormat} is used
   * @param enableCPCaching caching flag forwarded to the parfor caching configuration of the job
   * @param numMappers number of map tasks requested (also used as the FLEX scheduler maximum)
   * @param replication value written to {@code dfs.replication} for the job
   * @param max_retry currently unused: the job-level setting of the max map attempts is disabled
   *     to respect the cluster configuration
   * @param minMem minimum max-heap size for the child JVMs (presumably in bytes, the log message
   *     converts it to MB); applied only if positive and larger than the value currently found in
   *     {@code mapred.child.java.opts}
   * @param jvmReuse if true, an unlimited number of tasks may be executed per JVM
   * @return job success flag, number of tasks and iterations (from the parfor counter group) and
   *     the result variables read from {@code resultFile}
   * @throws DMLRuntimeException if configuring, running or post-processing the job fails, or if
   *     the job succeeded but the created files could not be removed
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String program,
      String taskFile,
      String resultFile,
      MatrixObject colocatedDPMatrixObj, // inputs
      boolean enableCPCaching,
      int numMappers,
      int replication,
      int max_retry,
      long minMem,
      boolean jvmReuse) // opt params
      throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    // maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    // tracks whether the try block ran to completion, so that the cleanup in the finally block
    // knows whether another exception is already propagating
    boolean succeeded = false;

    try {
      /////
      // configure the MR job

      // set arbitrary CP program blocks that will perform in the mapper
      MRJobConfiguration.setProgramBlocks(job, program);

      // enable/disable caching
      MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

      // set mappers, reducers, combiners
      job.setMapperClass(RemoteParWorkerMapper.class); // map-only

      // set input format (one split per row, NLineInputFormat default N=1)
      if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
        job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
        MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
        MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
        MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
        MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
        MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
      } else // default case
      {
        job.setInputFormat(NLineInputFormat.class);
      }

      // set the input path and output path
      FileInputFormat.setInputPaths(job, new Path(taskFile));

      // set output format
      job.setOutputFormat(SequenceFileOutputFormat.class);

      // set output path
      MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
      FileOutputFormat.setOutputPath(job, new Path(resultFile));

      // set the output key, value schema
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(Text.class);
      job.setOutputKeyClass(LongWritable.class);
      job.setOutputValueClass(Text.class);

      //////
      // set optimization parameters

      // set the number of mappers and reducers
      job.setNumMapTasks(numMappers); // numMappers
      job.setNumReduceTasks(0);
      // job.setInt("mapred.map.tasks.maximum", 1); //system property
      // job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
      // job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

      // use FLEX scheduler configuration properties
      if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
        job.setInt("flex.priority", 0); // highest

        job.setInt("flex.map.min", 0);
        job.setInt("flex.map.max", numMappers);
        job.setInt("flex.reduce.min", 0);
        job.setInt("flex.reduce.max", numMappers);
      }

      // set jvm memory size (if require)
      String memKey = "mapred.child.java.opts";
      if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
        InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
        LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
      }

      // disable automatic tasks timeouts and speculative task exec
      job.setInt("mapred.task.timeout", 0);
      job.setMapSpeculativeExecution(false);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // enables the reuse of JVMs (multiple tasks per MR task)
      if (jvmReuse) job.setNumTasksToExecutePerJvm(-1); // unlimited

      // set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
      job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); // 8MB

      // set the replication factor for the results
      job.setInt("dfs.replication", replication);

      // set the max number of retries per map task
      //  disabled job-level configuration to respect cluster configuration
      //  note: this refers to hadoop2, hence it never had effect on mr1
      // job.setInt("mapreduce.map.maxattempts", max_retry);

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      /////
      // execute the MR job
      RunningJob runjob = JobClient.runJob(job);

      // Process different counters
      Statistics.incrementNoOfExecutedMRJobs();
      Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
      int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
      int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
      if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
        Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
        Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
        Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
        Group cgroup =
            runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
        CacheStatistics.incrementMemHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
        CacheStatistics.incrementFSBuffHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
        CacheStatistics.incrementFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
        CacheStatistics.incrementHDFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
        CacheStatistics.incrementFSBuffWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
        CacheStatistics.incrementFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
        CacheStatistics.incrementHDFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
        CacheStatistics.incrementAcquireRTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
        CacheStatistics.incrementAcquireMTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
        CacheStatistics.incrementReleaseTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
        CacheStatistics.incrementExportTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
      }

      // read all files of result variables and prepare for return
      LocalVariableMap[] results = readResultFile(job, resultFile);

      ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
      succeeded = true;
    } catch (Exception ex) {
      throw new DMLRuntimeException(ex);
    } finally {
      // remove created files; both deletes are attempted even if the first one fails
      IOException cleanupError = null;
      for (String file : new String[] {taskFile, resultFile}) {
        try {
          MapReduceTool.deleteFileIfExistOnHDFS(new Path(file), job);
        } catch (IOException ex) {
          LOG.warn("Failed to remove parfor job file '" + file + "'.", ex);
          if (cleanupError == null) cleanupError = ex;
        }
      }
      // throwing here while another exception is propagating would discard that exception,
      // hence a cleanup failure is only raised if the job itself went through
      if (succeeded && cleanupError != null) {
        throw new DMLRuntimeException(cleanupError);
      }
    }

    if (DMLScript.STATISTICS) {
      long t1 = System.nanoTime();
      Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
  }
  /**
   * Starts a data generation MapReduce job that produces one matrix per given data generation
   * instruction (rand or seq).
   *
   * <p>For every instruction a small text file is written to the file system and used as job
   * input, with one line per matrix block:
   *
   * <ul>
   *   <li>rand: {@code rowBlockIndex,colBlockIndex,blockRows,blockCols,nnz,seed}
   *   <li>seq: {@code rowBlockIndex,1,blockFrom,blockTo,increment}
   * </ul>
   *
   * These input files are deleted again when the method exits, including the case that the job
   * setup or the job itself fails.
   *
   * @param inst MR job instruction; only used to print the complete instruction at trace level
   * @param dataGenInstructions one rand or seq instruction string per matrix to generate
   * @param instructionsInMapper instructions to execute in the mapper
   * @param aggInstructionsInReducer aggregate instructions for the combiner and reducer
   * @param otherInstructionsInReducer instructions to execute in the reducer after aggregation
   * @param numReducers requested number of reducers; with 0 the generic {@code Writable} map
   *     output key/value classes are configured
   * @param replication file replication factor for the results
   * @param resultIndexes indexes of the result matrices
   * @param dimsUnknownFilePrefix prefix of the directory holding the dims file of the job
   * @param outputs output file for each result
   * @param outputInfos output information for each result
   * @return matrix characteristics of the results, the output infos and the job success flag
   * @throws Exception if an instruction is invalid or an error occurred while writing the input
   *     files or in the MapReduce phase
   */
  public static JobReturn runJob(
      MRJobInstruction inst,
      String[] dataGenInstructions,
      String instructionsInMapper,
      String aggInstructionsInReducer,
      String otherInstructionsInReducer,
      int numReducers,
      int replication,
      byte[] resultIndexes,
      String dimsUnknownFilePrefix,
      String[] outputs,
      OutputInfo[] outputInfos)
      throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    // whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    long numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    RunningJob runjob;
    MatrixCharacteristics[] stats;

    // tracks whether the try block ran to completion, so that the cleanup in the finally block
    // knows whether another exception is already propagating
    boolean succeeded = false;

    // the input files are created inside the try block so that files already written are
    // removed as well if a later instruction turns out to be invalid
    try {
      StringBuilder dataGenInsBuilder = new StringBuilder();

      for (int i = 0; i < dataGenInstructions.length; i++) {
        if (i > 0) dataGenInsBuilder.append(Lop.INSTRUCTION_DELIMITOR);
        dataGenInsBuilder.append(dataGenInstructions[i]);

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
          RandInstruction randInst = (RandInstruction) mrins;
          inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
          maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

          FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
          PrintWriter pw = new PrintWriter(fsOut);
          try {
            // for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            // seed generation
            Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
            long[] nnz =
                LibMatrixDatagen.computeNNZperBlock(
                    rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
            int nnzIx = 0;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
              long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
              for (long c = 0; c < clens[i]; c += bclens[i]) {
                long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                sb.append((r / brlens[i]) + 1);
                sb.append(',');
                sb.append((c / bclens[i]) + 1);
                sb.append(',');
                sb.append(curBlockRowSize);
                sb.append(',');
                sb.append(curBlockColSize);
                sb.append(',');
                sb.append(nnz[nnzIx++]);
                sb.append(',');
                sb.append(bigrand.nextLong());
                pw.println(sb.toString());
                sb.setLength(0);
                numblocks++;
              }
            }
          } finally {
            // closes the underlying stream as well, also if writing failed half-way
            pw.close();
          }
          fsOut.close();
          inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
          SeqInstruction seqInst = (SeqInstruction) mrins;
          // the instruction position is part of the name: the timestamp alone is not unique
          // if two seq instructions of the same job are processed within one millisecond
          inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + "_" + i + ".seqinput";
          maxsparsity = 1.0; // always dense

          double from = seqInst.fromValue;
          double to = seqInst.toValue;
          double incr = seqInst.incrValue;

          // handle default 1 to -1 for special case of from>to
          incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

          // Correctness checks on (from, to, incr)
          boolean neg = (from > to);
          if (incr == 0) throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

          if (neg != (incr < 0))
            throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

          // Compute the number of rows in the sequence
          long numrows = 1 + (long) Math.floor((to - from) / incr);
          if (rlens[i] > 0) {
            if (numrows != rlens[i])
              throw new DMLRuntimeException(
                  "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                      + rlens[i]
                      + " != "
                      + numrows);
          } else {
            rlens[i] = numrows;
          }

          if (clens[i] > 0 && clens[i] != 1)
            throw new DMLRuntimeException(
                "Unexpected error while processing sequence instruction. Number of columns ("
                    + clens[i]
                    + ") must be equal to 1.");
          else clens[i] = 1;

          FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
          PrintWriter pw = new PrintWriter(fsOut);
          try {
            StringBuilder sb = new StringBuilder();

            double temp = from;
            double block_from, block_to;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
              long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

              // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to]
              // (inclusive of both end points of the interval)
              long bid_i = ((r / brlens[i]) + 1);
              long bid_j = 1;
              block_from = temp;
              block_to = temp + (curBlockRowSize - 1) * incr;
              temp = block_to + incr; // next block starts from here

              // note: the block size is not part of the line for seq()
              sb.append(bid_i);
              sb.append(',');
              sb.append(bid_j);
              sb.append(',');
              sb.append(block_from);
              sb.append(',');
              sb.append(block_to);
              sb.append(',');
              sb.append(incr);

              pw.println(sb.toString());
              sb.setLength(0);
              numblocks++;
            }
          } finally {
            // closes the underlying stream as well, also if writing failed half-way
            pw.close();
          }
          fsOut.close();
          inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
          throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
      }
      String dataGenInsStr = dataGenInsBuilder.toString();

      // set up the block size
      MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

      // set up the input files and their format information
      MRJobConfiguration.setUpMultipleInputs(
          job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);

      // set up the dimensions of input matrices
      MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
      MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

      // set up the block size
      // NOTE(review): repeats the call above; kept because it is not visible here whether
      // setUpMultipleInputs touches the block size configuration
      MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

      // set up the rand Instructions
      MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

      // set up unary instructions that will perform in the mapper
      MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

      // set up the aggregate instructions that will happen in the combiner and reducer
      MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

      // set up the instructions that will happen in the reducer, after the aggregation instructions
      MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

      // set up the replication factor for the results
      job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getDMLConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // set up custom map/reduce configurations
      MRJobConfiguration.setupCustomMRConfigurations(job, config);

      // determine degree of parallelism (nmappers: 1<=n<=capacity)
      // TODO use maxsparsity whenever we have a way of generating sparse rand data
      int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
      long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
      // correction max number of mappers on yarn clusters
      if (InfrastructureAnalyzer.isYarnEnabled())
        capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
      // dense size estimate (8 bytes per cell) over the dfs block size; computed and clamped in
      // long arithmetic so that large block sizes or block counts cannot wrap around
      long mappersBySize = 8L * maxbrlen * maxbclen * numblocks / dfsblocksize;
      int nmapers = (int) Math.max(Math.min(mappersBySize, (long) capacity), 1L);
      job.setNumMapTasks(nmapers);

      // set up what matrices are needed to pass from the mapper to reducer
      HashSet<Byte> mapoutputIndexes =
          MRJobConfiguration.setUpOutputIndexesForMapper(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              otherInstructionsInReducer,
              resultIndexes);

      MatrixChar_N_ReducerGroups ret =
          MRJobConfiguration.computeMatrixCharacteristics(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              null,
              otherInstructionsInReducer,
              resultIndexes,
              mapoutputIndexes,
              false);
      stats = ret.stats;

      // set up the number of reducers
      MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

      // print the complete MRJob instruction
      if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats);

      // Update resultDimsUnknown based on computed "stats"
      byte[] resultDimsUnknown = new byte[resultIndexes.length];
      for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
          resultDimsUnknown[i] = (byte) 1;
        } else {
          resultDimsUnknown[i] = (byte) 0;
        }
      }

      // mapper instructions are optional, hence guard against null like for the reducer ones
      boolean mayContainCtable =
          instructionsInMapper != null
              && (instructionsInMapper.contains("ctabletransform")
                  || instructionsInMapper.contains("groupedagg"));

      // set up the multiple output files, and their format information
      MRJobConfiguration.setUpMultipleOutputs(
          job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);

      // configure mapper and the mapper output key value pairs
      job.setMapperClass(DataGenMapper.class);
      if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
      } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
      }

      // set up combiner
      if (numReducers != 0
          && aggInstructionsInReducer != null
          && !aggInstructionsInReducer.isEmpty()) job.setCombinerClass(GMRCombiner.class);

      // configure reducer
      job.setReducerClass(GMRReducer.class);

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      runjob = JobClient.runJob(job);

      /* Process different counters */

      Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
      for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
      }

      String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
      stats = MapReduceTool.processDimsFiles(dir, stats);
      MapReduceTool.deleteFileIfExistOnHDFS(dir);

      succeeded = true;
    } finally {
      // remove the generated input files; every file is attempted even if one delete fails
      Exception cleanupError = null;
      for (String input : inputs) {
        if (input == null) continue; // setup failed before this input was created
        try {
          MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
        } catch (Exception ex) {
          LOG.warn("Failed to remove data generation input file '" + input + "'.", ex);
          if (cleanupError == null) cleanupError = ex;
        }
      }
      // throwing here while another exception is propagating would discard that exception,
      // hence a cleanup failure is only raised if the job itself went through
      if (succeeded && cleanupError != null) throw cleanupError;
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
  }