@Override
  public final void writeMatrixToHDFS(
      MatrixBlock src, String fname, long rlen, long clen, int brlen, int bclen, long nnz)
      throws IOException, DMLRuntimeException {
    // validity check matrix dimensions
    if (src.getNumRows() != rlen || src.getNumColumns() != clen) {
      throw new IOException(
          "Matrix dimensions mismatch with metadata: "
              + src.getNumRows()
              + "x"
              + src.getNumColumns()
              + " vs "
              + rlen
              + "x"
              + clen
              + ".");
    }

    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    // if the file already exists on HDFS, remove it.
    MapReduceTool.deleteFileIfExistOnHDFS(fname);

    // core write (sequential/parallel)
    writeCSVMatrixToHDFS(path, job, fs, src, _props);

    IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
  }
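  // For context, the same overwrite pattern with only the plain Hadoop API
  // (helper name and payload are illustrative, not SystemML API): delete any
  // stale output, then create and write the new file.
  static void overwriteOnHdfs(FileSystem fs, Path out, byte[] payload) throws IOException {
    fs.delete(out, true); // remove stale output, if any (recursive for directories)
    try (OutputStream os = fs.create(out, true)) {
      os.write(payload); // e.g., CSV bytes of the matrix
    }
  }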
  @Override
  public MatrixBlock readMatrixFromHDFS(
      String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
      throws IOException, DMLRuntimeException {
    // allocate output matrix block
    MatrixBlock ret =
        createOutputMatrixBlock(rlen, clen, (int) rlen, (int) clen, estnnz, true, false);

    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    // check existence and non-empty file
    checkValidInputFile(fs, path);

    // core read
    readBinaryCellMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen);

    // finally check if change of sparse/dense block representation required
    // (nnz maintained via append during read for both dense/sparse)
    ret.examSparsity();

    return ret;
  }
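  // A plausible reconstruction of the checkValidInputFile helper used above,
  // based only on the Hadoop FileSystem API; SystemML's actual implementation
  // may differ in details.
  private static void checkValidInputFile(FileSystem fs, Path path) throws IOException {
    // the input must exist
    if (!fs.exists(path))
      throw new IOException("File " + path + " does not exist.");
    // and must be a non-empty file or a non-empty directory of part files
    FileStatus stat = fs.getFileStatus(path);
    if (stat.isDirectory() ? fs.listStatus(path).length == 0 : stat.getLen() == 0)
      throw new IOException("File " + path + " is empty.");
  }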
  @Override
  protected ExecType optFindExecType() throws HopsException {
    checkAndSetForcedPlatform();

    ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;

    if (_etypeForced != null) {
      _etype = _etypeForced;
    } else {
      if (OptimizerUtils.isMemoryBasedOptLevel()) {
        _etype = findExecTypeByMemEstimate();
      }
      // Choose CP, if the input dimensions are below threshold or if the input is a vector
      // Also, matrix inverse is currently implemented only in CP (through commons math)
      else if (getInput().get(0).areDimsBelowThreshold()
          || getInput().get(0).isVector()
          || isInMemoryOperation()) {
        _etype = ExecType.CP;
      } else {
        _etype = REMOTE;
      }

      // check for valid CP dimensions and matrix size
      checkAndSetInvalidCPDimsAndSize();
    }

    // spark-specific decision refinement (execute unary w/ spark input and
    // single parent also in spark because it's likely cheap and reduces intermediates)
    if (_etype == ExecType.CP
        && _etypeForced != ExecType.CP
        && getInput().get(0).optFindExecType() == ExecType.SPARK
        && getDataType().isMatrix()
        && !isCumulativeUnaryOperation()
        && !isCastUnaryOperation()
        && _op != OpOp1.MEDIAN
        && _op != OpOp1.IQM
        && !(getInput().get(0) instanceof DataOp) // input is not checkpoint
        && getInput().get(0).getParent().size() == 1) // unary is only parent
    {
      // pull unary operation into spark
      _etype = ExecType.SPARK;
    }

    // mark for recompile (forever)
    if (ConfigurationManager.isDynamicRecompilation() && !dimsKnown(true) && _etype == REMOTE)
      setRequiresRecompile();

    // ensure cp exec type for single-node operations
    if (_op == OpOp1.PRINT
        || _op == OpOp1.STOP
        || _op == OpOp1.INVERSE
        || _op == OpOp1.EIGEN
        || _op == OpOp1.CHOLESKY) {
      _etype = ExecType.CP;
    }

    return _etype;
  }
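  // For illustration, the decision order above reduced to a self-contained
  // sketch (forced type -> memory estimate -> dimension threshold -> remote);
  // the enum and helper below are hypothetical, not SystemML API.
  enum Exec { CP, SPARK, MR }

  static Exec chooseExecType(Exec forced, boolean memBasedOpt, boolean fitsInCP,
      boolean smallDimsOrVector, boolean sparkMode) {
    Exec remote = sparkMode ? Exec.SPARK : Exec.MR;
    if (forced != null) return forced; // forced platform wins
    if (memBasedOpt) return fitsInCP ? Exec.CP : remote; // memory-based decision
    return smallDimsOrVector ? Exec.CP : remote; // threshold-based fallback
  }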
  @Override
  public final void writeEmptyMatrixToHDFS(String fname, long rlen, long clen, int brlen, int bclen)
      throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    MatrixBlock src = new MatrixBlock((int) rlen, 1, true);
    writeCSVMatrixToHDFS(path, job, fs, src, _props);

    IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
  }
  /**
   * @param schema value types of the input frame columns
   * @param containsID whether the generated data frame contains an ID column
   * @param dense whether to generate dense (vs sparse) input data
   * @param unknownDims whether to pass unknown matrix characteristics to the converter
   */
  private void testDataFrameConversion(
      ValueType[] schema, boolean containsID, boolean dense, boolean unknownDims) {
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    RUNTIME_PLATFORM oldPlatform = DMLScript.rtplatform;

    SparkExecutionContext sec = null;

    try {
      DMLScript.USE_LOCAL_SPARK_CONFIG = true;
      DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

      // generate input data and setup metadata
      int cols = schema.length + colsVector - 1;
      double sparsity = dense ? sparsity1 : sparsity2;
      double[][] A = TestUtils.round(getRandomMatrix(rows1, cols, -10, 1000, sparsity, 2373));
      MatrixBlock mbA = DataConverter.convertToMatrixBlock(A);
      int blksz = ConfigurationManager.getBlocksize();
      MatrixCharacteristics mc1 =
          new MatrixCharacteristics(rows1, cols, blksz, blksz, mbA.getNonZeros());
      MatrixCharacteristics mc2 =
          unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mc1);

      // setup spark context
      sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
      JavaSparkContext sc = sec.getSparkContext();
      SQLContext sqlctx = new SQLContext(sc);

      // create input data frame
      DataFrame df = createDataFrame(sqlctx, mbA, containsID, schema);

      // dataframe - frame conversion
      JavaPairRDD<Long, FrameBlock> out =
          FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc2, containsID);

      // get output frame block
      FrameBlock fbB =
          SparkExecutionContext.toFrameBlock(
              out, UtilFunctions.nCopies(cols, ValueType.DOUBLE), rows1, cols);

      // compare frame blocks
      MatrixBlock mbB = DataConverter.convertToMatrixBlock(fbB);
      double[][] B = DataConverter.convertToDoubleMatrix(mbB);
      TestUtils.compareMatrices(A, B, rows1, cols, eps);
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    } finally {
      if (sec != null) sec.close();
      DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
      DMLScript.rtplatform = oldPlatform;
    }
  }
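  // The save/mutate/restore pattern used above, in isolation: a test flips a
  // global flag, then restores it in finally so later tests see the old state.
  // FLAG is a stand-in for globals like DMLScript.USE_LOCAL_SPARK_CONFIG.
  static boolean FLAG = false;

  static void withLocalSparkConfig(Runnable body) {
    boolean old = FLAG;
    try {
      FLAG = true;
      body.run();
    } finally {
      FLAG = old; // always restore, even when body throws
    }
  }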
  private static void writeCleanupTasksToFile(Path path, int numTasks)
      throws DMLRuntimeException, IOException {
    BufferedWriter br = null;
    try {
      FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
      br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

      for (int i = 1; i <= numTasks; i++) br.write("CLEANUP TASK " + i + "\n");
    } catch (Exception ex) {
      throw new DMLRuntimeException(
          "Error writing cleanup tasks to taskfile " + path.toString(), ex);
    } finally {
      if (br != null) br.close();
    }
  }
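  // Equivalent writer using try-with-resources (Java 7+), which closes the
  // stream even on exceptions without an explicit finally block; a sketch,
  // not a drop-in replacement for the method above.
  private static void writeTasks(FileSystem fs, Path path, int numTasks) throws IOException {
    try (BufferedWriter br =
        new BufferedWriter(new OutputStreamWriter(fs.create(path, true)))) {
      for (int i = 1; i <= numTasks; i++) br.write("CLEANUP TASK " + i + "\n");
    }
  }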
  @Override
  protected ExecType optFindExecType() throws HopsException {

    checkAndSetForcedPlatform();

    ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;

    // forced / memory-based / threshold-based decision
    if (_etypeForced != null) {
      _etype = _etypeForced;
    } else {
      if (OptimizerUtils.isMemoryBasedOptLevel()) {
        _etype = findExecTypeByMemEstimate();
      }
      // Choose CP, if the input dimensions are below threshold or if the input is a vector
      else if (getInput().get(0).areDimsBelowThreshold() || getInput().get(0).isVector()) {
        _etype = ExecType.CP;
      } else {
        _etype = REMOTE;
      }

      // check for valid CP dimensions and matrix size
      checkAndSetInvalidCPDimsAndSize();
    }

    // spark-specific decision refinement (execute unary aggregate w/ spark input and
    // single parent also in spark because it's likely cheap and reduces data transfer)
    if (_etype == ExecType.CP
        && _etypeForced != ExecType.CP
        && !(getInput().get(0) instanceof DataOp) // input is not checkpoint
        && getInput().get(0).getParent().size() == 1 // uagg is only parent
        && getInput().get(0).optFindExecType() == ExecType.SPARK) {
      // pull unary aggregate into spark
      _etype = ExecType.SPARK;
    }

    // mark for recompile (forever)
    if (ConfigurationManager.isDynamicRecompilation() && !dimsKnown(true) && _etype == REMOTE) {
      setRequiresRecompile();
    }

    return _etype;
  }
  /**
   * Adds a header line to a CSV file (or directory of CSV part files) on HDFS.
   *
   * @param srcFileName source file or directory on HDFS
   * @param destFileName destination file on HDFS
   * @param rlen number of rows
   * @param clen number of columns
   * @throws IOException if an HDFS access error occurs
   */
  public final void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
      // simply move srcFile to destFile

      /*
       * TODO: Remove this roundabout way!
       * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
       *              & the only path that exists already on HDFS is /user/biadmin/csv/.
       * In this case: the directory structure /user/biadmin/csv/temp/out must be created.
       * Simple hdfs.rename() does not seem to create this directory structure.
       */

      // delete the destination file, if it exists already
      hdfs.delete(destFilePath, true);

      // create destFilePath so that the parent directory structure is created
      hdfs.createNewFile(destFilePath);

      // delete the file itself, but preserve the directory structure
      hdfs.delete(destFilePath, true);

      // finally, move the data to destFilePath
      hdfs.rename(srcFilePath, destFilePath);
      return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append("C" + (i + 1));
      if (i < clen - 1) sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

      // compute sorted order among part files
      ArrayList<Path> files = new ArrayList<Path>();
      for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
        files.add(stat.getPath());
      Collections.sort(files);

      // first part file path
      Path firstpart = files.get(0);

      // create a temp file, add the header, and copy the contents of the first part
      Path tmp = new Path(firstpart.toString() + ".tmp");
      OutputStream out = hdfs.create(tmp, true);
      InputStream in = null;
      try {
        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        in = hdfs.open(firstpart);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }

      // rename tmp to firstpart
      hdfs.delete(firstpart, true);
      hdfs.rename(tmp, firstpart);

      // rename srcfile to destFile
      hdfs.delete(destFilePath, true);
      hdfs.createNewFile(destFilePath); // force the creation of directory structure
      hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
      hdfs.rename(srcFilePath, destFilePath); // move the data

    } else if (hdfs.isFile(srcFilePath)) {
      // create destination file
      OutputStream out = hdfs.create(destFilePath, true);
      InputStream in = null;
      try {
        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
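  // Header construction in isolation: for clen=3 and delimiter ",", this
  // yields "C1,C2,C3\n", exactly as the StringBuilder loops above do.
  static String buildHeader(long clen, String delim) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append('C').append(i + 1);
      if (i < clen - 1) sb.append(delim);
    }
    return sb.append('\n').toString();
  }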
  /**
   * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS. The part files
   * are created by a CSV_WRITE MR job.
   *
   * <p>This method is invoked from the CP-write instruction.
   *
   * @param srcFileName source directory of part files (or a single file) on HDFS
   * @param destFileName destination file on HDFS
   * @param csvprop CSV format properties (header, delimiter)
   * @param rlen number of rows
   * @param clen number of columns
   * @throws IOException if an HDFS access error occurs
   */
  public final void mergeCSVPartFiles(
      String srcFileName,
      String destFileName,
      CSVFileFormatProperties csvprop,
      long rlen,
      long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
      hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1) sb.append(csvprop.getDelim());
      }
      sb.append('\n');
      out.write(sb.toString().getBytes());
      sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
      try {
        FileStatus[] contents = hdfs.listStatus(srcFilePath);
        Path[] partPaths = new Path[contents.length];
        int numPartFiles = 0;
        for (int i = 0; i < contents.length; i++) {
          if (!contents[i].isDirectory()) {
            partPaths[numPartFiles++] = contents[i].getPath();
          }
        }
        // sort only the filled prefix; subdirectories leave trailing nulls,
        // which a full Arrays.sort(partPaths) would trip over
        Arrays.sort(partPaths, 0, numPartFiles);

        for (int i = 0; i < numPartFiles; i++) {
          InputStream in = hdfs.open(partPaths[i]);
          try {
            IOUtils.copyBytes(in, out, conf, false);
            if (i < numPartFiles - 1) out.write('\n');
          } finally {
            IOUtilFunctions.closeSilently(in);
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(out);
      }
    } else if (hdfs.isFile(srcFilePath)) {
      InputStream in = null;
      try {
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
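  // The core merge step reduced to a sketch with the same Hadoop calls:
  // append each (pre-sorted) part file to one destination stream; the
  // 'false' flag keeps the output stream open across parts.
  static void concatParts(FileSystem fs, Configuration conf, Path[] sortedParts, Path dest)
      throws IOException {
    try (OutputStream out = fs.create(dest, true)) {
      for (Path part : sortedParts) {
        try (InputStream in = fs.open(part)) {
          IOUtils.copyBytes(in, out, conf, false); // copy part, keep out open
        }
      }
    }
  }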
  /**
   * @param pfid parfor program block id
   * @param program serialized program blocks executed by each map task
   * @param taskFile HDFS file with the serialized parfor tasks
   * @param resultFile HDFS file for the serialized result variables
   * @param colocatedDPMatrixObj co-located data-partitioned matrix, or null
   * @param enableCPCaching whether to enable CP caching in the workers
   * @param numMappers number of map tasks
   * @param replication replication factor for the results
   * @param max_retry max number of retries per map task (currently unused, see below)
   * @param minMem minimum JVM heap size for map tasks, in bytes
   * @param jvmReuse whether to reuse JVMs across map tasks
   * @return remote parfor job return (success flag, task/iteration counts, result variables)
   * @throws DMLRuntimeException if the MR job fails
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String program,
      String taskFile,
      String resultFile,
      MatrixObject colocatedDPMatrixObj, // inputs
      boolean enableCPCaching,
      int numMappers,
      int replication,
      int max_retry,
      long minMem,
      boolean jvmReuse) // opt params
      throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    // maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
      /////
      // configure the MR job

      // set arbitrary CP program blocks that will be executed in the mapper
      MRJobConfiguration.setProgramBlocks(job, program);

      // enable/disable caching
      MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

      // set mappers, reducers, combiners
      job.setMapperClass(RemoteParWorkerMapper.class); // map-only

      // set input format (one split per row, NLineInputFormat default N=1)
      if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
        job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
        MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
        MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
        MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
        MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
        MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
      } else // default case
      {
        job.setInputFormat(NLineInputFormat.class);
      }

      // set the input path and output path
      FileInputFormat.setInputPaths(job, new Path(taskFile));

      // set output format
      job.setOutputFormat(SequenceFileOutputFormat.class);

      // set output path
      MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
      FileOutputFormat.setOutputPath(job, new Path(resultFile));

      // set the output key, value schema
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(Text.class);
      job.setOutputKeyClass(LongWritable.class);
      job.setOutputValueClass(Text.class);

      //////
      // set optimization parameters

      // set the number of mappers and reducers
      job.setNumMapTasks(numMappers); // numMappers
      job.setNumReduceTasks(0);
      // job.setInt("mapred.map.tasks.maximum", 1); //system property
      // job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
      // job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

      // use FLEX scheduler configuration properties
      if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
        job.setInt("flex.priority", 0); // highest

        job.setInt("flex.map.min", 0);
        job.setInt("flex.map.max", numMappers);
        job.setInt("flex.reduce.min", 0);
        job.setInt("flex.reduce.max", numMappers);
      }

      // set jvm memory size (if required)
      String memKey = "mapred.child.java.opts";
      if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
        InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
        LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
      }

      // disable automatic task timeouts and speculative task execution
      job.setInt("mapred.task.timeout", 0);
      job.setMapSpeculativeExecution(false);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // enable the reuse of JVMs (multiple tasks per JVM)
      if (jvmReuse) job.setNumTasksToExecutePerJvm(-1); // unlimited

      // set a small sort io buffer (avoids unnecessarily large io buffers; bounds guaranteed memory consumption)
      job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); // 8MB

      // set the replication factor for the results
      job.setInt("dfs.replication", replication);

      // set the max number of retries per map task
      //  disabled job-level configuration to respect cluster configuration
      //  note: this refers to hadoop2, hence it never had effect on mr1
      // job.setInt("mapreduce.map.maxattempts", max_retry);

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      /////
      // execute the MR job
      RunningJob runjob = JobClient.runJob(job);

      // Process different counters
      Statistics.incrementNoOfExecutedMRJobs();
      Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
      int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
      int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
      if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
        Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
        Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
        Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
        Group cgroup =
            runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
        CacheStatistics.incrementMemHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
        CacheStatistics.incrementFSBuffHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
        CacheStatistics.incrementFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
        CacheStatistics.incrementHDFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
        CacheStatistics.incrementFSBuffWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
        CacheStatistics.incrementFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
        CacheStatistics.incrementHDFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
        CacheStatistics.incrementAcquireRTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
        CacheStatistics.incrementAcquireMTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
        CacheStatistics.incrementReleaseTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
        CacheStatistics.incrementExportTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
      }

      // read all files of result variables and prepare for return
      LocalVariableMap[] results = readResultFile(job, resultFile);

      ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
      throw new DMLRuntimeException(ex);
    } finally {
      // remove created files
      try {
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
      } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
      }
    }

    if (DMLScript.STATISTICS) {
      long t1 = System.nanoTime();
      Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
  }
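  // Counter readout, reduced to its core: after JobClient.runJob returns,
  // counters are fetched per group and name. The group/counter names here are
  // illustrative; the method above uses the parfor and caching groups.
  static long readCounter(RunningJob runjob, String group, String name) throws IOException {
    return runjob.getCounters().getGroup(group).getCounter(name); // 0 if never incremented
  }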
  /**
   * Starts a DataGen MapReduce job which will produce one or more random objects.
   *
   * @param inst MR job instruction
   * @param dataGenInstructions data generation (rand/seq) instruction per output
   * @param instructionsInMapper instructions to execute in the mapper
   * @param aggInstructionsInReducer aggregate instructions for the combiner and reducer
   * @param otherInstructionsInReducer instructions to execute in the reducer after aggregation
   * @param numReducers number of reducers
   * @param replication file replication factor
   * @param resultIndexes result indexes
   * @param dimsUnknownFilePrefix file prefix for outputs with unknown dimensions
   * @param outputs output file for each result
   * @param outputInfos output format information for each result
   * @return matrix characteristics and success flag for the generated objects
   * @throws Exception if an error occurred in the MapReduce phase
   */
  public static JobReturn runJob(
      MRJobInstruction inst,
      String[] dataGenInstructions,
      String instructionsInMapper,
      String aggInstructionsInReducer,
      String otherInstructionsInReducer,
      int numReducers,
      int replication,
      byte[] resultIndexes,
      String dimsUnknownFilePrefix,
      String[] outputs,
      OutputInfo[] outputInfos)
      throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    // whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
      dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

      MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
      MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
      DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

      rlens[i] = genInst.getRows();
      clens[i] = genInst.getCols();
      brlens[i] = genInst.getRowsInBlock();
      bclens[i] = genInst.getColsInBlock();

      maxbrlen = Math.max(maxbrlen, brlens[i]);
      maxbclen = Math.max(maxbclen, bclens[i]);

      if (mrtype == MRINSTRUCTION_TYPE.Rand) {
        RandInstruction randInst = (RandInstruction) mrins;
        inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
        maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

        FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
        PrintWriter pw = new PrintWriter(fsOut);

        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();

        // seed generation
        Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
        long[] nnz =
            LibMatrixDatagen.computeNNZperBlock(
                rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
        int nnzIx = 0;
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
          for (long c = 0; c < clens[i]; c += bclens[i]) {
            long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

            sb.append((r / brlens[i]) + 1);
            sb.append(',');
            sb.append((c / bclens[i]) + 1);
            sb.append(',');
            sb.append(curBlockRowSize);
            sb.append(',');
            sb.append(curBlockColSize);
            sb.append(',');
            sb.append(nnz[nnzIx++]);
            sb.append(',');
            sb.append(bigrand.nextLong());
            pw.println(sb.toString());
            sb.setLength(0);
            numblocks++;
          }
        }
        pw.close();
        fsOut.close();
        inputInfos[i] = InputInfo.TextCellInputInfo;
      } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
        SeqInstruction seqInst = (SeqInstruction) mrins;
        inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
        maxsparsity = 1.0; // always dense

        double from = seqInst.fromValue;
        double to = seqInst.toValue;
        double incr = seqInst.incrValue;

        // handle the default increment of 1, flipped to -1 for the special case from > to
        incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

        // Correctness checks on (from, to, incr)
        boolean neg = (from > to);
        if (incr == 0) throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

        if (neg != (incr < 0))
          throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

        // Compute the number of rows in the sequence
        long numrows = 1 + (long) Math.floor((to - from) / incr);
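        // e.g., seq(1, 10, 2): numrows = 1 + floor((10-1)/2) = 5, i.e. {1,3,5,7,9}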
        if (rlens[i] > 0) {
          if (numrows != rlens[i])
            throw new DMLRuntimeException(
                "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                    + rlens[i]
                    + " != "
                    + numrows);
        } else {
          rlens[i] = numrows;
        }

        if (clens[i] > 0 && clens[i] != 1)
          throw new DMLRuntimeException(
              "Unexpected error while processing sequence instruction. Number of columns ("
                  + clens[i]
                  + ") must be equal to 1.");
        else clens[i] = 1;

        FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
        PrintWriter pw = new PrintWriter(fsOut);
        StringBuilder sb = new StringBuilder();

        double temp = from;
        double block_from, block_to;
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

          // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to]
          // (inclusive of both end points of the interval)
          long bid_i = ((r / brlens[i]) + 1);
          long bid_j = 1;
          block_from = temp;
          block_to = temp + (curBlockRowSize - 1) * incr;
          temp = block_to + incr; // next block starts from here

          sb.append(bid_i);
          sb.append(',');
          sb.append(bid_j);
          sb.append(',');
          /*
          // Need not include block size while generating seq()
          sb.append(curBlockRowSize);
          sb.append(',');
          sb.append(1);
          sb.append(',');*/
          sb.append(block_from);
          sb.append(',');
          sb.append(block_to);
          sb.append(',');
          sb.append(incr);

          pw.println(sb.toString());
          sb.setLength(0);
          numblocks++;
        }

        pw.close();
        fsOut.close();
        inputInfos[i] = InputInfo.TextCellInputInfo;
      } else {
        throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
      }
    }
    dataGenInsStr = dataGenInsStr.substring(1); // remove the leading instruction delimiter
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
      // set up the block size
      MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

      // set up the input files and their format information
      MRJobConfiguration.setUpMultipleInputs(
          job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);

      // set up the dimensions of input matrices
      MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
      MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

      // set up the rand Instructions
      MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

      // set up unary instructions that will be executed in the mapper
      MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

      // set up the aggregate instructions that will happen in the combiner and reducer
      MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

      // set up the instructions that will happen in the reducer, after the aggregation instructions
      MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

      // set up the replication factor for the results
      job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getDMLConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // set up custom map/reduce configurations
      MRJobConfiguration.setupCustomMRConfigurations(job, config);

      // determine degree of parallelism (nmappers: 1<=n<=capacity)
      // TODO use maxsparsity whenever we have a way of generating sparse rand data
      int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
      long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
      // correct the max number of mappers on yarn clusters
      if (InfrastructureAnalyzer.isYarnEnabled())
        capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
      int nmappers =
          Math.max(
              Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity),
              1);
      job.setNumMapTasks(nmappers);

      // set up which matrices need to be passed from the mapper to the reducer
      HashSet<Byte> mapoutputIndexes =
          MRJobConfiguration.setUpOutputIndexesForMapper(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              otherInstructionsInReducer,
              resultIndexes);

      MatrixChar_N_ReducerGroups ret =
          MRJobConfiguration.computeMatrixCharacteristics(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              null,
              otherInstructionsInReducer,
              resultIndexes,
              mapoutputIndexes,
              false);
      stats = ret.stats;

      // set up the number of reducers
      MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

      // print the complete MRJob instruction
      if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats);

      // Update resultDimsUnknown based on computed "stats"
      byte[] resultDimsUnknown = new byte[resultIndexes.length];
      for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
          resultDimsUnknown[i] = (byte) 1;
        } else {
          resultDimsUnknown[i] = (byte) 0;
        }
      }

      boolean mayContainCtable =
          instructionsInMapper.contains("ctabletransform")
              || instructionsInMapper.contains("groupedagg");

      // set up the multiple output files, and their format information
      MRJobConfiguration.setUpMultipleOutputs(
          job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);

      // configure mapper and the mapper output key value pairs
      job.setMapperClass(DataGenMapper.class);
      if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
      } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
      }

      // set up combiner
      if (numReducers != 0
          && aggInstructionsInReducer != null
          && !aggInstructionsInReducer.isEmpty()) job.setCombinerClass(GMRCombiner.class);

      // configure reducer
      job.setReducerClass(GMRReducer.class);
      // job.setReducerClass(PassThroughReducer.class);

      // By default, the job executes in "cluster" mode.
      // Determine if we can optimize and run it in "local" mode.
      MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
      for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
      }

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      runjob = JobClient.runJob(job);

      /* Process different counters */

      Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
      for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
      }

      String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
      stats = MapReduceTool.processDimsFiles(dir, stats);
      MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
      for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
  }
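  // The mapper-count heuristic above, isolated: estimate output size as
  // 8 bytes per cell times cells per block times number of blocks, scale by
  // the DFS block size, and clamp to [1, capacity]. Names are illustrative.
  static int numMappers(long maxBrlen, long maxBclen, long numBlocks,
      long dfsBlockSize, int capacity) {
    long bytes = 8L * maxBrlen * maxBclen * numBlocks;
    return Math.max(Math.min((int) (bytes / dfsBlockSize), capacity), 1);
  }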