@Override
public final void writeMatrixToHDFS(MatrixBlock src, String fname, long rlen, long clen, int brlen, int bclen, long nnz)
    throws IOException, DMLRuntimeException {
  // validity check matrix dimensions
  if (src.getNumRows() != rlen || src.getNumColumns() != clen) {
    throw new IOException("Matrix dimensions mismatch with metadata: "
        + src.getNumRows() + "x" + src.getNumColumns() + " vs " + rlen + "x" + clen + ".");
  }

  // prepare file access
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  FileSystem fs = FileSystem.get(job);
  Path path = new Path(fname);

  // if the file already exists on HDFS, remove it.
  MapReduceTool.deleteFileIfExistOnHDFS(fname);

  // core write (sequential/parallel)
  writeCSVMatrixToHDFS(path, job, fs, src, _props);

  IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
}
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
    throws IOException, DMLRuntimeException {
  // allocate output matrix block
  MatrixBlock ret = createOutputMatrixBlock(rlen, clen, (int) rlen, (int) clen, estnnz, true, false);

  // prepare file access
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  FileSystem fs = FileSystem.get(job);
  Path path = new Path(fname);

  // check existence and non-empty file
  checkValidInputFile(fs, path);

  // core read
  readBinaryCellMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen);

  // finally check if change of sparse/dense block representation required
  // (nnz maintained via append during read for both dense/sparse)
  ret.examSparsity();

  return ret;
}
@Override
protected ExecType optFindExecType() throws HopsException {
  checkAndSetForcedPlatform();

  ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;

  if (_etypeForced != null) {
    _etype = _etypeForced;
  }
  else {
    if (OptimizerUtils.isMemoryBasedOptLevel()) {
      _etype = findExecTypeByMemEstimate();
    }
    // Choose CP, if the input dimensions are below threshold or if the input is a vector
    // Also, matrix inverse is currently implemented only in CP (through commons math)
    else if (getInput().get(0).areDimsBelowThreshold()
        || getInput().get(0).isVector()
        || isInMemoryOperation()) {
      _etype = ExecType.CP;
    }
    else {
      _etype = REMOTE;
    }

    // check for valid CP dimensions and matrix size
    checkAndSetInvalidCPDimsAndSize();
  }

  // spark-specific decision refinement (execute unary w/ spark input and
  // single parent also in spark because it's likely cheap and reduces intermediates)
  if (_etype == ExecType.CP && _etypeForced != ExecType.CP
      && getInput().get(0).optFindExecType() == ExecType.SPARK
      && getDataType().isMatrix()
      && !isCumulativeUnaryOperation() && !isCastUnaryOperation()
      && _op != OpOp1.MEDIAN && _op != OpOp1.IQM
      && !(getInput().get(0) instanceof DataOp)      // input is not checkpoint
      && getInput().get(0).getParent().size() == 1)  // unary is only parent
  {
    // pull unary operation into spark
    _etype = ExecType.SPARK;
  }

  // mark for recompile (forever)
  if (ConfigurationManager.isDynamicRecompilation() && !dimsKnown(true) && _etype == REMOTE)
    setRequiresRecompile();

  // ensure cp exec type for single-node operations
  if (_op == OpOp1.PRINT || _op == OpOp1.STOP
      || _op == OpOp1.INVERSE || _op == OpOp1.EIGEN || _op == OpOp1.CHOLESKY) {
    _etype = ExecType.CP;
  }

  return _etype;
}
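/*
 * Minimal, self-contained sketch of the first stage of the decision above (forced platform,
 * memory-based choice, threshold/vector/in-memory fallback). The class and field names
 * (ExecPlanner, DecisionInput) are hypothetical and only illustrate the control flow; they are
 * not part of the code base, and the spark-refinement and recompile steps are omitted.
 */
public class ExecPlanner {

  enum ExecType { CP, SPARK, MR }

  /** Stand-in for the properties the hop inspects on itself and its input. */
  static class DecisionInput {
    ExecType forced;            // non-null if an execution type is forced
    boolean memoryBasedOpt;     // memory-based optimization level enabled
    boolean fitsInMemory;       // outcome of the memory-estimate check
    boolean dimsBelowThreshold; // input dimensions below the CP threshold
    boolean inputIsVector;      // input is a vector
    boolean inMemoryOperation;  // CP-only operation (e.g., inverse via commons math)
    boolean sparkMode;          // spark (vs. mr) execution mode
  }

  static ExecType choose(DecisionInput in) {
    ExecType remote = in.sparkMode ? ExecType.SPARK : ExecType.MR;
    if (in.forced != null)
      return in.forced;                              // forced platform wins
    if (in.memoryBasedOpt)
      return in.fitsInMemory ? ExecType.CP : remote; // memory-estimate based decision
    if (in.dimsBelowThreshold || in.inputIsVector || in.inMemoryOperation)
      return ExecType.CP;                            // small or CP-only operations
    return remote;
  }

  public static void main(String[] args) {
    DecisionInput in = new DecisionInput();
    in.sparkMode = true;
    in.dimsBelowThreshold = true;
    System.out.println(choose(in)); // prints CP
  }
}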
@Override
public final void writeEmptyMatrixToHDFS(String fname, long rlen, long clen, int brlen, int bclen)
    throws IOException, DMLRuntimeException {
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  FileSystem fs = FileSystem.get(job);
  Path path = new Path(fname);

  MatrixBlock src = new MatrixBlock((int) rlen, 1, true);
  writeCSVMatrixToHDFS(path, job, fs, src, _props);

  IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
}
/**
 * @param schema      value types of the frame columns
 * @param containsID  whether the data frame contains an ID column
 * @param dense       whether to generate dense or sparse input data
 * @param unknownDims whether to pass unknown matrix characteristics
 */
private void testDataFrameConversion(ValueType[] schema, boolean containsID, boolean dense, boolean unknownDims) {
  boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
  RUNTIME_PLATFORM oldPlatform = DMLScript.rtplatform;
  SparkExecutionContext sec = null;

  try {
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

    // generate input data and setup metadata
    int cols = schema.length + colsVector - 1;
    double sparsity = dense ? sparsity1 : sparsity2;
    double[][] A = TestUtils.round(getRandomMatrix(rows1, cols, -10, 1000, sparsity, 2373));
    MatrixBlock mbA = DataConverter.convertToMatrixBlock(A);
    int blksz = ConfigurationManager.getBlocksize();
    MatrixCharacteristics mc1 = new MatrixCharacteristics(rows1, cols, blksz, blksz, mbA.getNonZeros());
    MatrixCharacteristics mc2 = unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mc1);

    // setup spark context
    sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    SQLContext sqlctx = new SQLContext(sc);

    // create input data frame
    DataFrame df = createDataFrame(sqlctx, mbA, containsID, schema);

    // dataframe - frame conversion
    JavaPairRDD<Long, FrameBlock> out = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc2, containsID);

    // get output frame block
    FrameBlock fbB = SparkExecutionContext.toFrameBlock(
        out, UtilFunctions.nCopies(cols, ValueType.DOUBLE), rows1, cols);

    // compare frame blocks
    MatrixBlock mbB = DataConverter.convertToMatrixBlock(fbB);
    double[][] B = DataConverter.convertToDoubleMatrix(mbB);
    TestUtils.compareMatrices(A, B, rows1, cols, eps);
  }
  catch (Exception ex) {
    throw new RuntimeException(ex);
  }
  finally {
    if (sec != null)
      sec.close();
    DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    DMLScript.rtplatform = oldPlatform;
  }
}
private static void writeCleanupTasksToFile(Path path, int numTasks)
    throws DMLRuntimeException, IOException {
  BufferedWriter br = null;
  try {
    FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
    br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

    for (int i = 1; i <= numTasks; i++)
      br.write("CLEANUP TASK " + i + "\n");
  }
  catch (Exception ex) {
    throw new DMLRuntimeException("Error writing cleanup tasks to taskfile " + path.toString(), ex);
  }
  finally {
    if (br != null)
      br.close();
  }
}
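/*
 * Minimal local-filesystem sketch (java.nio, illustrative only) of the task-file layout produced
 * above: one "CLEANUP TASK i" record per line, so that a line-oriented input format can assign
 * one task per map task. The file name is hypothetical and not part of the code base.
 */
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class CleanupTaskFileSketch {
  public static void main(String[] args) throws IOException {
    int numTasks = 4;
    List<String> lines = new ArrayList<>();
    for (int i = 1; i <= numTasks; i++)
      lines.add("CLEANUP TASK " + i);
    // each line becomes one unit of work for a line-oriented input format
    Files.write(Paths.get("cleanup_tasks.txt"), lines, StandardCharsets.UTF_8);
  }
}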
@Override
protected ExecType optFindExecType() throws HopsException {
  checkAndSetForcedPlatform();

  ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;

  // forced / memory-based / threshold-based decision
  if (_etypeForced != null) {
    _etype = _etypeForced;
  }
  else {
    if (OptimizerUtils.isMemoryBasedOptLevel()) {
      _etype = findExecTypeByMemEstimate();
    }
    // Choose CP, if the input dimensions are below threshold or if the input is a vector
    else if (getInput().get(0).areDimsBelowThreshold() || getInput().get(0).isVector()) {
      _etype = ExecType.CP;
    }
    else {
      _etype = REMOTE;
    }

    // check for valid CP dimensions and matrix size
    checkAndSetInvalidCPDimsAndSize();
  }

  // spark-specific decision refinement (execute unary aggregate w/ spark input and
  // single parent also in spark because it's likely cheap and reduces data transfer)
  if (_etype == ExecType.CP && _etypeForced != ExecType.CP
      && !(getInput().get(0) instanceof DataOp)     // input is not checkpoint
      && getInput().get(0).getParent().size() == 1  // uagg is only parent
      && getInput().get(0).optFindExecType() == ExecType.SPARK) {
    // pull unary aggregate into spark
    _etype = ExecType.SPARK;
  }

  // mark for recompile (forever)
  if (ConfigurationManager.isDynamicRecompilation() && !dimsKnown(true) && _etype == REMOTE) {
    setRequiresRecompile();
  }

  return _etype;
}
/**
 * Prepends a header line to a CSV file (or directory of CSV part files) on HDFS.
 *
 * @param srcFileName  source file or directory on HDFS
 * @param destFileName destination file on HDFS
 * @param rlen         number of rows
 * @param clen         number of columns
 * @throws IOException if an I/O error occurs
 */
@SuppressWarnings("unchecked")
public final void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen)
    throws IOException {
  Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

  Path srcFilePath = new Path(srcFileName);
  Path destFilePath = new Path(destFileName);
  FileSystem hdfs = FileSystem.get(conf);

  if (!_props.hasHeader()) {
    // simply move srcFile to destFile
    /*
     * TODO: Remove this roundabout way!
     * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
     * & the only path that exists already on HDFS is /user/biadmin/csv/.
     * In this case: the directory structure /user/biadmin/csv/temp/out must be created.
     * Simple hdfs.rename() does not seem to create this directory structure.
     */

    // delete the destination file, if exists already
    // boolean ret1 = hdfs.delete(destFilePath, true);

    // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
    // boolean ret2 = hdfs.createNewFile(destFilePath);

    // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
    // boolean ret3 = hdfs.delete(destFilePath, true);

    // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
    // boolean ret4 = hdfs.rename(srcFilePath, destFilePath);

    // System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3 + ", rename:" + ret4);

    return;
  }

  // construct the header line
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < clen; i++) {
    sb.append("C" + (i + 1));
    if (i < clen - 1)
      sb.append(_props.getDelim());
  }
  sb.append('\n');

  if (hdfs.isDirectory(srcFilePath)) {
    // compute sorted order among part files
    ArrayList<Path> files = new ArrayList<Path>();
    for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
      files.add(stat.getPath());
    Collections.sort(files);

    // first part file path
    Path firstpart = files.get(0);

    // create a temp file, and add header and contents of first part
    Path tmp = new Path(firstpart.toString() + ".tmp");
    OutputStream out = hdfs.create(tmp, true);
    out.write(sb.toString().getBytes());
    sb.setLength(0);

    // copy rest of the data from firstpart
    InputStream in = null;
    try {
      in = hdfs.open(firstpart);
      IOUtils.copyBytes(in, out, conf, true);
    }
    finally {
      IOUtilFunctions.closeSilently(in);
      IOUtilFunctions.closeSilently(out);
    }

    // rename tmp to firstpart
    hdfs.delete(firstpart, true);
    hdfs.rename(tmp, firstpart);

    // rename srcfile to destFile
    hdfs.delete(destFilePath, true);
    hdfs.createNewFile(destFilePath);       // force the creation of directory structure
    hdfs.delete(destFilePath, true);        // delete the file, but preserve the directory structure
    hdfs.rename(srcFilePath, destFilePath); // move the data
  }
  else if (hdfs.isFile(srcFilePath)) {
    // create destination file
    OutputStream out = hdfs.create(destFilePath, true);

    // write header
    out.write(sb.toString().getBytes());
    sb.setLength(0);

    // copy the data from srcFile
    InputStream in = null;
    try {
      in = hdfs.open(srcFilePath);
      IOUtils.copyBytes(in, out, conf, true);
    }
    finally {
      IOUtilFunctions.closeSilently(in);
      IOUtilFunctions.closeSilently(out);
    }
  }
  else {
    throw new IOException(srcFilePath.toString() + ": No such file or directory");
  }
}
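/*
 * Minimal local-filesystem sketch (java.nio, illustrative only) of the same header-prepend
 * pattern: build a "C1,C2,...,Cn" header, write it to a temp file, append the original contents,
 * and replace the original. File names and the column count are hypothetical.
 */
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;

public class PrependCsvHeaderSketch {
  public static void main(String[] args) throws IOException {
    Path src = Paths.get("data.csv"); // hypothetical input file
    int clen = 3;                     // number of columns
    String delim = ",";

    // construct the header line: C1,C2,...,Cn
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append("C").append(i + 1);
      if (i < clen - 1)
        sb.append(delim);
    }
    sb.append('\n');

    // write header + original contents into a temp file, then replace the original
    Path tmp = Paths.get(src.toString() + ".tmp");
    Files.write(tmp, sb.toString().getBytes(StandardCharsets.UTF_8));
    Files.write(tmp, Files.readAllBytes(src), StandardOpenOption.APPEND);
    Files.move(tmp, src, StandardCopyOption.REPLACE_EXISTING);
  }
}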
/**
 * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS. The part files
 * are created by the CSV_WRITE MR job.
 *
 * <p>This method is invoked from the CP-write instruction.
 *
 * @param srcFileName  source directory (or file) of CSV part files on HDFS
 * @param destFileName destination of the merged CSV file on HDFS
 * @param csvprop      CSV format properties (header, delimiter)
 * @param rlen         number of rows
 * @param clen         number of columns
 * @throws IOException if an I/O error occurs
 */
public final void mergeCSVPartFiles(String srcFileName, String destFileName,
    CSVFileFormatProperties csvprop, long rlen, long clen) throws IOException {
  Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

  Path srcFilePath = new Path(srcFileName);
  Path mergedFilePath = new Path(destFileName);
  FileSystem hdfs = FileSystem.get(conf);

  if (hdfs.exists(mergedFilePath)) {
    hdfs.delete(mergedFilePath, true);
  }
  OutputStream out = hdfs.create(mergedFilePath, true);

  // write out the header, if needed
  if (csvprop.hasHeader()) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append("C" + (i + 1));
      if (i < clen - 1)
        sb.append(csvprop.getDelim());
    }
    sb.append('\n');
    out.write(sb.toString().getBytes());
    sb.setLength(0);
  }

  // if the source is a directory
  if (hdfs.isDirectory(srcFilePath)) {
    try {
      // collect part files only (skip subdirectories) and merge them in sorted order
      ArrayList<Path> partPaths = new ArrayList<Path>();
      for (FileStatus stat : hdfs.listStatus(srcFilePath))
        if (!stat.isDirectory())
          partPaths.add(stat.getPath());
      Collections.sort(partPaths);

      int numPartFiles = partPaths.size();
      for (int i = 0; i < numPartFiles; i++) {
        InputStream in = hdfs.open(partPaths.get(i));
        try {
          IOUtils.copyBytes(in, out, conf, false);
          if (i < numPartFiles - 1)
            out.write('\n');
        }
        finally {
          IOUtilFunctions.closeSilently(in);
        }
      }
    }
    finally {
      IOUtilFunctions.closeSilently(out);
    }
  }
  else if (hdfs.isFile(srcFilePath)) {
    InputStream in = null;
    try {
      in = hdfs.open(srcFilePath);
      IOUtils.copyBytes(in, out, conf, true);
    }
    finally {
      IOUtilFunctions.closeSilently(in);
      IOUtilFunctions.closeSilently(out);
    }
  }
  else {
    throw new IOException(srcFilePath.toString() + ": No such file or directory");
  }
}
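/*
 * Local-filesystem sketch (java.nio, illustrative only) of the same merge pattern: optionally
 * write a header, then concatenate all part files of a directory in sorted name order into a
 * single output file. Paths and the column count are hypothetical.
 */
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Stream;

public class MergeCsvPartsSketch {
  public static void main(String[] args) throws IOException {
    Path srcDir = Paths.get("csv_parts"); // hypothetical directory of part files
    Path merged = Paths.get("merged.csv");
    int clen = 3;
    boolean hasHeader = true;

    try (OutputStream out = Files.newOutputStream(merged)) {
      if (hasHeader) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < clen; i++)
          sb.append("C").append(i + 1).append(i < clen - 1 ? "," : "\n");
        out.write(sb.toString().getBytes(StandardCharsets.UTF_8));
      }

      // collect regular files and merge them in sorted order
      List<Path> parts = new ArrayList<>();
      try (Stream<Path> entries = Files.list(srcDir)) {
        entries.filter(Files::isRegularFile).forEach(parts::add);
      }
      Collections.sort(parts);

      for (int i = 0; i < parts.size(); i++) {
        Files.copy(parts.get(i), out); // append part contents
        if (i < parts.size() - 1)
          out.write('\n');             // separate records across part files
      }
    }
  }
}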
/**
 * @param pfid                 parfor program block id
 * @param program              serialized CP program blocks to be executed in the mappers
 * @param taskFile             HDFS file with one parfor task per line
 * @param resultFile           HDFS output file for serialized result variables
 * @param colocatedDPMatrixObj colocated data-partitioned matrix object (or null)
 * @param enableCPCaching      enable/disable CP caching in the mappers
 * @param numMappers           number of map tasks
 * @param replication          file replication factor
 * @param max_retry            max number of retries per map task
 * @param minMem               minimum JVM memory per task (ignored if non-positive)
 * @param jvmReuse             enable/disable JVM reuse across tasks
 * @return remote parfor job return (success flag, task/iteration counts, result variables)
 * @throws DMLRuntimeException if the MR job fails
 */
public static RemoteParForJobReturn runJob(
    long pfid, String program, String taskFile, String resultFile,
    MatrixObject colocatedDPMatrixObj, // inputs
    boolean enableCPCaching, int numMappers, int replication, int max_retry,
    long minMem, boolean jvmReuse)     // opt params
    throws DMLRuntimeException {
  RemoteParForJobReturn ret = null;
  String jobname = "ParFor-EMR";
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

  JobConf job;
  job = new JobConf(RemoteParForMR.class);
  job.setJobName(jobname + pfid);

  // maintain dml script counters
  Statistics.incrementNoOfCompiledMRJobs();

  try {
    /////
    // configure the MR job

    // set arbitrary CP program blocks that will be executed in the mapper
    MRJobConfiguration.setProgramBlocks(job, program);

    // enable/disable caching
    MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

    // set mappers, reducers, combiners
    job.setMapperClass(RemoteParWorkerMapper.class); // map-only

    // set input format (one split per row, NLineInputFormat default N=1)
    if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
      job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
      MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
      MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
      MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
      MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
      MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
    }
    else { // default case
      job.setInputFormat(NLineInputFormat.class);
    }

    // set the input path and output path
    FileInputFormat.setInputPaths(job, new Path(taskFile));

    // set output format
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // set output path
    MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
    FileOutputFormat.setOutputPath(job, new Path(resultFile));

    // set the output key, value schema
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    //////
    // set optimization parameters

    // set the number of mappers and reducers
    job.setNumMapTasks(numMappers); // numMappers
    job.setNumReduceTasks(0);
    // job.setInt("mapred.map.tasks.maximum", 1); //system property
    // job.setInt("mapred.tasktracker.tasks.maximum", 1); //system property
    // job.setInt("mapred.jobtracker.maxtasks.per.job", 1); //system property

    // use FLEX scheduler configuration properties
    if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
      job.setInt("flex.priority", 0); // highest
      job.setInt("flex.map.min", 0);
      job.setInt("flex.map.max", numMappers);
      job.setInt("flex.reduce.min", 0);
      job.setInt("flex.reduce.max", numMappers);
    }

    // set jvm memory size (if required)
    String memKey = "mapred.child.java.opts";
    if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
      InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
      LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
    }

    // disable automatic task timeouts and speculative task execution
    job.setInt("mapred.task.timeout", 0);
    job.setMapSpeculativeExecution(false);

    // set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

    // enable the reuse of JVMs (multiple tasks per MR task)
    if (jvmReuse)
      job.setNumTasksToExecutePerJvm(-1); // unlimited

    // set sort io buffer (reduce unnecessarily large io buffer, guaranteed memory consumption)
    job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); // 8MB

    // set the replication factor for the results
    job.setInt("dfs.replication", replication);

    // set the max number of retries per map task
    // disabled job-level configuration to respect cluster configuration
    // note: this refers to hadoop2, hence it never had effect on mr1
    // job.setInt("mapreduce.map.maxattempts", max_retry);

    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    /////
    // execute the MR job
    RunningJob runjob = JobClient.runJob(job);

    // process different counters
    Statistics.incrementNoOfExecutedMRJobs();
    Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
    int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
    int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
      Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
      Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
      Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
      Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
      CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
      CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
      CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
      CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
      CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
      CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
      CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
      CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
      CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
      CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
      CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
    }

    // read all files of result variables and prepare for return
    LocalVariableMap[] results = readResultFile(job, resultFile);

    ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
  }
  catch (Exception ex) {
    throw new DMLRuntimeException(ex);
  }
  finally {
    // remove created files
    try {
      MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
      MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
    }
    catch (IOException ex) {
      throw new DMLRuntimeException(ex);
    }
  }

  if (DMLScript.STATISTICS) {
    long t1 = System.nanoTime();
    Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
  }

  return ret;
}
/**
 * Starts a Rand MapReduce job which will produce one or more random (or sequence) objects.
 *
 * @param inst                       MR job instruction
 * @param dataGenInstructions        data generation instruction, one per output object
 * @param instructionsInMapper       instructions to be executed in the mapper
 * @param aggInstructionsInReducer   aggregate instructions for the combiner and reducer
 * @param otherInstructionsInReducer instructions executed in the reducer after aggregation
 * @param numReducers                number of reducers
 * @param replication                file replication factor
 * @param resultIndexes              result indexes, one per output object
 * @param dimsUnknownFilePrefix      file prefix for outputs with unknown dimensions
 * @param outputs                    output file, one per output object
 * @param outputInfos                output information, one per output object
 * @return matrix characteristics for each generated object
 * @throws Exception if an error occurred in the MapReduce phase
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
    String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
    byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
    throws Exception {
  JobConf job = new JobConf(DataGenMR.class);
  job.setJobName("DataGen-MR");

  // whether to use block representation or cell representation
  MRJobConfiguration.setMatrixValueClass(job, true);

  byte[] realIndexes = new byte[dataGenInstructions.length];
  for (byte b = 0; b < realIndexes.length; b++)
    realIndexes[b] = b;

  String[] inputs = new String[dataGenInstructions.length];
  InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
  long[] rlens = new long[dataGenInstructions.length];
  long[] clens = new long[dataGenInstructions.length];
  int[] brlens = new int[dataGenInstructions.length];
  int[] bclens = new int[dataGenInstructions.length];

  FileSystem fs = FileSystem.get(job);
  String dataGenInsStr = "";
  int numblocks = 0;
  int maxbrlen = -1, maxbclen = -1;
  double maxsparsity = -1;

  for (int i = 0; i < dataGenInstructions.length; i++) {
    dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

    MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
    MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
    DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

    rlens[i] = genInst.getRows();
    clens[i] = genInst.getCols();
    brlens[i] = genInst.getRowsInBlock();
    bclens[i] = genInst.getColsInBlock();

    maxbrlen = Math.max(maxbrlen, brlens[i]);
    maxbclen = Math.max(maxbclen, bclens[i]);

    if (mrtype == MRINSTRUCTION_TYPE.Rand) {
      RandInstruction randInst = (RandInstruction) mrins;
      inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
      maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

      FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
      PrintWriter pw = new PrintWriter(fsOut);

      // for obj reuse and preventing repeated buffer re-allocations
      StringBuilder sb = new StringBuilder();

      // seed generation
      Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
      long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
      int nnzIx = 0;
      for (long r = 0; r < rlens[i]; r += brlens[i]) {
        long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
        for (long c = 0; c < clens[i]; c += bclens[i]) {
          long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

          sb.append((r / brlens[i]) + 1);
          sb.append(',');
          sb.append((c / bclens[i]) + 1);
          sb.append(',');
          sb.append(curBlockRowSize);
          sb.append(',');
          sb.append(curBlockColSize);
          sb.append(',');
          sb.append(nnz[nnzIx++]);
          sb.append(',');
          sb.append(bigrand.nextLong());
          pw.println(sb.toString());
          sb.setLength(0);
          numblocks++;
        }
      }
      pw.close();
      fsOut.close();
      inputInfos[i] = InputInfo.TextCellInputInfo;
    }
    else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
      SeqInstruction seqInst = (SeqInstruction) mrins;
      inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
      maxsparsity = 1.0; // always dense

      double from = seqInst.fromValue;
      double to = seqInst.toValue;
      double incr = seqInst.incrValue;

      // handle default 1 to -1 for special case of from>to
      incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

      // correctness checks on (from, to, incr)
      boolean neg = (from > to);
      if (incr == 0)
        throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
      if (neg != (incr < 0))
        throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

      // compute the number of rows in the sequence
      long numrows = 1 + (long) Math.floor((to - from) / incr);
      if (rlens[i] > 0) {
        if (numrows != rlens[i])
          throw new DMLRuntimeException("Unexpected error while processing sequence instruction. "
              + "Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
      }
      else {
        rlens[i] = numrows;
      }
      if (clens[i] > 0 && clens[i] != 1)
        throw new DMLRuntimeException("Unexpected error while processing sequence instruction. "
            + "Number of columns (" + clens[i] + ") must be equal to 1.");
      else
        clens[i] = 1;

      FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
      PrintWriter pw = new PrintWriter(fsOut);
      StringBuilder sb = new StringBuilder();

      double temp = from;
      double block_from, block_to;
      for (long r = 0; r < rlens[i]; r += brlens[i]) {
        long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

        // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to]
        // (inclusive of both end points of the interval)
        long bid_i = ((r / brlens[i]) + 1);
        long bid_j = 1;
        block_from = temp;
        block_to = temp + (curBlockRowSize - 1) * incr;
        temp = block_to + incr; // next block starts from here

        sb.append(bid_i);
        sb.append(',');
        sb.append(bid_j);
        sb.append(',');
        /* // Need not include block size while generating seq()
        sb.append(curBlockRowSize);
        sb.append(',');
        sb.append(1);
        sb.append(','); */
        sb.append(block_from);
        sb.append(',');
        sb.append(block_to);
        sb.append(',');
        sb.append(incr);

        pw.println(sb.toString());
        // System.out.println("MapTask " + r + ": " + sb.toString());
        sb.setLength(0);
        numblocks++;
      }
      pw.close();
      fsOut.close();
      inputInfos[i] = InputInfo.TextCellInputInfo;
    }
    else {
      throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
    }
  }
  dataGenInsStr = dataGenInsStr.substring(1); // remove the leading delimiter

  RunningJob runjob;
  MatrixCharacteristics[] stats;
  try {
    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    // set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);

    // set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    // set up the rand instructions
    MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

    // set up unary instructions that will be performed in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    // set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    // set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    // set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

    // set up custom map/reduce configurations
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    // determine degree of parallelism (nmappers: 1<=n<=capacity)
    // TODO use maxsparsity whenever we have a way of generating sparse rand data
    int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
    long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
    // correction of max number of mappers on yarn clusters
    if (InfrastructureAnalyzer.isYarnEnabled())
      capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
    int nmappers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
    job.setNumMapTasks(nmappers);

    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr,
        instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr,
        instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes,
        mapoutputIndexes, false);
    stats = ret.stats;

    // set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // print the complete MRJob instruction
    if (LOG.isTraceEnabled())
      inst.printCompleteMRJobInstruction(stats);

    // update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
      if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
        resultDimsUnknown[i] = (byte) 1;
      }
      else {
        resultDimsUnknown[i] = (byte) 0;
      }
    }

    boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
        || instructionsInMapper.contains("groupedagg");

    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(DataGenMapper.class);
    if (numReducers == 0) {
      job.setMapOutputKeyClass(Writable.class);
      job.setMapOutputValueClass(Writable.class);
    }
    else {
      job.setMapOutputKeyClass(MatrixIndexes.class);
      job.setMapOutputValueClass(TaggedMatrixBlock.class);
    }

    // set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
      job.setCombinerClass(GMRCombiner.class);

    // configure reducer
    job.setReducerClass(GMRReducer.class);
    // job.setReducerClass(PassThroughReducer.class);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
      inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    runjob = JobClient.runJob(job);

    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
      // number of non-zeros
      stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
  }
  finally {
    for (String input : inputs)
      MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
  }

  return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
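/*
 * Illustrative, self-contained sketch of the per-block control records the Rand branch above
 * writes to the text-cell seed file: one line per block in the form
 * "blockRowIndex,blockColIndex,blockRows,blockCols,nnz,seed". The dimensions, sparsity, and the
 * java.util.Random-based seed/nnz generation are simplified stand-ins for the actual
 * Well1024a/LibMatrixDatagen logic.
 */
import java.util.Random;

public class RandSeedRecordSketch {
  public static void main(String[] args) {
    long rlen = 2500, clen = 1500;  // matrix dimensions (hypothetical)
    int brlen = 1000, bclen = 1000; // block dimensions
    double sparsity = 0.1;
    Random seedGen = new Random(7);

    StringBuilder sb = new StringBuilder();
    for (long r = 0; r < rlen; r += brlen) {
      long curBlockRowSize = Math.min(brlen, rlen - r);
      for (long c = 0; c < clen; c += bclen) {
        long curBlockColSize = Math.min(bclen, clen - c);
        long nnz = Math.round(curBlockRowSize * curBlockColSize * sparsity); // simplified nnz estimate

        sb.append((r / brlen) + 1).append(','); // 1-based block row index
        sb.append((c / bclen) + 1).append(','); // 1-based block column index
        sb.append(curBlockRowSize).append(',');
        sb.append(curBlockColSize).append(',');
        sb.append(nnz).append(',');
        sb.append(seedGen.nextLong());          // per-block seed
        System.out.println(sb);
        sb.setLength(0);
      }
    }
  }
}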