public static MRInstruction parseSingleInstruction(MRINSTRUCTION_TYPE mrtype, String str)
    throws DMLUnsupportedOperationException, DMLRuntimeException
{
    if (str == null || str.isEmpty())
        return null;

    switch (mrtype) {
        case Aggregate:
            return AggregateInstruction.parseInstruction(str);

        case ArithmeticBinary: {
            String opcode = InstructionUtils.getOpCode(str);
            String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
            // extract the datatypes of the first and second input operands
            String dt1 = parts[1].split(Instruction.DATATYPE_PREFIX)[1].split(Instruction.VALUETYPE_PREFIX)[0];
            String dt2 = parts[2].split(Instruction.DATATYPE_PREFIX)[1].split(Instruction.VALUETYPE_PREFIX)[0];
            if (dt1.equalsIgnoreCase("SCALAR") || dt2.equalsIgnoreCase("SCALAR")) {
                return ScalarInstruction.parseInstruction(str);
            }
            else {
                if (BinaryM.isOpcode(opcode))
                    return BinaryMInstruction.parseInstruction(str);
                else
                    return BinaryInstruction.parseInstruction(str);
            }
        }

        case AggregateBinary:       return AggregateBinaryInstruction.parseInstruction(str);
        case AggregateUnary:        return AggregateUnaryInstruction.parseInstruction(str);
        case Ternary:               return TernaryInstruction.parseInstruction(str);
        case Quaternary:            return QuaternaryInstruction.parseInstruction(str);
        case Rand:                  return RandInstruction.parseInstruction(str);
        case Seq:                   return SeqInstruction.parseInstruction(str);
        case Reblock:               return ReblockInstruction.parseInstruction(str);
        case Append:                return AppendInstruction.parseInstruction(str);
        case Reorg:                 return ReorgInstruction.parseInstruction(str);
        case Replicate:             return ReplicateInstruction.parseInstruction(str);

        case Unary: {
            String opcode = InstructionUtils.getOpCode(str);
            String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
            if (parts.length == 4 && (opcode.equalsIgnoreCase("log") || opcode.equalsIgnoreCase("log_nz")))
                return ScalarInstruction.parseInstruction(str);
            else // default case
                return UnaryInstruction.parseInstruction(str);
        }

        case MMTSJ:                 return MMTSJMRInstruction.parseInstruction(str);
        case PMMJ:                  return PMMJMRInstruction.parseInstruction(str);
        case MapMultChain:          return MapMultChainInstruction.parseInstruction(str);
        case BinUaggChain:          return BinUaggChainInstruction.parseInstruction(str);
        case UaggOuterChain:        return UaggOuterChainInstruction.parseInstruction(str);
        case CombineTernary:        return CombineTernaryInstruction.parseInstruction(str);
        case CombineBinary:         return CombineBinaryInstruction.parseInstruction(str);
        case CombineUnary:          return CombineUnaryInstruction.parseInstruction(str);
        case PickByCount:           return PickByCountInstruction.parseInstruction(str);
        case CM_N_COV:              return CM_N_COVInstruction.parseInstruction(str);
        case GroupedAggregate:      return GroupedAggregateInstruction.parseInstruction(str);
        case MapGroupedAggregate:   return GroupedAggregateMInstruction.parseInstruction(str);
        case RangeReIndex:          return RangeBasedReIndexInstruction.parseInstruction(str);
        case ZeroOut:               return ZeroOutInstruction.parseInstruction(str);
        case MatrixReshape:         return MatrixReshapeMRInstruction.parseInstruction(str);

        case Sort: // workaround for the dummy MR sort instruction
            return SortMR.parseSortInstruction(str);

        case CSVReblock:            return CSVReblockInstruction.parseInstruction(str);
        case CSVWrite:              return CSVWriteInstruction.parseInstruction(str);
        case ParameterizedBuiltin:  return ParameterizedBuiltinMRInstruction.parseInstruction(str);
        case RemoveEmpty:           return RemoveEmptyMRInstruction.parseInstruction(str);
        case Partition:             return DataPartitionMRInstruction.parseInstruction(str);
        case CumsumAggregate:       return CumulativeAggregateInstruction.parseInstruction(str);
        case CumsumSplit:           return CumulativeSplitInstruction.parseInstruction(str);
        case CumsumOffset:          return CumulativeOffsetInstruction.parseInstruction(str);

        case INVALID:
        default:
            throw new DMLRuntimeException("Invalid MR Instruction Type: " + mrtype);
    }
}
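/*
 * Illustrative sketch (not part of the original source): the ArithmeticBinary
 * case above derives each operand's datatype token by splitting on
 * Instruction.DATATYPE_PREFIX and Instruction.VALUETYPE_PREFIX. Assuming a
 * hypothetical operand token "1·MATRIX·DOUBLE", where '·' stands in for both
 * prefix characters, the extraction reduces to:
 *
 *   String part = "1·MATRIX·DOUBLE";      // hypothetical operand token
 *   String dt = part.split("·")[1]        // ["1","MATRIX","DOUBLE"] -> "MATRIX"
 *                   .split("·")[0];       // -> "MATRIX"
 *
 * A "SCALAR" token on either input routes the instruction to ScalarInstruction;
 * otherwise BinaryM opcodes go to BinaryMInstruction and everything else to
 * BinaryInstruction.
 */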
/**
 * Starts a DataGen MapReduce job that materializes one or more random or sequence objects.
 *
 * @param inst MR job instruction
 * @param dataGenInstructions data generation (rand/seq) instruction for each object
 * @param instructionsInMapper instructions to execute in the mapper
 * @param aggInstructionsInReducer aggregate instructions to execute in the combiner/reducer
 * @param otherInstructionsInReducer instructions to execute in the reducer after aggregation
 * @param numReducers number of reducers
 * @param replication file replication factor
 * @param resultIndexes result indexes
 * @param dimsUnknownFilePrefix file prefix for unknown dimensions
 * @param outputs output file for each result
 * @param outputInfos output information for each result
 * @return matrix characteristics and success flag of the job
 * @throws Exception if an error occurred in the MapReduce phase
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
    throws Exception
{
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    // whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);

            // for object reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            // seed generation
            Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
            long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
            int nnzIx = 0;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                for (long c = 0; c < clens[i]; c += bclens[i]) {
                    long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
                    sb.append((r / brlens[i]) + 1);
                    sb.append(',');
                    sb.append((c / bclens[i]) + 1);
                    sb.append(',');
                    sb.append(curBlockRowSize);
                    sb.append(',');
                    sb.append(curBlockColSize);
                    sb.append(',');
                    sb.append(nnz[nnzIx++]);
                    sb.append(',');
                    sb.append(bigrand.nextLong());
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            }
            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        }
        else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; // always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            // handle the default increment of 1 (-1 for the special case from > to)
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

            // correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException("Unexpected error while processing sequence instruction. "
                        + "Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
            }
            else {
                rlens[i] = numrows;
            }
            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException("Unexpected error while processing sequence instruction. "
                    + "Number of columns (" + clens[i] + ") must be equal to 1.");
            else
                clens[i] = 1;

            FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
            PrintWriter pw = new PrintWriter(fsOut);
            StringBuilder sb = new StringBuilder();

            double temp = from;
            double block_from, block_to;
            for (long r = 0; r < rlens[i]; r += brlens[i]) {
                long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                // block (bid_i, bid_j) generates a sequence from the interval
                // [block_from, block_to] (inclusive of both end points)
                long bid_i = ((r / brlens[i]) + 1);
                long bid_j = 1;
                block_from = temp;
                block_to = temp + (curBlockRowSize - 1) * incr;
                temp = block_to + incr; // next block starts from here

                sb.append(bid_i);
                sb.append(',');
                sb.append(bid_j);
                sb.append(',');
                // block sizes need not be included when generating seq()
                sb.append(block_from);
                sb.append(',');
                sb.append(block_to);
                sb.append(',');
                sb.append(incr);
                pw.println(sb.toString());
                sb.setLength(0);
                numblocks++;
            }
            pw.close();
            fsOut.close();
            inputInfos[i] = InputInfo.TextCellInputInfo;
        }
        else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1); // remove the first ","
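    /*
     * Illustrative example (not part of the original source): for a rand input
     * of size 2500 x 2000 with 1000 x 1000 blocks, the loop above writes one
     * metadata line per block of the form
     *   blockRowIndex,blockColIndex,rows,cols,nnz,seed
     * e.g. (nnz and seed values are placeholders):
     *   1,1,1000,1000,<nnz>,<seed>
     *   ...
     *   3,2,500,1000,<nnz>,<seed>
     * For a seq input, each line instead carries the per-block interval:
     *   blockRowIndex,1,blockFrom,blockTo,incr
     */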
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        // set up the block sizes
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        // set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);

        // set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        // set up the rand instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        // set up unary instructions that will be performed in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        // set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        // set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        // set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        // set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        // set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        // determine the degree of parallelism (nmappers: 1 <= n <= capacity)
        // TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        // correct the max number of mappers on YARN clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmappers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmappers);

        // set up what matrices are needed to pass from the mapper to the reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr,
            instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr,
            instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes,
            mapoutputIndexes, false);
        stats = ret.stats;

        // set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MR job instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // update resultDimsUnknown based on the computed stats
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
            || instructionsInMapper.contains("groupedagg");

        // set up the multiple output files and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);

        // configure the mapper and the mapper output key-value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        // set up the combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        // configure the reducer
        job.setReducerClass(GMRReducer.class);
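        /*
         * Illustrative worked example (not part of the original source) for the
         * degree-of-parallelism heuristic above: with maxbrlen = maxbclen = 1000,
         * numblocks = 1024, and a 128 MB HDFS block size,
         *   8 * 1000 * 1000 * 1024 / 134217728 = 61,
         * so nmappers = max(min(61, capacity), 1), i.e. 61 mappers on a
         * sufficiently large cluster, while small jobs collapse to a single mapper.
         */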
        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        // set a unique working directory
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* process the different counters */
        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    }
    finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
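/*
 * Illustrative worked example (not part of the original source) for the Seq
 * branch of runJob: seq(1, 2500, 1) with a row block size of 1000 yields
 * numrows = 1 + floor((2500 - 1) / 1) = 2500 and three metadata lines
 *   1,1,1.0,1000.0,1.0
 *   2,1,1001.0,2000.0,1.0
 *   3,1,2001.0,2500.0,1.0
 * i.e. each block starts where the previous one ended plus one increment, and
 * the last block is truncated to the remaining 500 rows.
 */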