/** * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException */ @SuppressWarnings("deprecation") private void readBinaryBlockMatrixBlocksFromHDFS( Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen) throws IOException { MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); // set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files { // directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job); try { while (reader.next(key, value)) { int row_offset = (int) (key.getRowIndex() - 1) * brlen; int col_offset = (int) (key.getColumnIndex() - 1) * bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); // bound check per block if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) { throw new IOException( "Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } // copy block to result dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value))); } } finally { IOUtilFunctions.closeSilently(reader); } } }
/**
 * Configures this instance from the job configuration.
 *
 * <p>After delegating to the superclass, this loads the combine instructions, records the block
 * size (rows per block, columns per block) of every result index, and registers the output
 * indexes for the output of each combine instruction.
 *
 * @param job job configuration to read the settings from
 * @throws RuntimeException wrapping any exception raised while loading the combine instructions
 */
public void configure(JobConf job) {
  super.configure(job);

  // load the combine instructions; failures are rethrown unchecked with the cause attached
  try {
    comb_instructions = MRJobConfiguration.getCombineInstruction(job);
  } catch (Exception ex) {
    throw new RuntimeException(ex);
  }

  // remember the block dimensions of each result matrix
  for (int pos = 0; pos < resultIndexes.length; pos++) {
    MatrixCharacteristics dims =
        MRJobConfiguration.getMatrixCharacteristicsForOutput(job, resultIndexes[pos]);
    Pair<Integer, Integer> blockSize =
        new Pair<Integer, Integer>(dims.getRowsPerBlock(), dims.getColsPerBlock());
    outputBlockSizes.put(resultIndexes[pos], blockSize);
  }

  // map the output of every combine instruction to its output indexes
  for (MRInstruction instruction : comb_instructions) {
    outputIndexesMapping.put(instruction.output, getOutputIndexes(instruction.output));
  }
}
public static JobReturn runJob( MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String combineInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception { JobConf job; job = new JobConf(CombineMR.class); job.setJobName("Standalone-MR"); boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos); // whether use block representation or cell representation MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation); byte[] inputIndexes = new byte[inputs.length]; for (byte b = 0; b < inputs.length; b++) inputIndexes[b] = b; // set up the input files and their format information MRJobConfiguration.setUpMultipleInputs( job, inputIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL); // set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens); // set up the block size MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens); // set up unary instructions that will perform in the mapper MRJobConfiguration.setInstructionsInMapper(job, ""); // set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setAggregateInstructions(job, ""); // set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, ""); MRJobConfiguration.setCombineInstructions(job, combineInstructions); // set up the replication factor for the results job.setInt("dfs.replication", replication); // set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper( job, inputIndexes, null, null, combineInstructions, resultIndexes); // set up the multiple output files, and their format information 
MRJobConfiguration.setUpMultipleOutputs( job, resultIndexes, null, outputs, outputInfos, inBlockRepresentation); // configure mapper and the mapper output key value pairs job.setMapperClass(GMRMapper.class); job.setMapOutputKeyClass(MatrixIndexes.class); if (inBlockRepresentation) job.setMapOutputValueClass(TaggedMatrixBlock.class); else job.setMapOutputValueClass(TaggedMatrixCell.class); // configure reducer job.setReducerClass(InnerReducer.class); // job.setReducerClass(PassThroughReducer.class); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics( job, inputIndexes, null, null, null, combineInstructions, resultIndexes, mapoutputIndexes, false); MatrixCharacteristics[] stats = ret.stats; // set up the number of reducers MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats); // By default, the job executes in "cluster" mode. // Determine if we can optimize and run it in "local" mode. MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length]; for (int i = 0; i < inputs.length; i++) { inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]); } // set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); RunningJob runjob = JobClient.runJob(job); return new JobReturn(stats, runjob.isSuccessful()); }
/** * Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat- * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is * advantageous if the actual sequence files are larger than the file splits created by * informat.getSplits (which is usually aligned to the HDFS block size) because then there is * overhead for finding the actual split between our 1k-1k blocks. This case happens if the read * matrix was create by CP or when jobs directly write to large output files (e.g., parfor matrix * partitioning). * * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException * @throws DMLRuntimeException */ @SuppressWarnings("deprecation") private static void readBinaryBlockMatrixFromHDFS( Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { boolean sparse = dest.isInSparseFormat(); MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); // set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files { // directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job); try { // note: next(key, value) does not yet exploit the given serialization classes, record // reader does but is generally slower. 
while (reader.next(key, value)) { // empty block filter (skip entire block) if (value.isEmptyBlock(false)) continue; int row_offset = (int) (key.getRowIndex() - 1) * brlen; int col_offset = (int) (key.getColumnIndex() - 1) * bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); // bound check per block if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) { throw new IOException( "Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } // copy block to result if (sparse) { dest.appendToSparse(value, row_offset, col_offset); // note: append requires final sort } else { dest.copy( row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false); } } } finally { IOUtilFunctions.closeSilently(reader); } } if (sparse && clen > bclen) { // no need to sort if 1 column block since always sorted dest.sortSparseRows(); } }