/** * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException */ @SuppressWarnings("deprecation") private void readBinaryBlockMatrixBlocksFromHDFS( Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen) throws IOException { MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); // set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files { // directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job); try { while (reader.next(key, value)) { int row_offset = (int) (key.getRowIndex() - 1) * brlen; int col_offset = (int) (key.getColumnIndex() - 1) * bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); // bound check per block if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) { throw new IOException( "Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } // copy block to result dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value))); } } finally { IOUtilFunctions.closeSilently(reader); } } }
// output the records in the outCache. public void close() throws IOException { long start = System.currentTimeMillis(); Iterator<Entry<MatrixIndexes, MatrixValue>> it = outCache.entrySet().iterator(); while (it.hasNext()) { Entry<MatrixIndexes, MatrixValue> entry = it.next(); realWriteToCollector(entry.getKey(), entry.getValue()); } // handle empty block output (on first reduce task only) if (outputDummyRecords) // required for rejecting empty blocks in mappers { long rlen = dim1.getRows(); long clen = dim2.getCols(); int brlen = dim1.getRowsPerBlock(); int bclen = dim2.getColsPerBlock(); MatrixIndexes tmpIx = new MatrixIndexes(); MatrixBlock tmpVal = new MatrixBlock(); for (long i = 0, r = 1; i < rlen; i += brlen, r++) for (long j = 0, c = 1; j < clen; j += bclen, c++) { int realBrlen = (int) Math.min((long) brlen, rlen - (r - 1) * brlen); int realBclen = (int) Math.min((long) bclen, clen - (c - 1) * bclen); tmpIx.setIndexes(r, c); tmpVal.reset(realBrlen, realBclen); collectFinalMultipleOutputs.collectOutput(tmpIx, tmpVal, 0, cachedReporter); } } if (cachedReporter != null) cachedReporter.incrCounter( Counters.COMBINE_OR_REDUCE_TIME, System.currentTimeMillis() - start); super.close(); }
private void processBinaryCombineInstruction(CombineBinaryInstruction ins, Reporter reporter) throws IOException { IndexedMatrixValue in1 = cachedValues.getFirst(ins.input1); IndexedMatrixValue in2 = cachedValues.getFirst(ins.input2); if (in1 == null && in2 == null) return; MatrixIndexes indexes; if (in1 != null) indexes = in1.getIndexes(); else indexes = in2.getIndexes(); // if one of the inputs is null, then it is a all zero block if (in1 == null) { in1 = zeroInput; in1.getValue().reset(in2.getValue().getNumRows(), in2.getValue().getNumColumns()); } if (in2 == null) { in2 = zeroInput; in2.getValue().reset(in1.getValue().getNumRows(), in1.getValue().getNumColumns()); } // System.out.println("in1:"+in1); // System.out.println("in2:"+in2); // process instruction try { /*in1.getValue().combineOperations(in2.getValue(), collectFinalMultipleOutputs, reporter, keyBuff, valueBuff, getOutputIndexes(ins.output));*/ ArrayList<Integer> outputIndexes = outputIndexesMapping.get(ins.output); for (int r = 0; r < in1.getValue().getNumRows(); r++) for (int c = 0; c < in1.getValue().getNumColumns(); c++) { Pair<Integer, Integer> blockSize = outputBlockSizes.get(ins.output); keyBuff.setIndexes( UtilFunctions.cellIndexCalculation(indexes.getRowIndex(), blockSize.getKey(), r), UtilFunctions.cellIndexCalculation( indexes.getColumnIndex(), blockSize.getValue(), c)); valueBuff.setValue(in1.getValue().getValue(r, c)); double temp = in2.getValue().getValue(r, c); if (ins.isSecondInputWeight()) { valueBuff.setWeight(temp); valueBuff.setOtherValue(0); } else { valueBuff.setWeight(1); valueBuff.setOtherValue(temp); } for (int i : outputIndexes) { collectFinalMultipleOutputs.collectOutput(keyBuff, valueBuff, i, reporter); // System.out.println("output: "+keyBuff+" -- "+valueBuff); } } } catch (Exception e) { throw new RuntimeException(e); } }
@Override protected Tuple2<MatrixIndexes, MatrixBlock> computeNext(Tuple2<MatrixIndexes, MatrixBlock> arg) throws Exception { // unpack partition key-value pairs MatrixIndexes ix = arg._1(); MatrixBlock in1 = arg._2(); // get the rhs block int rix = (int) ((_vtype == VectorType.COL_VECTOR) ? ix.getRowIndex() : 1); int cix = (int) ((_vtype == VectorType.COL_VECTOR) ? 1 : ix.getColumnIndex()); MatrixBlock in2 = _pmV.getMatrixBlock(rix, cix); // execute the binary operation MatrixBlock ret = (MatrixBlock) (in1.binaryOperations(_op, in2, new MatrixBlock())); return new Tuple2<MatrixIndexes, MatrixBlock>(ix, ret); }
private void processTernaryCombineInstruction(CombineTernaryInstruction ins, Reporter reporter) throws IOException { IndexedMatrixValue in1 = cachedValues.getFirst(ins.input1); IndexedMatrixValue in2 = cachedValues.getFirst(ins.input2); IndexedMatrixValue in3 = cachedValues.getFirst(ins.input3); if (in1 == null && in2 == null && in3 == null) return; int nr = 0, nc = 0; if (in1 != null) { nr = in1.getValue().getNumRows(); nc = in1.getValue().getNumColumns(); } else if (in2 != null) { nr = in2.getValue().getNumRows(); nc = in2.getValue().getNumColumns(); } else { nr = in3.getValue().getNumRows(); nc = in3.getValue().getNumColumns(); } // if one of the inputs is null, then it is a all zero block if (in1 == null) { in1 = zeroInput; in1.getValue().reset(nr, nc); } if (in2 == null) { in2 = zeroInput; in2.getValue().reset(nr, nc); } if (in3 == null) { in3 = zeroInput; in3.getValue().reset(nr, nc); } // process instruction try { ArrayList<Integer> outputIndexes = outputIndexesMapping.get(ins.output); for (int r = 0; r < nr; r++) for (int c = 0; c < nc; c++) { Pair<Integer, Integer> blockSize = outputBlockSizes.get(ins.output); keyBuff.setIndexes( UtilFunctions.cellIndexCalculation( in1.getIndexes().getRowIndex(), blockSize.getKey(), r), UtilFunctions.cellIndexCalculation( in1.getIndexes().getColumnIndex(), blockSize.getValue(), c)); valueBuff.setValue(in1.getValue().getValue(r, c)); valueBuff.setOtherValue(in2.getValue().getValue(r, c)); valueBuff.setWeight(in3.getValue().getValue(r, c)); for (int i : outputIndexes) { collectFinalMultipleOutputs.collectOutput(keyBuff, valueBuff, i, reporter); // System.out.println("output: "+keyBuff+" -- "+valueBuff); } } } catch (Exception e) { throw new RuntimeException(e); } }
private void processJoin(int tag, RemainIndexValue rValue) throws Exception { // for the cached matrix if (tag == 0) { addToCache(rValue, tag); // LOG.info("put in the buffer for left matrix"); // LOG.info(rblock.block.toString()); } else // for the probing matrix { // LOG.info("process join with block size: "+rValue.value.getNumRows()+" X // "+rValue.value.getNumColumns()+" nonZeros: "+rValue.value.getNonZeros()); for (int i = 0; i < cacheSize; i++) { RemainIndexValue left, right; if (tagForLeft == 0) { left = cache.get(i); right = rValue; } else { right = cache.get(i); left = rValue; } indexesbuffer.setIndexes(left.remainIndex, right.remainIndex); try { OperationsOnMatrixValues.performAggregateBinaryIgnoreIndexes( left.value, right.value, valueBuffer, (AggregateBinaryOperator) aggBinInstruction.getOperator()); } catch (DMLUnsupportedOperationException e) { throw new IOException(e); } // if(valueBuffer.getNonZeros()>0) collectOutput(indexesbuffer, valueBuffer); } } }
@Override public void execute(MatrixIndexes in, MatrixIndexes out) { // only used for V2M out.setIndexes(in.getRowIndex(), in.getRowIndex()); }
/** * Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat- * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is * advantageous if the actual sequence files are larger than the file splits created by * informat.getSplits (which is usually aligned to the HDFS block size) because then there is * overhead for finding the actual split between our 1k-1k blocks. This case happens if the read * matrix was create by CP or when jobs directly write to large output files (e.g., parfor matrix * partitioning). * * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException * @throws DMLRuntimeException */ @SuppressWarnings("deprecation") private static void readBinaryBlockMatrixFromHDFS( Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { boolean sparse = dest.isInSparseFormat(); MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); // set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files { // directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job); try { // note: next(key, value) does not yet exploit the given serialization classes, record // reader does but is generally slower. 
while (reader.next(key, value)) { // empty block filter (skip entire block) if (value.isEmptyBlock(false)) continue; int row_offset = (int) (key.getRowIndex() - 1) * brlen; int col_offset = (int) (key.getColumnIndex() - 1) * bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); // bound check per block if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) { throw new IOException( "Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } // copy block to result if (sparse) { dest.appendToSparse(value, row_offset, col_offset); // note: append requires final sort } else { dest.copy( row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false); } } } finally { IOUtilFunctions.closeSilently(reader); } } if (sparse && clen > bclen) { // no need to sort if 1 column block since always sorted dest.sortSparseRows(); } }
/**
 * Sets {@code out} to the fixed index (1, 1); {@code in} is not read.
 *
 * @param in source index (ignored)
 * @param out index object that is overwritten with (1, 1)
 */
@Override
public void execute(MatrixIndexes in, MatrixIndexes out) {
  // every input maps to the same position
  final int first = 1;
  out.setIndexes(first, first);
}