// output the records in the outCache.
public void close() throws IOException
{
    long start = System.currentTimeMillis();

    Iterator<Entry<MatrixIndexes, MatrixValue>> it = outCache.entrySet().iterator();
    while( it.hasNext() ) {
        Entry<MatrixIndexes, MatrixValue> entry = it.next();
        realWriteToCollector(entry.getKey(), entry.getValue());
    }

    // handle empty block output (on first reduce task only)
    if( outputDummyRecords ) // required for rejecting empty blocks in mappers
    {
        long rlen = dim1.getRows();
        long clen = dim2.getCols();
        int brlen = dim1.getRowsPerBlock();
        int bclen = dim2.getColsPerBlock();

        MatrixIndexes tmpIx = new MatrixIndexes();
        MatrixBlock tmpVal = new MatrixBlock();
        for( long i = 0, r = 1; i < rlen; i += brlen, r++ )
            for( long j = 0, c = 1; j < clen; j += bclen, c++ )
            {
                int realBrlen = (int) Math.min((long) brlen, rlen - (r - 1) * brlen);
                int realBclen = (int) Math.min((long) bclen, clen - (c - 1) * bclen);
                tmpIx.setIndexes(r, c);
                tmpVal.reset(realBrlen, realBclen);
                collectFinalMultipleOutputs.collectOutput(tmpIx, tmpVal, 0, cachedReporter);
            }
    }

    if( cachedReporter != null )
        cachedReporter.incrCounter(Counters.COMBINE_OR_REDUCE_TIME,
            System.currentTimeMillis() - start);

    super.close();
}
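// The boundary-block arithmetic above truncates the last block in each dimension
// to the remaining rows/columns. A minimal standalone sketch of that computation
// (the helper name blockSize is hypothetical, not part of the original class):
private static int blockSize( int blen, long len, long blockIx )
{
    // size of the 1-based block blockIx along a dimension of total length len
    return (int) Math.min((long) blen, len - (blockIx - 1) * blen);
}
// e.g., for a 2500 x 700 matrix with 1000 x 1000 blocks:
// blockSize(1000, 2500, 3) == 500 (truncated boundary row block)
// blockSize(1000,  700, 1) == 700 (single, truncated column block)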
/**
 * Reads all blocks of a binary block matrix from HDFS into a collection of
 * indexed matrix values, one entry per block, without assembling them into a
 * single output matrix.
 *
 * @param path HDFS path of the sequence file or directory of part files
 * @param job job configuration
 * @param fs file system handle
 * @param dest output collection of indexed matrix blocks
 * @param rlen number of rows of the overall matrix
 * @param clen number of columns of the overall matrix
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws IOException if a block lies outside the matrix range or reading fails
 */
@SuppressWarnings("deprecation")
private void readBinaryBlockMatrixBlocksFromHDFS( Path path, JobConf job, FileSystem fs,
    Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen )
    throws IOException
{
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if( MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION )
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    for( Path lpath : getSequenceFilePaths(fs, path) ) // 1..N files
    {
        // directly read from sequence files (individual partfiles)
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

        try {
            while( reader.next(key, value) ) {
                int row_offset = (int) (key.getRowIndex() - 1) * brlen;
                int col_offset = (int) (key.getColumnIndex() - 1) * bclen;
                int rows = value.getNumRows();
                int cols = value.getNumColumns();

                // bound check per block
                if( row_offset + rows < 0 || row_offset + rows > rlen
                    || col_offset + cols < 0 || col_offset + cols > clen ) {
                    throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows)
                        + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] "
                        + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
                }

                // copy block to result
                dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value)));
            }
        }
        finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
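// Note on the deep copy above: SequenceFile.Reader.next(key, value) reuses the
// same key/value objects across calls, so each block must be copied
// (new MatrixIndexes(key), new MatrixBlock(value)) before it is added to dest;
// otherwise all entries in dest would alias the last block read.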
@Override
protected Tuple2<MatrixIndexes, MatrixBlock> computeNext( Tuple2<MatrixIndexes, MatrixBlock> arg )
    throws Exception
{
    // unpack partition key-value pairs
    MatrixIndexes ix = arg._1();
    MatrixBlock in1 = arg._2();

    // get the rhs block
    int rix = (int) ((_vtype == VectorType.COL_VECTOR) ? ix.getRowIndex() : 1);
    int cix = (int) ((_vtype == VectorType.COL_VECTOR) ? 1 : ix.getColumnIndex());
    MatrixBlock in2 = _pmV.getMatrixBlock(rix, cix);

    // execute the binary operation
    MatrixBlock ret = (MatrixBlock) (in1.binaryOperations(_op, in2, new MatrixBlock()));
    return new Tuple2<MatrixIndexes, MatrixBlock>(ix, ret);
}
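// The rhs lookup above pairs each lhs block with the matching block of the
// broadcast vector: a column vector is indexed by the lhs row-block index and a
// row vector by the lhs column-block index. A minimal sketch of that index
// mapping (the helper name rhsBlockIndexes is hypothetical):
static long[] rhsBlockIndexes( long rowIx, long colIx, boolean colVector )
{
    return colVector
        ? new long[] { rowIx, 1 }   // col vector: lhs block (i,j) pairs with vector block (i,1)
        : new long[] { 1, colIx };  // row vector: lhs block (i,j) pairs with vector block (1,j)
}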
@Override
public MatrixBlock readMatrixFromHDFS( String fname, long rlen, long clen, int brlen, int bclen, long estnnz )
    throws IOException, DMLRuntimeException
{
    // allocate output matrix block
    MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, false, false);

    // prepare file access
    JobConf job = new JobConf();
    FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job);
    Path path = new Path((_localFS ? "file:///" : "") + fname);

    // check existence and non-empty file
    checkValidInputFile(fs, path);

    // core read
    readBinaryBlockMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen);

    // finally check if change of sparse/dense block representation required
    ret.recomputeNonZeros();
    ret.examSparsity();

    return ret;
}
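// Hypothetical call-site sketch (the file name, dimensions, and the assumption
// that a negative estnnz denotes an unknown non-zero count are illustrative,
// not taken from the original source):
// MatrixBlock X = reader.readMatrixFromHDFS("hdfs:/tmp/X", 10000, 1000, 1000, 1000, -1);
// After the read, examSparsity() may have switched X between sparse and dense
// storage based on the actual number of non-zeros.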
/**
 * Note: For efficiency, we directly use SequenceFile.Reader instead of the
 * SequenceFileInputFormat-InputSplits-RecordReader pipeline (SequenceFileRecordReader).
 * First, this has no drawbacks because the SequenceFileRecordReader internally uses
 * SequenceFile.Reader as well. Second, it is advantageous whenever the actual sequence
 * files are larger than the file splits created by informat.getSplits (which are usually
 * aligned to the HDFS block size), because it avoids the overhead of locating the actual
 * split boundaries between our 1k-1k blocks. This case occurs if the read matrix was
 * created by CP or if jobs directly write to large output files (e.g., parfor matrix
 * partitioning).
 *
 * @param path HDFS path of the sequence file or directory of part files
 * @param job job configuration
 * @param fs file system handle
 * @param dest output matrix block (pre-allocated to rlen x clen)
 * @param rlen number of rows of the overall matrix
 * @param clen number of columns of the overall matrix
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws IOException if a block lies outside the matrix range or reading fails
 * @throws DMLRuntimeException if a runtime error occurs while copying blocks
 */
@SuppressWarnings("deprecation")
private static void readBinaryBlockMatrixFromHDFS( Path path, JobConf job, FileSystem fs,
    MatrixBlock dest, long rlen, long clen, int brlen, int bclen )
    throws IOException, DMLRuntimeException
{
    boolean sparse = dest.isInSparseFormat();
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if( MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION )
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    for( Path lpath : getSequenceFilePaths(fs, path) ) // 1..N files
    {
        // directly read from sequence files (individual partfiles)
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

        try {
            // note: next(key, value) does not yet exploit the given serialization classes,
            // record reader does but is generally slower.
            while( reader.next(key, value) ) {
                // empty block filter (skip entire block)
                if( value.isEmptyBlock(false) )
                    continue;

                int row_offset = (int) (key.getRowIndex() - 1) * brlen;
                int col_offset = (int) (key.getColumnIndex() - 1) * bclen;
                int rows = value.getNumRows();
                int cols = value.getNumColumns();

                // bound check per block
                if( row_offset + rows < 0 || row_offset + rows > rlen
                    || col_offset + cols < 0 || col_offset + cols > clen ) {
                    throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows)
                        + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] "
                        + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
                }

                // copy block to result
                if( sparse ) {
                    dest.appendToSparse(value, row_offset, col_offset);
                    // note: append requires final sort
                }
                else {
                    dest.copy(row_offset, row_offset + rows - 1,
                        col_offset, col_offset + cols - 1, value, false);
                }
            }
        }
        finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }

    if( sparse && clen > bclen ) {
        // no need to sort if 1 column block since always sorted
        dest.sortSparseRows();
    }
}
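// Why the final sort is needed: appendToSparse appends each block's cells to the
// end of the affected sparse rows, and part files can be read in any order, so a
// row may receive cells for columns bclen+1..2*bclen before columns 1..bclen.
// sortSparseRows() restores per-row column order afterwards. With a single column
// block (clen <= bclen), every row is filled by exactly one block whose cells
// already arrive in column order, so the sort is skipped.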
/**
 * Writes a matrix block to HDFS in text cell format (i j v), one line per
 * non-zero value.
 *
 * @param path HDFS output path
 * @param job job configuration
 * @param src source matrix block
 * @param rlen number of rows of the overall matrix
 * @param clen number of columns of the overall matrix
 * @throws IOException if the block exceeds the matrix range or writing fails
 */
protected void writeTextCellMatrixToHDFS( Path path, JobConf job, MatrixBlock src, long rlen, long clen )
    throws IOException
{
    boolean sparse = src.isInSparseFormat();
    boolean entriesWritten = false;
    FileSystem fs = FileSystem.get(job);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

    int rows = src.getNumRows();
    int cols = src.getNumColumns();

    // bound check per block
    if( rows > rlen || cols > clen ) {
        throw new IOException("Matrix block [1:" + rows + ",1:" + cols + "] "
            + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
    }

    try {
        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();

        if( sparse ) // SPARSE
        {
            SparseRowsIterator iter = src.getSparseRowsIterator();
            while( iter.hasNext() ) {
                IJV cell = iter.next();
                sb.append(cell.i + 1);
                sb.append(' ');
                sb.append(cell.j + 1);
                sb.append(' ');
                sb.append(cell.v);
                sb.append('\n');
                br.write(sb.toString()); // same as append
                sb.setLength(0);
                entriesWritten = true;
            }
        }
        else // DENSE
        {
            for( int i = 0; i < rows; i++ ) {
                String rowIndex = Integer.toString(i + 1);
                for( int j = 0; j < cols; j++ ) {
                    double lvalue = src.getValueDenseUnsafe(i, j);
                    if( lvalue != 0 ) // for nnz
                    {
                        sb.append(rowIndex);
                        sb.append(' ');
                        sb.append(j + 1);
                        sb.append(' ');
                        sb.append(lvalue);
                        sb.append('\n');
                        br.write(sb.toString()); // same as append
                        sb.setLength(0);
                        entriesWritten = true;
                    }
                }
            }
        }

        // handle empty result
        if( !entriesWritten ) {
            br.write("1 1 0\n");
        }
    }
    finally {
        IOUtilFunctions.closeSilently(br);
    }
}
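// Format example: a 3x3 matrix with non-zeros at (1,2)=0.5 and (3,3)=2.0 is
// written in 1-based text cell (i j v) format as:
//   1 2 0.5
//   3 3 2.0
// An all-zero matrix produces the single placeholder line "1 1 0" so that the
// output file is never empty.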