/**
 * @param path HDFS path of the binary block matrix (file or directory of part files)
 * @param job job configuration
 * @param fs file system handle
 * @param dest output collection of indexed matrix blocks
 * @param rlen number of rows of the overall matrix
 * @param clen number of columns of the overall matrix
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private void readBinaryBlockMatrixBlocksFromHDFS( Path path, JobConf job, FileSystem fs,
		Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen )
	throws IOException
{
	MatrixIndexes key = new MatrixIndexes();
	MatrixBlock value = new MatrixBlock();

	// set up preferred custom serialization framework for binary block format
	if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
		MRJobConfiguration.addBinaryBlockSerializationFramework(job);

	for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
	{
		// directly read from sequence files (individual partfiles)
		SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

		try
		{
			while (reader.next(key, value))
			{
				int row_offset = (int) (key.getRowIndex() - 1) * brlen;
				int col_offset = (int) (key.getColumnIndex() - 1) * bclen;
				int rows = value.getNumRows();
				int cols = value.getNumColumns();

				// bound check per block
				if (row_offset + rows < 0 || row_offset + rows > rlen
					|| col_offset + cols < 0 || col_offset + cols > clen)
				{
					throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + ","
						+ (col_offset + 1) + ":" + (col_offset + cols) + "] "
						+ "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
				}

				// copy block to result
				dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value)));
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
}
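/*
 * Hypothetical usage sketch (not part of the original class): shows how a caller might consume
 * the collection filled by readBinaryBlockMatrixBlocksFromHDFS. The accessors getIndexes() and
 * getValue() are assumed to be the usual IndexedMatrixValue getters; adapt as needed.
 */
private static void dumpBlockRangesSketch( Collection<IndexedMatrixValue> blocks, int brlen, int bclen )
{
	for (IndexedMatrixValue imv : blocks)
	{
		MatrixIndexes ix = imv.getIndexes();
		MatrixBlock mb = (MatrixBlock) imv.getValue();

		// global 1-based cell range covered by this block (same offset computation as above)
		long rl = (ix.getRowIndex() - 1) * brlen + 1;
		long cl = (ix.getColumnIndex() - 1) * bclen + 1;
		System.out.println("block [" + rl + ":" + (rl + mb.getNumRows() - 1) + ","
			+ cl + ":" + (cl + mb.getNumColumns() - 1) + "]");
	}
}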
/**
 * Note: For efficiency, we directly use SequenceFile.Reader instead of the
 * SequenceFileInputFormat-InputSplits-RecordReader pipeline (SequenceFileRecordReader).
 * First, this has no drawbacks since the SequenceFileRecordReader internally uses
 * SequenceFile.Reader as well. Second, it is advantageous if the actual sequence files are
 * larger than the file splits created by informat.getSplits (which are usually aligned to the
 * HDFS block size), because then there is overhead for finding the actual split boundaries
 * between our 1k-1k blocks. This case happens if the read matrix was created by CP or when
 * jobs directly write to large output files (e.g., parfor matrix partitioning).
 *
 * @param path HDFS path of the binary block matrix (file or directory of part files)
 * @param job job configuration
 * @param fs file system handle
 * @param dest output matrix block
 * @param rlen number of rows of the overall matrix
 * @param clen number of columns of the overall matrix
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws IOException
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private static void readBinaryBlockMatrixFromHDFS( Path path, JobConf job, FileSystem fs,
		MatrixBlock dest, long rlen, long clen, int brlen, int bclen )
	throws IOException, DMLRuntimeException
{
	boolean sparse = dest.isInSparseFormat();
	MatrixIndexes key = new MatrixIndexes();
	MatrixBlock value = new MatrixBlock();

	// set up preferred custom serialization framework for binary block format
	if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
		MRJobConfiguration.addBinaryBlockSerializationFramework(job);

	for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
	{
		// directly read from sequence files (individual partfiles)
		SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

		try
		{
			// note: next(key, value) does not yet exploit the given serialization classes,
			// record reader does but is generally slower.
			while (reader.next(key, value))
			{
				// empty block filter (skip entire block)
				if (value.isEmptyBlock(false))
					continue;

				int row_offset = (int) (key.getRowIndex() - 1) * brlen;
				int col_offset = (int) (key.getColumnIndex() - 1) * bclen;
				int rows = value.getNumRows();
				int cols = value.getNumColumns();

				// bound check per block
				if (row_offset + rows < 0 || row_offset + rows > rlen
					|| col_offset + cols < 0 || col_offset + cols > clen)
				{
					throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + ","
						+ (col_offset + 1) + ":" + (col_offset + cols) + "] "
						+ "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
				}

				// copy block to result
				if (sparse)
				{
					dest.appendToSparse(value, row_offset, col_offset);
					// note: append requires final sort
				}
				else
				{
					dest.copy(row_offset, row_offset + rows - 1,
						col_offset, col_offset + cols - 1, value, false);
				}
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}

	if (sparse && clen > bclen)
	{
		// no need to sort if 1 column block since always sorted
		dest.sortSparseRows();
	}
}
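/*
 * Hypothetical sketch (not part of the original class): the SequenceFileInputFormat/InputSplits/
 * RecordReader alternative that the note above argues against, shown only to contrast it with
 * the direct SequenceFile.Reader loop used in readBinaryBlockMatrixFromHDFS. Assumes additional
 * imports from the deprecated org.apache.hadoop.mapred API (FileInputFormat,
 * SequenceFileInputFormat, InputSplit, RecordReader, Reporter).
 */
@SuppressWarnings("deprecation")
private static void readViaRecordReaderSketch( Path path, JobConf job )
	throws IOException
{
	FileInputFormat.addInputPath(job, path);
	SequenceFileInputFormat<MatrixIndexes, MatrixBlock> informat =
		new SequenceFileInputFormat<MatrixIndexes, MatrixBlock>();

	// splits are aligned to the HDFS block size, not to our sequence files
	InputSplit[] splits = informat.getSplits(job, 1);

	for (InputSplit split : splits)
	{
		RecordReader<MatrixIndexes, MatrixBlock> reader =
			informat.getRecordReader(split, job, Reporter.NULL);
		try
		{
			MatrixIndexes key = reader.createKey();
			MatrixBlock value = reader.createValue();
			while (reader.next(key, value))
			{
				// ... process (key, value) exactly as in readBinaryBlockMatrixFromHDFS
			}
		}
		finally {
			reader.close();
		}
	}
}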
/**
 * @param path HDFS path of the output file
 * @param job job configuration
 * @param src source matrix block
 * @param rlen number of rows of the overall matrix
 * @param clen number of columns of the overall matrix
 * @throws IOException
 */
protected void writeTextCellMatrixToHDFS( Path path, JobConf job, MatrixBlock src, long rlen, long clen )
	throws IOException
{
	boolean sparse = src.isInSparseFormat();
	boolean entriesWritten = false;
	FileSystem fs = FileSystem.get(job);
	BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

	int rows = src.getNumRows();
	int cols = src.getNumColumns();

	// bound check for the src block
	if (rows > rlen || cols > clen)
	{
		throw new IOException("Matrix block [1:" + rows + ",1:" + cols + "] "
			+ "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
	}

	try
	{
		// for obj reuse and preventing repeated buffer re-allocations
		StringBuilder sb = new StringBuilder();

		if (sparse) // SPARSE
		{
			SparseRowsIterator iter = src.getSparseRowsIterator();
			while (iter.hasNext())
			{
				IJV cell = iter.next();

				sb.append(cell.i + 1);
				sb.append(' ');
				sb.append(cell.j + 1);
				sb.append(' ');
				sb.append(cell.v);
				sb.append('\n');
				br.write(sb.toString()); // same as append
				sb.setLength(0);

				entriesWritten = true;
			}
		}
		else // DENSE
		{
			for (int i = 0; i < rows; i++)
			{
				String rowIndex = Integer.toString(i + 1);
				for (int j = 0; j < cols; j++)
				{
					double lvalue = src.getValueDenseUnsafe(i, j);
					if (lvalue != 0) // for nnz
					{
						sb.append(rowIndex);
						sb.append(' ');
						sb.append(j + 1);
						sb.append(' ');
						sb.append(lvalue);
						sb.append('\n');
						br.write(sb.toString()); // same as append
						sb.setLength(0);

						entriesWritten = true;
					}
				}
			}
		}

		// handle empty result (write a single explicit zero to avoid an empty file)
		if (!entriesWritten)
		{
			br.write("1 1 0\n");
		}
	}
	finally {
		IOUtilFunctions.closeSilently(br);
	}
}
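/*
 * Hypothetical sketch (not part of the original class): reads a text cell file written by
 * writeTextCellMatrixToHDFS back into (row, column, value) triples, to illustrate the "i j v"
 * line format (1-based indexes, zeros omitted, and a single "1 1 0" line for an all-zero matrix).
 * Assumes additional imports of java.io.BufferedReader, java.io.InputStreamReader, and
 * java.util.StringTokenizer.
 */
private static void readTextCellSketch( Path path, JobConf job )
	throws IOException
{
	FileSystem fs = FileSystem.get(job);
	BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(path)));

	try
	{
		String line = null;
		while ((line = in.readLine()) != null)
		{
			StringTokenizer st = new StringTokenizer(line, " ");
			int i = Integer.parseInt(st.nextToken());      // 1-based row index
			int j = Integer.parseInt(st.nextToken());      // 1-based column index
			double v = Double.parseDouble(st.nextToken()); // cell value
			// ... consume (i, j, v), e.g., dest.quickSetValue(i - 1, j - 1, v)
		}
	}
	finally {
		IOUtilFunctions.closeSilently(in);
	}
}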