Пример #1
0
  /**
   * Reads a matrix stored as binary blocks in one or more sequence files and copies every
   * non-empty block into {@code dest}.
   *
   * <p>Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat-
   * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the
   * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is
   * advantageous if the actual sequence files are larger than the file splits created by
   * informat.getSplits (which is usually aligned to the HDFS block size) because then there is
   * overhead for finding the actual split between our 1k-1k blocks. This case happens if the read
   * matrix was created by CP or when jobs directly write to large output files (e.g., parfor
   * matrix partitioning).
   *
   * @param path location of the matrix; expanded into individual part files via
   *     {@code getSequenceFilePaths}
   * @param job job configuration used to open the sequence files (the binary block serialization
   *     framework is registered on it when enabled)
   * @param fs file system the sequence files are read from
   * @param dest destination block that receives the content of all non-empty blocks
   * @param rlen overall number of rows of the matrix, used for the per-block bound check
   * @param clen overall number of columns of the matrix, used for the per-block bound check
   * @param brlen number of rows per block, used to translate a block row index into a row offset
   * @param bclen number of columns per block, used to translate a block column index into a
   *     column offset
   * @throws IOException if reading fails or a block lies outside the range [1:rlen, 1:clen]
   * @throws DMLRuntimeException declared by the signature; presumably raised by the block
   *     operations on {@code dest} (not visible from this method)
   */
  @SuppressWarnings("deprecation")
  private static void readBinaryBlockMatrixFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      MatrixBlock dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException, DMLRuntimeException {
    boolean sparse = dest.isInSparseFormat();
    // key and value are reused for all records of all part files
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) {
      MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    }

    for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
    {
      // directly read from sequence files (individual partfiles)
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

      try {
        // note: next(key, value) does not yet exploit the given serialization classes, record
        // reader does but is generally slower.
        while (reader.next(key, value)) {
          // empty block filter (skip entire block)
          if (value.isEmptyBlock(false)) {
            continue;
          }

          // Offsets are computed in 64-bit arithmetic: multiplying in int could wrap around to
          // a seemingly valid position and silently place the block at the wrong location.
          long rowOffset = (key.getRowIndex() - 1) * (long) brlen;
          long colOffset = (key.getColumnIndex() - 1) * (long) bclen;

          int rows = value.getNumRows();
          int cols = value.getNumColumns();

          // bound check per block (also rejects negative offsets from invalid block indexes)
          if (rowOffset < 0
              || rowOffset + rows > rlen
              || colOffset < 0
              || colOffset + cols > clen) {
            throw new IOException(
                "Matrix block ["
                    + (rowOffset + 1)
                    + ":"
                    + (rowOffset + rows)
                    + ","
                    + (colOffset + 1)
                    + ":"
                    + (colOffset + cols)
                    + "] "
                    + "out of overall matrix range [1:"
                    + rlen
                    + ",1:"
                    + clen
                    + "].");
          }

          // NOTE(review): narrowing assumes rlen and clen fit into an int, which the int-based
          // indexing of dest appears to require - confirm against MatrixBlock.
          int row_offset = (int) rowOffset;
          int col_offset = (int) colOffset;

          // copy block to result
          if (sparse) {
            dest.appendToSparse(value, row_offset, col_offset);
            // note: append requires final sort
          } else {
            dest.copy(
                row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false);
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(reader);
      }
    }

    if (sparse && clen > bclen) {
      // no need to sort if 1 column block since always sorted
      dest.sortSparseRows();
    }
  }
Пример #2
0
  /**
   * Writes a matrix block to a single file in text cell format: one line per cell of the form
   * {@code "<row> <col> <value>"} with 1-based row and column indexes.
   *
   * <p>In sparse format every cell returned by the sparse rows iterator is written; in dense
   * format only cells with a value different from 0 are written. If no cell was written at all,
   * the single line {@code "1 1 0"} is emitted so that the file is never empty.
   *
   * <p>The dimensions of {@code src} are validated before the output file is created, so an
   * invalid block neither creates/overwrites the target file nor leaves an unclosed stream behind.
   *
   * @param path output file; created via {@code fs.create(path, true)}
   * @param job job configuration used to obtain the file system
   * @param src matrix block to write
   * @param rlen overall number of rows of the matrix, upper bound for the rows of {@code src}
   * @param clen overall number of columns of the matrix, upper bound for the columns of
   *     {@code src}
   * @throws IOException if {@code src} exceeds the range [1:rlen, 1:clen] or writing fails
   */
  protected void writeTextCellMatrixToHDFS(
      Path path, JobConf job, MatrixBlock src, long rlen, long clen) throws IOException {
    boolean sparse = src.isInSparseFormat();
    boolean entriesWritten = false;

    int rows = src.getNumRows();
    int cols = src.getNumColumns();

    // bound check per block; done before any file system access so that a failing check has no
    // side effects on the target path
    if (rows > rlen || cols > clen) {
      throw new IOException(
          "Matrix block [1:"
              + rows
              + ",1:"
              + cols
              + "] "
              + "out of overall matrix range [1:"
              + rlen
              + ",1:"
              + clen
              + "].");
    }

    FileSystem fs = FileSystem.get(job);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

    try {
      // for obj reuse and preventing repeated buffer re-allocations
      StringBuilder sb = new StringBuilder();

      if (sparse) // SPARSE
      {
        SparseRowsIterator iter = src.getSparseRowsIterator();
        while (iter.hasNext()) {
          IJV cell = iter.next();

          // convert 0-based cell coordinates to 1-based indexes
          sb.append(cell.i + 1);
          sb.append(' ');
          sb.append(cell.j + 1);
          sb.append(' ');
          sb.append(cell.v);
          sb.append('\n');
          br.write(sb.toString()); // same as append
          sb.setLength(0);
          entriesWritten = true;
        }
      } else // DENSE
      {
        for (int i = 0; i < rows; i++) {
          // row index string is built once per row and reused for all cells of that row
          String rowIndex = Integer.toString(i + 1);
          for (int j = 0; j < cols; j++) {
            double lvalue = src.getValueDenseUnsafe(i, j);
            if (lvalue != 0) // for nnz
            {
              sb.append(rowIndex);
              sb.append(' ');
              sb.append(j + 1);
              sb.append(' ');
              sb.append(lvalue);
              sb.append('\n');
              br.write(sb.toString()); // same as append
              sb.setLength(0);
              entriesWritten = true;
            }
          }
        }
      }

      // handle empty result
      if (!entriesWritten) {
        br.write("1 1 0\n");
      }
    } finally {
      IOUtilFunctions.closeSilently(br);
    }
  }